diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 11e869aebe7da..db963e478326d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22534,6 +22534,56 @@ SDValue DAGCombiner::visitATOMIC_STORE(SDNode *N) {
   return SDValue();
 }
 
+static SDValue foldToMaskedStore(StoreSDNode *Store, SelectionDAG &DAG,
+                                 const SDLoc &Dl) {
+  if (!Store->isSimple() || !ISD::isNormalStore(Store))
+    return SDValue();
+
+  SDValue StoredVal = Store->getValue();
+  SDValue StorePtr = Store->getBasePtr();
+  SDValue StoreOffset = Store->getOffset();
+  EVT VT = Store->getMemoryVT();
+  unsigned AddrSpace = Store->getAddressSpace();
+  Align Alignment = Store->getAlign();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  if (!TLI.isOperationLegalOrCustom(ISD::MSTORE, VT) ||
+      !TLI.allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment))
+    return SDValue();
+
+  SDValue Mask, OtherVec, LoadCh;
+  unsigned LoadPos;
+  if (sd_match(StoredVal,
+               m_VSelect(m_Value(Mask), m_Value(OtherVec),
+                         m_Load(m_Value(LoadCh), m_Specific(StorePtr),
+                                m_Specific(StoreOffset))))) {
+    LoadPos = 2;
+  } else if (sd_match(StoredVal,
+                      m_VSelect(m_Value(Mask),
+                                m_Load(m_Value(LoadCh), m_Specific(StorePtr),
+                                       m_Specific(StoreOffset)),
+                                m_Value(OtherVec)))) {
+    LoadPos = 1;
+  } else {
+    return SDValue();
+  }
+
+  auto *Load = cast<LoadSDNode>(StoredVal.getOperand(LoadPos));
+  if (!Load->isSimple() || !ISD::isNormalLoad(Load) ||
+      Load->getAddressSpace() != AddrSpace)
+    return SDValue();
+
+  if (!Store->getChain().reachesChainWithoutSideEffects(LoadCh))
+    return SDValue();
+
+  if (LoadPos == 1)
+    Mask = DAG.getNOT(Dl, Mask, Mask.getValueType());
+
+  return DAG.getMaskedStore(Store->getChain(), Dl, OtherVec, StorePtr,
+                            StoreOffset, Mask, VT, Store->getMemOperand(),
+                            Store->getAddressingMode());
+}
+
 SDValue DAGCombiner::visitSTORE(SDNode *N) {
   StoreSDNode *ST = cast<StoreSDNode>(N);
   SDValue Chain = ST->getChain();
@@ -22768,6 +22818,9 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
   if (SDValue NewSt = splitMergedValStore(ST))
     return NewSt;
 
+  if (SDValue MaskedStore = foldToMaskedStore(ST, DAG, SDLoc(N)))
+    return MaskedStore;
+
   return ReduceLoadOpStoreWidth(N);
 }
 
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 3c91b0eb4e2ea..acc7fd20fd758 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -905,6 +905,8 @@ void TargetLoweringBase::initActions() {
     setOperationAction(ISD::GET_FPENV, VT, Expand);
     setOperationAction(ISD::SET_FPENV, VT, Expand);
     setOperationAction(ISD::RESET_FPENV, VT, Expand);
+
+    setOperationAction(ISD::MSTORE, VT, Expand);
   }
 
   // Most targets ignore the @llvm.prefetch intrinsic.
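Note (illustrative sketch, not part of the patch): at the IR level, the pattern this combine targets is the one exercised by the tests below — a store of a vselect whose other operand is a plain load from the same address:

    %load = load <8 x i32>, ptr %ptr, align 32
    %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
    store <8 x i32> %sel, ptr %ptr, align 32

When ISD::MSTORE is legal or custom for the stored type, the load/select/store triple becomes a single masked store of %x under %mask (with the mask inverted when the load feeds the true operand), which is what the SVE st1* checks in the new test file verify.
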
diff --git a/llvm/test/CodeGen/AArch64/combine-storetomstore.ll b/llvm/test/CodeGen/AArch64/combine-storetomstore.ll new file mode 100644 index 0000000000000..c2e54d3d39394 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/combine-storetomstore.ll @@ -0,0 +1,1193 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64-- -mattr=+sve | FileCheck %s -check-prefix=SVE + +define void @test_masked_store_success_v4i8(<4 x i8> %x, ptr %ptr, <4 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v4i8: +; SVE: // %bb.0: +; SVE-NEXT: shl v1.4h, v1.4h, #15 +; SVE-NEXT: ldr s2, [x0] +; SVE-NEXT: zip1 v2.8b, v2.8b, v2.8b +; SVE-NEXT: cmlt v1.4h, v1.4h, #0 +; SVE-NEXT: bif v0.8b, v2.8b, v1.8b +; SVE-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; SVE-NEXT: str s0, [x0] +; SVE-NEXT: ret + %load = load <4 x i8>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i8> %x, <4 x i8> %load + store <4 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4i16(<4 x i16> %x, ptr %ptr, <4 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v4i16: +; SVE: // %bb.0: +; SVE-NEXT: shl v1.4h, v1.4h, #15 +; SVE-NEXT: ptrue p0.h, vl4 +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: cmlt v1.4h, v1.4h, #0 +; SVE-NEXT: cmpne p0.h, p0/z, z1.h, #0 +; SVE-NEXT: st1h { z0.h }, p0, [x0] +; SVE-NEXT: ret + %load = load <4 x i16>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i16> %x, <4 x i16> %load + store <4 x i16> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4i32(<4 x i32> %x, ptr %ptr, <4 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v4i32: +; SVE: // %bb.0: +; SVE-NEXT: ushll v1.4s, v1.4h, #0 +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: shl v1.4s, v1.4s, #31 +; SVE-NEXT: cmlt v1.4s, v1.4s, #0 +; SVE-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; SVE-NEXT: st1w { z0.s }, p0, [x0] +; SVE-NEXT: ret + %load = load <4 x i32>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %load + store <4 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4i64(<4 x i64> %x, ptr %ptr, <4 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v4i64: +; SVE: // %bb.0: +; SVE-NEXT: ushll v2.4s, v2.4h, #0 +; SVE-NEXT: ptrue p0.d, vl2 +; SVE-NEXT: mov x8, #2 // =0x2 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; SVE-NEXT: ushll v2.2d, v2.2s, #0 +; SVE-NEXT: shl v3.2d, v3.2d, #63 +; SVE-NEXT: shl v2.2d, v2.2d, #63 +; SVE-NEXT: cmlt v3.2d, v3.2d, #0 +; SVE-NEXT: cmlt v2.2d, v2.2d, #0 +; SVE-NEXT: cmpne p1.d, p0/z, z3.d, #0 +; SVE-NEXT: cmpne p0.d, p0/z, z2.d, #0 +; SVE-NEXT: st1d { z1.d }, p1, [x0, x8, lsl #3] +; SVE-NEXT: st1d { z0.d }, p0, [x0] +; SVE-NEXT: ret + %load = load <4 x i64>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %load + store <4 x i64> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4f16(<4 x half> %x, ptr %ptr, <4 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v4f16: +; SVE: // %bb.0: +; SVE-NEXT: shl v1.4h, v1.4h, #15 +; SVE-NEXT: ptrue p0.h, vl4 +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: cmlt v1.4h, v1.4h, #0 +; SVE-NEXT: cmpne p0.h, p0/z, z1.h, #0 +; SVE-NEXT: st1h { z0.h }, p0, [x0] +; SVE-NEXT: ret + %load = load <4 x half>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, 
<4 x half> %x, <4 x half> %load + store <4 x half> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4f32(<4 x float> %x, ptr %ptr, <4 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v4f32: +; SVE: // %bb.0: +; SVE-NEXT: ushll v1.4s, v1.4h, #0 +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: shl v1.4s, v1.4s, #31 +; SVE-NEXT: cmlt v1.4s, v1.4s, #0 +; SVE-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; SVE-NEXT: st1w { z0.s }, p0, [x0] +; SVE-NEXT: ret + %load = load <4 x float>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x float> %x, <4 x float> %load + store <4 x float> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4f64(<4 x double> %x, ptr %ptr, <4 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v4f64: +; SVE: // %bb.0: +; SVE-NEXT: ushll v2.4s, v2.4h, #0 +; SVE-NEXT: ptrue p0.d, vl2 +; SVE-NEXT: mov x8, #2 // =0x2 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; SVE-NEXT: ushll v2.2d, v2.2s, #0 +; SVE-NEXT: shl v3.2d, v3.2d, #63 +; SVE-NEXT: shl v2.2d, v2.2d, #63 +; SVE-NEXT: cmlt v3.2d, v3.2d, #0 +; SVE-NEXT: cmlt v2.2d, v2.2d, #0 +; SVE-NEXT: cmpne p1.d, p0/z, z3.d, #0 +; SVE-NEXT: cmpne p0.d, p0/z, z2.d, #0 +; SVE-NEXT: st1d { z1.d }, p1, [x0, x8, lsl #3] +; SVE-NEXT: st1d { z0.d }, p0, [x0] +; SVE-NEXT: ret + %load = load <4 x double>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x double> %x, <4 x double> %load + store <4 x double> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8i8(<8 x i8> %x, ptr %ptr, <8 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v8i8: +; SVE: // %bb.0: +; SVE-NEXT: shl v1.8b, v1.8b, #7 +; SVE-NEXT: ptrue p0.b, vl8 +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: cmlt v1.8b, v1.8b, #0 +; SVE-NEXT: cmpne p0.b, p0/z, z1.b, #0 +; SVE-NEXT: st1b { z0.b }, p0, [x0] +; SVE-NEXT: ret + %load = load <8 x i8>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i8> %x, <8 x i8> %load + store <8 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8i16(<8 x i16> %x, ptr %ptr, <8 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v8i16: +; SVE: // %bb.0: +; SVE-NEXT: ushll v1.8h, v1.8b, #0 +; SVE-NEXT: ptrue p0.h, vl8 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: shl v1.8h, v1.8h, #15 +; SVE-NEXT: cmlt v1.8h, v1.8h, #0 +; SVE-NEXT: cmpne p0.h, p0/z, z1.h, #0 +; SVE-NEXT: st1h { z0.h }, p0, [x0] +; SVE-NEXT: ret + %load = load <8 x i16>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %load + store <8 x i16> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8i32(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v8i32: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: zip2 v3.8b, v2.8b, v0.8b +; SVE-NEXT: zip1 v2.8b, v2.8b, v0.8b +; SVE-NEXT: mov x8, #4 // =0x4 +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: ushll v3.4s, v3.4h, #0 +; SVE-NEXT: ushll v2.4s, v2.4h, #0 +; SVE-NEXT: shl v3.4s, v3.4s, #31 +; SVE-NEXT: shl v2.4s, v2.4s, #31 +; SVE-NEXT: cmlt v3.4s, v3.4s, #0 +; SVE-NEXT: cmlt v2.4s, v2.4s, #0 +; SVE-NEXT: cmpne p1.s, p0/z, z3.s, #0 +; SVE-NEXT: cmpne p0.s, p0/z, z2.s, #0 +; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] +; SVE-NEXT: st1w { z0.s }, p0, [x0] +; SVE-NEXT: 
ret + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8i64(<8 x i64> %x, ptr %ptr, <8 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v8i64: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $d4 killed $d4 def $q4 +; SVE-NEXT: mov b5, v4.b[4] +; SVE-NEXT: mov b6, v4.b[6] +; SVE-NEXT: mov x8, #4 // =0x4 +; SVE-NEXT: mov b7, v4.b[2] +; SVE-NEXT: mov b16, v4.b[0] +; SVE-NEXT: // kill: def $q2 killed $q2 def $z2 +; SVE-NEXT: mov x9, #6 // =0x6 +; SVE-NEXT: ptrue p0.d, vl2 +; SVE-NEXT: // kill: def $q3 killed $q3 def $z3 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: mov v5.b[4], v4.b[5] +; SVE-NEXT: mov v6.b[4], v4.b[7] +; SVE-NEXT: mov v7.b[4], v4.b[3] +; SVE-NEXT: mov v16.b[4], v4.b[1] +; SVE-NEXT: ushll v4.2d, v5.2s, #0 +; SVE-NEXT: ushll v5.2d, v6.2s, #0 +; SVE-NEXT: ushll v6.2d, v7.2s, #0 +; SVE-NEXT: ushll v7.2d, v16.2s, #0 +; SVE-NEXT: shl v4.2d, v4.2d, #63 +; SVE-NEXT: shl v5.2d, v5.2d, #63 +; SVE-NEXT: shl v6.2d, v6.2d, #63 +; SVE-NEXT: shl v7.2d, v7.2d, #63 +; SVE-NEXT: cmlt v4.2d, v4.2d, #0 +; SVE-NEXT: cmlt v5.2d, v5.2d, #0 +; SVE-NEXT: cmlt v6.2d, v6.2d, #0 +; SVE-NEXT: cmpne p1.d, p0/z, z4.d, #0 +; SVE-NEXT: cmlt v4.2d, v7.2d, #0 +; SVE-NEXT: cmpne p2.d, p0/z, z5.d, #0 +; SVE-NEXT: cmpne p3.d, p0/z, z6.d, #0 +; SVE-NEXT: cmpne p0.d, p0/z, z4.d, #0 +; SVE-NEXT: st1d { z2.d }, p1, [x0, x8, lsl #3] +; SVE-NEXT: mov x8, #2 // =0x2 +; SVE-NEXT: st1d { z3.d }, p2, [x0, x9, lsl #3] +; SVE-NEXT: st1d { z1.d }, p3, [x0, x8, lsl #3] +; SVE-NEXT: st1d { z0.d }, p0, [x0] +; SVE-NEXT: ret + %load = load <8 x i64>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %load + store <8 x i64> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8f16(<8 x half> %x, ptr %ptr, <8 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v8f16: +; SVE: // %bb.0: +; SVE-NEXT: ushll v1.8h, v1.8b, #0 +; SVE-NEXT: ptrue p0.h, vl8 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: shl v1.8h, v1.8h, #15 +; SVE-NEXT: cmlt v1.8h, v1.8h, #0 +; SVE-NEXT: cmpne p0.h, p0/z, z1.h, #0 +; SVE-NEXT: st1h { z0.h }, p0, [x0] +; SVE-NEXT: ret + %load = load <8 x half>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x half> %x, <8 x half> %load + store <8 x half> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8f32(<8 x float> %x, ptr %ptr, <8 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v8f32: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: zip2 v3.8b, v2.8b, v0.8b +; SVE-NEXT: zip1 v2.8b, v2.8b, v0.8b +; SVE-NEXT: mov x8, #4 // =0x4 +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: ushll v3.4s, v3.4h, #0 +; SVE-NEXT: ushll v2.4s, v2.4h, #0 +; SVE-NEXT: shl v3.4s, v3.4s, #31 +; SVE-NEXT: shl v2.4s, v2.4s, #31 +; SVE-NEXT: cmlt v3.4s, v3.4s, #0 +; SVE-NEXT: cmlt v2.4s, v2.4s, #0 +; SVE-NEXT: cmpne p1.s, p0/z, z3.s, #0 +; SVE-NEXT: cmpne p0.s, p0/z, z2.s, #0 +; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] +; SVE-NEXT: st1w { z0.s }, p0, [x0] +; SVE-NEXT: ret + %load = load <8 x float>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x float> %x, <8 x float> %load + store <8 x float> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8f64(<8 x double> %x, ptr %ptr, <8 x i1> %mask) { +; 
SVE-LABEL: test_masked_store_success_v8f64: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $d4 killed $d4 def $q4 +; SVE-NEXT: mov b5, v4.b[4] +; SVE-NEXT: mov b6, v4.b[6] +; SVE-NEXT: mov x8, #4 // =0x4 +; SVE-NEXT: mov b7, v4.b[2] +; SVE-NEXT: mov b16, v4.b[0] +; SVE-NEXT: // kill: def $q2 killed $q2 def $z2 +; SVE-NEXT: mov x9, #6 // =0x6 +; SVE-NEXT: ptrue p0.d, vl2 +; SVE-NEXT: // kill: def $q3 killed $q3 def $z3 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: mov v5.b[4], v4.b[5] +; SVE-NEXT: mov v6.b[4], v4.b[7] +; SVE-NEXT: mov v7.b[4], v4.b[3] +; SVE-NEXT: mov v16.b[4], v4.b[1] +; SVE-NEXT: ushll v4.2d, v5.2s, #0 +; SVE-NEXT: ushll v5.2d, v6.2s, #0 +; SVE-NEXT: ushll v6.2d, v7.2s, #0 +; SVE-NEXT: ushll v7.2d, v16.2s, #0 +; SVE-NEXT: shl v4.2d, v4.2d, #63 +; SVE-NEXT: shl v5.2d, v5.2d, #63 +; SVE-NEXT: shl v6.2d, v6.2d, #63 +; SVE-NEXT: shl v7.2d, v7.2d, #63 +; SVE-NEXT: cmlt v4.2d, v4.2d, #0 +; SVE-NEXT: cmlt v5.2d, v5.2d, #0 +; SVE-NEXT: cmlt v6.2d, v6.2d, #0 +; SVE-NEXT: cmpne p1.d, p0/z, z4.d, #0 +; SVE-NEXT: cmlt v4.2d, v7.2d, #0 +; SVE-NEXT: cmpne p2.d, p0/z, z5.d, #0 +; SVE-NEXT: cmpne p3.d, p0/z, z6.d, #0 +; SVE-NEXT: cmpne p0.d, p0/z, z4.d, #0 +; SVE-NEXT: st1d { z2.d }, p1, [x0, x8, lsl #3] +; SVE-NEXT: mov x8, #2 // =0x2 +; SVE-NEXT: st1d { z3.d }, p2, [x0, x9, lsl #3] +; SVE-NEXT: st1d { z1.d }, p3, [x0, x8, lsl #3] +; SVE-NEXT: st1d { z0.d }, p0, [x0] +; SVE-NEXT: ret + %load = load <8 x double>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x double> %x, <8 x double> %load + store <8 x double> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v16i8(<16 x i8> %x, ptr %ptr, <16 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v16i8: +; SVE: // %bb.0: +; SVE-NEXT: shl v1.16b, v1.16b, #7 +; SVE-NEXT: ptrue p0.b, vl16 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: cmlt v1.16b, v1.16b, #0 +; SVE-NEXT: cmpne p0.b, p0/z, z1.b, #0 +; SVE-NEXT: st1b { z0.b }, p0, [x0] +; SVE-NEXT: ret + %load = load <16 x i8>, ptr %ptr, align 32 + %sel = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %load + store <16 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v16i16(<16 x i16> %x, ptr %ptr, <16 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v16i16: +; SVE: // %bb.0: +; SVE-NEXT: ushll2 v3.8h, v2.16b, #0 +; SVE-NEXT: ushll v2.8h, v2.8b, #0 +; SVE-NEXT: mov x8, #8 // =0x8 +; SVE-NEXT: ptrue p0.h, vl8 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: shl v3.8h, v3.8h, #15 +; SVE-NEXT: shl v2.8h, v2.8h, #15 +; SVE-NEXT: cmlt v3.8h, v3.8h, #0 +; SVE-NEXT: cmlt v2.8h, v2.8h, #0 +; SVE-NEXT: cmpne p1.h, p0/z, z3.h, #0 +; SVE-NEXT: cmpne p0.h, p0/z, z2.h, #0 +; SVE-NEXT: st1h { z1.h }, p1, [x0, x8, lsl #1] +; SVE-NEXT: st1h { z0.h }, p0, [x0] +; SVE-NEXT: ret + %load = load <16 x i16>, ptr %ptr, align 32 + %sel = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %load + store <16 x i16> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v16i32(<16 x i32> %x, ptr %ptr, <16 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v16i32: +; SVE: // %bb.0: +; SVE-NEXT: ext v5.16b, v4.16b, v4.16b, #8 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: zip2 v6.8b, v4.8b, v0.8b +; SVE-NEXT: mov x8, #4 // =0x4 +; SVE-NEXT: zip1 v4.8b, v4.8b, v0.8b +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: mov 
x9, #8 // =0x8 +; SVE-NEXT: // kill: def $q3 killed $q3 def $z3 +; SVE-NEXT: // kill: def $q2 killed $q2 def $z2 +; SVE-NEXT: zip1 v7.8b, v5.8b, v0.8b +; SVE-NEXT: zip2 v5.8b, v5.8b, v0.8b +; SVE-NEXT: ushll v6.4s, v6.4h, #0 +; SVE-NEXT: ushll v4.4s, v4.4h, #0 +; SVE-NEXT: shl v6.4s, v6.4s, #31 +; SVE-NEXT: ushll v7.4s, v7.4h, #0 +; SVE-NEXT: ushll v5.4s, v5.4h, #0 +; SVE-NEXT: shl v4.4s, v4.4s, #31 +; SVE-NEXT: cmlt v6.4s, v6.4s, #0 +; SVE-NEXT: shl v7.4s, v7.4s, #31 +; SVE-NEXT: shl v5.4s, v5.4s, #31 +; SVE-NEXT: cmlt v4.4s, v4.4s, #0 +; SVE-NEXT: cmpne p1.s, p0/z, z6.s, #0 +; SVE-NEXT: cmlt v7.4s, v7.4s, #0 +; SVE-NEXT: cmlt v5.4s, v5.4s, #0 +; SVE-NEXT: cmpne p2.s, p0/z, z7.s, #0 +; SVE-NEXT: cmpne p3.s, p0/z, z5.s, #0 +; SVE-NEXT: cmpne p0.s, p0/z, z4.s, #0 +; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] +; SVE-NEXT: mov x8, #12 // =0xc +; SVE-NEXT: st1w { z2.s }, p2, [x0, x9, lsl #2] +; SVE-NEXT: st1w { z3.s }, p3, [x0, x8, lsl #2] +; SVE-NEXT: st1w { z0.s }, p0, [x0] +; SVE-NEXT: ret + %load = load <16 x i32>, ptr %ptr, align 32 + %sel = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %load + store <16 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v32i8(<32 x i8> %x, ptr %ptr, <32 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v32i8: +; SVE: // %bb.0: +; SVE-NEXT: ldr w8, [sp, #72] +; SVE-NEXT: fmov s2, w1 +; SVE-NEXT: ldr w9, [sp, #80] +; SVE-NEXT: ptrue p0.b, vl16 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: fmov s3, w8 +; SVE-NEXT: ldr w8, [sp, #88] +; SVE-NEXT: mov v2.b[1], w2 +; SVE-NEXT: mov v3.b[1], w9 +; SVE-NEXT: ldr w9, [sp] +; SVE-NEXT: mov v2.b[2], w3 +; SVE-NEXT: mov v3.b[2], w8 +; SVE-NEXT: ldr w8, [sp, #96] +; SVE-NEXT: mov v2.b[3], w4 +; SVE-NEXT: mov v3.b[3], w8 +; SVE-NEXT: ldr w8, [sp, #104] +; SVE-NEXT: mov v2.b[4], w5 +; SVE-NEXT: mov v3.b[4], w8 +; SVE-NEXT: ldr w8, [sp, #112] +; SVE-NEXT: mov v2.b[5], w6 +; SVE-NEXT: mov v3.b[5], w8 +; SVE-NEXT: ldr w8, [sp, #120] +; SVE-NEXT: mov v2.b[6], w7 +; SVE-NEXT: mov v3.b[6], w8 +; SVE-NEXT: ldr w8, [sp, #128] +; SVE-NEXT: mov v2.b[7], w9 +; SVE-NEXT: ldr w9, [sp, #8] +; SVE-NEXT: mov v3.b[7], w8 +; SVE-NEXT: ldr w8, [sp, #136] +; SVE-NEXT: mov v2.b[8], w9 +; SVE-NEXT: ldr w9, [sp, #16] +; SVE-NEXT: mov v3.b[8], w8 +; SVE-NEXT: ldr w8, [sp, #144] +; SVE-NEXT: mov v2.b[9], w9 +; SVE-NEXT: ldr w9, [sp, #24] +; SVE-NEXT: mov v3.b[9], w8 +; SVE-NEXT: ldr w8, [sp, #152] +; SVE-NEXT: mov v2.b[10], w9 +; SVE-NEXT: ldr w9, [sp, #32] +; SVE-NEXT: mov v3.b[10], w8 +; SVE-NEXT: ldr w8, [sp, #160] +; SVE-NEXT: mov v2.b[11], w9 +; SVE-NEXT: ldr w9, [sp, #40] +; SVE-NEXT: mov v3.b[11], w8 +; SVE-NEXT: ldr w8, [sp, #168] +; SVE-NEXT: mov v2.b[12], w9 +; SVE-NEXT: ldr w9, [sp, #48] +; SVE-NEXT: mov v3.b[12], w8 +; SVE-NEXT: ldr w8, [sp, #176] +; SVE-NEXT: mov v2.b[13], w9 +; SVE-NEXT: ldr w9, [sp, #56] +; SVE-NEXT: mov v3.b[13], w8 +; SVE-NEXT: ldr w8, [sp, #184] +; SVE-NEXT: mov v2.b[14], w9 +; SVE-NEXT: ldr w9, [sp, #64] +; SVE-NEXT: mov v3.b[14], w8 +; SVE-NEXT: ldr w8, [sp, #192] +; SVE-NEXT: mov v2.b[15], w9 +; SVE-NEXT: mov v3.b[15], w8 +; SVE-NEXT: mov w8, #16 // =0x10 +; SVE-NEXT: shl v2.16b, v2.16b, #7 +; SVE-NEXT: shl v3.16b, v3.16b, #7 +; SVE-NEXT: cmlt v2.16b, v2.16b, #0 +; SVE-NEXT: cmlt v3.16b, v3.16b, #0 +; SVE-NEXT: cmpne p1.b, p0/z, z3.b, #0 +; SVE-NEXT: cmpne p0.b, p0/z, z2.b, #0 +; SVE-NEXT: st1b { z1.b }, p1, [x0, x8] +; SVE-NEXT: st1b { z0.b }, p0, [x0] +; SVE-NEXT: ret + %load = load <32 x i8>, 
ptr %ptr, align 32 + %sel = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %load + store <32 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v32i16(<32 x i16> %x, ptr %ptr, <32 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v32i16: +; SVE: // %bb.0: +; SVE-NEXT: ldr w9, [sp, #72] +; SVE-NEXT: ldr w11, [sp, #136] +; SVE-NEXT: fmov s7, w1 +; SVE-NEXT: ldr w8, [sp, #80] +; SVE-NEXT: ldr w10, [sp, #144] +; SVE-NEXT: ptrue p0.h, vl8 +; SVE-NEXT: fmov s4, w9 +; SVE-NEXT: ldr w9, [sp, #8] +; SVE-NEXT: fmov s5, w11 +; SVE-NEXT: mov v7.b[1], w2 +; SVE-NEXT: // kill: def $q2 killed $q2 def $z2 +; SVE-NEXT: // kill: def $q3 killed $q3 def $z3 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: fmov s6, w9 +; SVE-NEXT: ldr w9, [sp, #152] +; SVE-NEXT: mov v4.b[1], w8 +; SVE-NEXT: ldr w8, [sp, #16] +; SVE-NEXT: mov v5.b[1], w10 +; SVE-NEXT: mov v6.b[1], w8 +; SVE-NEXT: ldr w8, [sp, #88] +; SVE-NEXT: mov v7.b[2], w3 +; SVE-NEXT: mov v4.b[2], w8 +; SVE-NEXT: ldr w8, [sp, #24] +; SVE-NEXT: mov v5.b[2], w9 +; SVE-NEXT: ldr w9, [sp, #160] +; SVE-NEXT: mov v6.b[2], w8 +; SVE-NEXT: ldr w8, [sp, #96] +; SVE-NEXT: mov v7.b[3], w4 +; SVE-NEXT: mov v4.b[3], w8 +; SVE-NEXT: ldr w8, [sp, #32] +; SVE-NEXT: mov v5.b[3], w9 +; SVE-NEXT: ldr w9, [sp, #168] +; SVE-NEXT: mov v6.b[3], w8 +; SVE-NEXT: ldr w8, [sp, #104] +; SVE-NEXT: mov v7.b[4], w5 +; SVE-NEXT: mov v4.b[4], w8 +; SVE-NEXT: ldr w8, [sp, #40] +; SVE-NEXT: mov v5.b[4], w9 +; SVE-NEXT: ldr w9, [sp, #176] +; SVE-NEXT: mov v6.b[4], w8 +; SVE-NEXT: ldr w8, [sp, #112] +; SVE-NEXT: mov v7.b[5], w6 +; SVE-NEXT: mov v4.b[5], w8 +; SVE-NEXT: ldr w8, [sp, #48] +; SVE-NEXT: mov v5.b[5], w9 +; SVE-NEXT: ldr w9, [sp, #184] +; SVE-NEXT: mov v6.b[5], w8 +; SVE-NEXT: ldr w8, [sp, #120] +; SVE-NEXT: mov v7.b[6], w7 +; SVE-NEXT: mov v4.b[6], w8 +; SVE-NEXT: ldr w8, [sp, #56] +; SVE-NEXT: mov v5.b[6], w9 +; SVE-NEXT: ldr w9, [sp, #192] +; SVE-NEXT: mov v6.b[6], w8 +; SVE-NEXT: ldr w8, [sp, #128] +; SVE-NEXT: mov v4.b[7], w8 +; SVE-NEXT: ldr w8, [sp, #64] +; SVE-NEXT: mov v5.b[7], w9 +; SVE-NEXT: ldr w9, [sp] +; SVE-NEXT: mov v6.b[7], w8 +; SVE-NEXT: mov x8, #16 // =0x10 +; SVE-NEXT: mov v7.b[7], w9 +; SVE-NEXT: ushll v4.8h, v4.8b, #0 +; SVE-NEXT: ushll v5.8h, v5.8b, #0 +; SVE-NEXT: ushll v6.8h, v6.8b, #0 +; SVE-NEXT: ushll v7.8h, v7.8b, #0 +; SVE-NEXT: shl v4.8h, v4.8h, #15 +; SVE-NEXT: shl v5.8h, v5.8h, #15 +; SVE-NEXT: shl v6.8h, v6.8h, #15 +; SVE-NEXT: shl v7.8h, v7.8h, #15 +; SVE-NEXT: cmlt v4.8h, v4.8h, #0 +; SVE-NEXT: cmlt v5.8h, v5.8h, #0 +; SVE-NEXT: cmlt v6.8h, v6.8h, #0 +; SVE-NEXT: cmpne p1.h, p0/z, z4.h, #0 +; SVE-NEXT: cmlt v4.8h, v7.8h, #0 +; SVE-NEXT: cmpne p2.h, p0/z, z5.h, #0 +; SVE-NEXT: cmpne p3.h, p0/z, z6.h, #0 +; SVE-NEXT: cmpne p0.h, p0/z, z4.h, #0 +; SVE-NEXT: st1h { z2.h }, p1, [x0, x8, lsl #1] +; SVE-NEXT: mov x8, #24 // =0x18 +; SVE-NEXT: st1h { z3.h }, p2, [x0, x8, lsl #1] +; SVE-NEXT: mov x8, #8 // =0x8 +; SVE-NEXT: st1h { z1.h }, p3, [x0, x8, lsl #1] +; SVE-NEXT: st1h { z0.h }, p0, [x0] +; SVE-NEXT: ret + %load = load <32 x i16>, ptr %ptr, align 32 + %sel = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %load + store <32 x i16> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v64i8(<64 x i8> %x, ptr %ptr, <64 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_v64i8: +; SVE: // %bb.0: +; SVE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; SVE-NEXT: .cfi_def_cfa_offset 16 +; SVE-NEXT: .cfi_offset w29, -16 +; SVE-NEXT: ldr w8, [sp, #216] +; SVE-NEXT: ldr w9, [sp, #344] +; SVE-NEXT: fmov s7, w1 +; SVE-NEXT: ldr w11, [sp, #88] +; SVE-NEXT: ldr w10, [sp, #224] +; SVE-NEXT: ptrue p0.b, vl16 +; SVE-NEXT: fmov s4, w8 +; SVE-NEXT: fmov s5, w9 +; SVE-NEXT: ldr w8, [sp, #352] +; SVE-NEXT: fmov s6, w11 +; SVE-NEXT: ldr w9, [sp, #96] +; SVE-NEXT: mov v7.b[1], w2 +; SVE-NEXT: // kill: def $q2 killed $q2 def $z2 +; SVE-NEXT: // kill: def $q3 killed $q3 def $z3 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: mov v4.b[1], w10 +; SVE-NEXT: mov v5.b[1], w8 +; SVE-NEXT: ldr w8, [sp, #232] +; SVE-NEXT: mov v6.b[1], w9 +; SVE-NEXT: ldr w9, [sp, #360] +; SVE-NEXT: ldr w10, [sp, #112] +; SVE-NEXT: mov v7.b[2], w3 +; SVE-NEXT: mov v4.b[2], w8 +; SVE-NEXT: ldr w8, [sp, #104] +; SVE-NEXT: mov v5.b[2], w9 +; SVE-NEXT: ldr w9, [sp, #368] +; SVE-NEXT: mov v6.b[2], w8 +; SVE-NEXT: ldr w8, [sp, #240] +; SVE-NEXT: mov v7.b[3], w4 +; SVE-NEXT: mov v4.b[3], w8 +; SVE-NEXT: mov v5.b[3], w9 +; SVE-NEXT: ldr w8, [sp, #248] +; SVE-NEXT: ldr w9, [sp, #376] +; SVE-NEXT: mov v6.b[3], w10 +; SVE-NEXT: ldr w10, [sp, #120] +; SVE-NEXT: mov v7.b[4], w5 +; SVE-NEXT: mov v4.b[4], w8 +; SVE-NEXT: mov v5.b[4], w9 +; SVE-NEXT: ldr w8, [sp, #256] +; SVE-NEXT: ldr w9, [sp, #384] +; SVE-NEXT: mov v6.b[4], w10 +; SVE-NEXT: ldr w10, [sp, #128] +; SVE-NEXT: mov v7.b[5], w6 +; SVE-NEXT: mov v4.b[5], w8 +; SVE-NEXT: mov v5.b[5], w9 +; SVE-NEXT: ldr w8, [sp, #264] +; SVE-NEXT: ldr w9, [sp, #392] +; SVE-NEXT: mov v6.b[5], w10 +; SVE-NEXT: ldr w10, [sp, #136] +; SVE-NEXT: mov v7.b[6], w7 +; SVE-NEXT: mov v4.b[6], w8 +; SVE-NEXT: mov v5.b[6], w9 +; SVE-NEXT: ldr w8, [sp, #272] +; SVE-NEXT: ldr w9, [sp, #400] +; SVE-NEXT: mov v6.b[6], w10 +; SVE-NEXT: ldr w10, [sp, #144] +; SVE-NEXT: mov v4.b[7], w8 +; SVE-NEXT: ldr w8, [sp, #16] +; SVE-NEXT: mov v5.b[7], w9 +; SVE-NEXT: ldr w9, [sp, #280] +; SVE-NEXT: mov v6.b[7], w10 +; SVE-NEXT: mov v7.b[7], w8 +; SVE-NEXT: ldr w10, [sp, #408] +; SVE-NEXT: ldr w8, [sp, #152] +; SVE-NEXT: mov v4.b[8], w9 +; SVE-NEXT: ldr w9, [sp, #24] +; SVE-NEXT: mov v5.b[8], w10 +; SVE-NEXT: ldr w10, [sp, #288] +; SVE-NEXT: mov v6.b[8], w8 +; SVE-NEXT: mov v7.b[8], w9 +; SVE-NEXT: ldr w8, [sp, #416] +; SVE-NEXT: ldr w9, [sp, #160] +; SVE-NEXT: mov v4.b[9], w10 +; SVE-NEXT: ldr w10, [sp, #32] +; SVE-NEXT: mov v5.b[9], w8 +; SVE-NEXT: ldr w8, [sp, #296] +; SVE-NEXT: mov v6.b[9], w9 +; SVE-NEXT: mov v7.b[9], w10 +; SVE-NEXT: ldr w9, [sp, #424] +; SVE-NEXT: ldr w10, [sp, #168] +; SVE-NEXT: mov v4.b[10], w8 +; SVE-NEXT: ldr w8, [sp, #40] +; SVE-NEXT: mov v5.b[10], w9 +; SVE-NEXT: ldr w9, [sp, #304] +; SVE-NEXT: mov v6.b[10], w10 +; SVE-NEXT: mov v7.b[10], w8 +; SVE-NEXT: ldr w10, [sp, #432] +; SVE-NEXT: ldr w8, [sp, #176] +; SVE-NEXT: mov v4.b[11], w9 +; SVE-NEXT: ldr w9, [sp, #48] +; SVE-NEXT: mov v5.b[11], w10 +; SVE-NEXT: ldr w10, [sp, #312] +; SVE-NEXT: mov v6.b[11], w8 +; SVE-NEXT: mov v7.b[11], w9 +; SVE-NEXT: ldr w8, [sp, #440] +; SVE-NEXT: ldr w9, [sp, #184] +; SVE-NEXT: mov v4.b[12], w10 +; SVE-NEXT: ldr w10, [sp, #56] +; SVE-NEXT: mov v5.b[12], w8 +; SVE-NEXT: ldr w8, [sp, #320] +; SVE-NEXT: mov v6.b[12], w9 +; SVE-NEXT: mov v7.b[12], w10 +; SVE-NEXT: ldr w9, [sp, #448] +; SVE-NEXT: ldr w10, [sp, #192] +; SVE-NEXT: mov v4.b[13], w8 +; SVE-NEXT: ldr w8, [sp, #64] +; SVE-NEXT: mov v5.b[13], w9 +; SVE-NEXT: ldr w9, [sp, #328] +; SVE-NEXT: mov v6.b[13], w10 +; SVE-NEXT: mov 
v7.b[13], w8 +; SVE-NEXT: ldr w10, [sp, #456] +; SVE-NEXT: ldr w8, [sp, #200] +; SVE-NEXT: mov v4.b[14], w9 +; SVE-NEXT: ldr w9, [sp, #72] +; SVE-NEXT: mov v5.b[14], w10 +; SVE-NEXT: ldr w10, [sp, #336] +; SVE-NEXT: mov v6.b[14], w8 +; SVE-NEXT: mov v7.b[14], w9 +; SVE-NEXT: ldr w8, [sp, #464] +; SVE-NEXT: ldr w9, [sp, #208] +; SVE-NEXT: mov v4.b[15], w10 +; SVE-NEXT: ldr w10, [sp, #80] +; SVE-NEXT: mov v5.b[15], w8 +; SVE-NEXT: mov w8, #32 // =0x20 +; SVE-NEXT: mov v6.b[15], w9 +; SVE-NEXT: mov v7.b[15], w10 +; SVE-NEXT: mov w9, #48 // =0x30 +; SVE-NEXT: shl v4.16b, v4.16b, #7 +; SVE-NEXT: shl v5.16b, v5.16b, #7 +; SVE-NEXT: shl v6.16b, v6.16b, #7 +; SVE-NEXT: shl v7.16b, v7.16b, #7 +; SVE-NEXT: cmlt v4.16b, v4.16b, #0 +; SVE-NEXT: cmlt v5.16b, v5.16b, #0 +; SVE-NEXT: cmlt v6.16b, v6.16b, #0 +; SVE-NEXT: cmpne p1.b, p0/z, z4.b, #0 +; SVE-NEXT: cmlt v4.16b, v7.16b, #0 +; SVE-NEXT: cmpne p2.b, p0/z, z5.b, #0 +; SVE-NEXT: cmpne p3.b, p0/z, z6.b, #0 +; SVE-NEXT: cmpne p0.b, p0/z, z4.b, #0 +; SVE-NEXT: st1b { z2.b }, p1, [x0, x8] +; SVE-NEXT: mov w8, #16 // =0x10 +; SVE-NEXT: st1b { z3.b }, p2, [x0, x9] +; SVE-NEXT: st1b { z1.b }, p3, [x0, x8] +; SVE-NEXT: st1b { z0.b }, p0, [x0] +; SVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; SVE-NEXT: ret + %load = load <64 x i8>, ptr %ptr, align 32 + %sel = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %load + store <64 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_invert_mask_v4i32(<4 x i32> %x, ptr %ptr, <4 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_invert_mask_v4i32: +; SVE: // %bb.0: +; SVE-NEXT: movi v2.4h, #1 +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: eor v1.8b, v1.8b, v2.8b +; SVE-NEXT: ushll v1.4s, v1.4h, #0 +; SVE-NEXT: shl v1.4s, v1.4s, #31 +; SVE-NEXT: cmlt v1.4s, v1.4s, #0 +; SVE-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; SVE-NEXT: st1w { z0.s }, p0, [x0] +; SVE-NEXT: ret + %load = load <4 x i32>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i32> %load, <4 x i32> %x + store <4 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_invert_mask_v8i32(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_invert_mask_v8i32: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: zip2 v3.8b, v2.8b, v0.8b +; SVE-NEXT: zip1 v2.8b, v2.8b, v0.8b +; SVE-NEXT: mov x8, #4 // =0x4 +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: ushll v3.4s, v3.4h, #0 +; SVE-NEXT: ushll v2.4s, v2.4h, #0 +; SVE-NEXT: shl v3.4s, v3.4s, #31 +; SVE-NEXT: shl v2.4s, v2.4s, #31 +; SVE-NEXT: cmpge p1.s, p0/z, z3.s, #0 +; SVE-NEXT: cmpge p0.s, p0/z, z2.s, #0 +; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] +; SVE-NEXT: st1w { z0.s }, p0, [x0] +; SVE-NEXT: ret + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %load, <8 x i32> %x + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_invert_mask_v16i32(<16 x i32> %x, ptr %ptr, <16 x i1> %mask) { +; SVE-LABEL: test_masked_store_success_invert_mask_v16i32: +; SVE: // %bb.0: +; SVE-NEXT: ext v5.16b, v4.16b, v4.16b, #8 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: zip2 v6.8b, v4.8b, v0.8b +; SVE-NEXT: mov x8, #4 // =0x4 +; SVE-NEXT: zip1 v4.8b, v4.8b, v0.8b +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: // kill: def $q2 killed $q2 def $z2 +; SVE-NEXT: // kill: 
def $q3 killed $q3 def $z3 +; SVE-NEXT: zip1 v7.8b, v5.8b, v0.8b +; SVE-NEXT: zip2 v5.8b, v5.8b, v0.8b +; SVE-NEXT: ushll v6.4s, v6.4h, #0 +; SVE-NEXT: ushll v4.4s, v4.4h, #0 +; SVE-NEXT: shl v6.4s, v6.4s, #31 +; SVE-NEXT: ushll v7.4s, v7.4h, #0 +; SVE-NEXT: ushll v5.4s, v5.4h, #0 +; SVE-NEXT: shl v4.4s, v4.4s, #31 +; SVE-NEXT: cmpge p1.s, p0/z, z6.s, #0 +; SVE-NEXT: shl v7.4s, v7.4s, #31 +; SVE-NEXT: shl v5.4s, v5.4s, #31 +; SVE-NEXT: cmpge p2.s, p0/z, z7.s, #0 +; SVE-NEXT: cmpge p3.s, p0/z, z5.s, #0 +; SVE-NEXT: cmpge p0.s, p0/z, z4.s, #0 +; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] +; SVE-NEXT: mov x8, #8 // =0x8 +; SVE-NEXT: st1w { z2.s }, p2, [x0, x8, lsl #2] +; SVE-NEXT: mov x8, #12 // =0xc +; SVE-NEXT: st1w { z3.s }, p3, [x0, x8, lsl #2] +; SVE-NEXT: st1w { z0.s }, p0, [x0] +; SVE-NEXT: ret + %load = load <16 x i32>, ptr %ptr, align 32 + %sel = select <16 x i1> %mask, <16 x i32> %load, <16 x i32> %x + store <16 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_zextload(<4 x i64> %x, ptr %ptr, <4 x i1> %mask) { +; SVE-LABEL: test_masked_store_zextload: +; SVE: // %bb.0: +; SVE-NEXT: ushll v2.4s, v2.4h, #0 +; SVE-NEXT: ldr q4, [x0] +; SVE-NEXT: ushll2 v5.2d, v4.4s, #0 +; SVE-NEXT: ushll v4.2d, v4.2s, #0 +; SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; SVE-NEXT: ushll v2.2d, v2.2s, #0 +; SVE-NEXT: shl v3.2d, v3.2d, #63 +; SVE-NEXT: shl v2.2d, v2.2d, #63 +; SVE-NEXT: cmlt v3.2d, v3.2d, #0 +; SVE-NEXT: cmlt v2.2d, v2.2d, #0 +; SVE-NEXT: bif v1.16b, v5.16b, v3.16b +; SVE-NEXT: bif v0.16b, v4.16b, v2.16b +; SVE-NEXT: stp q0, q1, [x0] +; SVE-NEXT: ret + %load = load <4 x i32>, ptr %ptr, align 32 + %zext = zext <4 x i32> %load to <4 x i64> + %masked = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %zext + store <4 x i64> %masked, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { +; SVE-LABEL: test_masked_store_volatile_load: +; SVE: // %bb.0: +; SVE-NEXT: zip1 v3.8b, v2.8b, v0.8b +; SVE-NEXT: zip2 v2.8b, v2.8b, v0.8b +; SVE-NEXT: ldr q4, [x0] +; SVE-NEXT: ldr q5, [x0, #16] +; SVE-NEXT: ushll v3.4s, v3.4h, #0 +; SVE-NEXT: ushll v2.4s, v2.4h, #0 +; SVE-NEXT: shl v3.4s, v3.4s, #31 +; SVE-NEXT: shl v2.4s, v2.4s, #31 +; SVE-NEXT: cmlt v3.4s, v3.4s, #0 +; SVE-NEXT: cmlt v2.4s, v2.4s, #0 +; SVE-NEXT: bif v0.16b, v4.16b, v3.16b +; SVE-NEXT: bif v1.16b, v5.16b, v2.16b +; SVE-NEXT: stp q0, q1, [x0] +; SVE-NEXT: ret + %load = load volatile <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { +; SVE-LABEL: test_masked_store_volatile_store: +; SVE: // %bb.0: +; SVE-NEXT: zip1 v3.8b, v2.8b, v0.8b +; SVE-NEXT: zip2 v2.8b, v2.8b, v0.8b +; SVE-NEXT: ldp q4, q5, [x0] +; SVE-NEXT: ushll v3.4s, v3.4h, #0 +; SVE-NEXT: ushll v2.4s, v2.4h, #0 +; SVE-NEXT: shl v3.4s, v3.4s, #31 +; SVE-NEXT: shl v2.4s, v2.4s, #31 +; SVE-NEXT: cmlt v3.4s, v3.4s, #0 +; SVE-NEXT: cmlt v2.4s, v2.4s, #0 +; SVE-NEXT: bif v0.16b, v4.16b, v3.16b +; SVE-NEXT: bif v1.16b, v5.16b, v2.16b +; SVE-NEXT: str q0, [x0] +; SVE-NEXT: str q1, [x0, #16] +; SVE-NEXT: ret + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + store volatile <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +declare void @use_vec(<8 x i32>) + +define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) nounwind { +; 
SVE-LABEL: test_masked_store_intervening: +; SVE: // %bb.0: +; SVE-NEXT: sub sp, sp, #96 +; SVE-NEXT: stp q1, q0, [sp, #32] // 32-byte Folded Spill +; SVE-NEXT: ldp q1, q3, [x0] +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: str d8, [sp, #64] // 8-byte Folded Spill +; SVE-NEXT: fmov d8, d2 +; SVE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; SVE-NEXT: mov x19, x0 +; SVE-NEXT: stp q1, q3, [sp] // 32-byte Folded Spill +; SVE-NEXT: movi v1.2d, #0000000000000000 +; SVE-NEXT: stp q0, q0, [x0] +; SVE-NEXT: bl use_vec +; SVE-NEXT: zip2 v0.8b, v8.8b, v0.8b +; SVE-NEXT: ldp q3, q2, [sp, #16] // 32-byte Folded Reload +; SVE-NEXT: zip1 v1.8b, v8.8b, v0.8b +; SVE-NEXT: ushll v0.4s, v0.4h, #0 +; SVE-NEXT: ldr d8, [sp, #64] // 8-byte Folded Reload +; SVE-NEXT: shl v0.4s, v0.4s, #31 +; SVE-NEXT: ushll v1.4s, v1.4h, #0 +; SVE-NEXT: cmlt v0.4s, v0.4s, #0 +; SVE-NEXT: shl v1.4s, v1.4s, #31 +; SVE-NEXT: bsl v0.16b, v2.16b, v3.16b +; SVE-NEXT: ldr q2, [sp, #48] // 16-byte Folded Reload +; SVE-NEXT: ldr q3, [sp] // 16-byte Folded Reload +; SVE-NEXT: cmlt v1.4s, v1.4s, #0 +; SVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; SVE-NEXT: stp q1, q0, [x19] +; SVE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; SVE-NEXT: add sp, sp, #96 +; SVE-NEXT: ret + %load = load <8 x i32>, ptr %ptr, align 32 + store <8 x i32> zeroinitializer, ptr %ptr, align 32 + %tmp = load <8 x i32>, ptr %ptr + call void @use_vec(<8 x i32> %tmp) + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + + +define void @test_masked_store_multiple_v8i32(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %mask, <8 x i1> %mask2) { +; SVE-LABEL: test_masked_store_multiple_v8i32: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: zip2 v6.8b, v4.8b, v0.8b +; SVE-NEXT: zip1 v4.8b, v4.8b, v0.8b +; SVE-NEXT: mov x8, #4 // =0x4 +; SVE-NEXT: zip1 v7.8b, v5.8b, v0.8b +; SVE-NEXT: zip2 v5.8b, v5.8b, v0.8b +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: ushll v6.4s, v6.4h, #0 +; SVE-NEXT: ushll v4.4s, v4.4h, #0 +; SVE-NEXT: ushll v7.4s, v7.4h, #0 +; SVE-NEXT: ushll v5.4s, v5.4h, #0 +; SVE-NEXT: shl v6.4s, v6.4s, #31 +; SVE-NEXT: shl v4.4s, v4.4s, #31 +; SVE-NEXT: shl v7.4s, v7.4s, #31 +; SVE-NEXT: shl v5.4s, v5.4s, #31 +; SVE-NEXT: cmlt v6.4s, v6.4s, #0 +; SVE-NEXT: cmlt v4.4s, v4.4s, #0 +; SVE-NEXT: cmlt v7.4s, v7.4s, #0 +; SVE-NEXT: cmlt v5.4s, v5.4s, #0 +; SVE-NEXT: cmpne p1.s, p0/z, z6.s, #0 +; SVE-NEXT: ldp q6, q16, [x1] +; SVE-NEXT: cmpne p0.s, p0/z, z4.s, #0 +; SVE-NEXT: bif v2.16b, v6.16b, v7.16b +; SVE-NEXT: bif v3.16b, v16.16b, v5.16b +; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] +; SVE-NEXT: st1w { z0.s }, p0, [x0] +; SVE-NEXT: stp q2, q3, [x1] +; SVE-NEXT: ret + %load = load <8 x i32>, ptr %ptr1, align 32 + %load2 = load <8 x i32>, ptr %ptr2, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + %sel2 = select <8 x i1> %mask2, <8 x i32> %y, <8 x i32> %load2 + store <8 x i32> %sel, ptr %ptr1, align 32 + store <8 x i32> %sel2, ptr %ptr2, align 32 + ret void +} + +define void @test_masked_store_multiple_v8i64(<8 x i64> %x, <8 x i64> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %mask, <8 x i1> %mask2) { +; SVE-LABEL: test_masked_store_multiple_v8i64: +; SVE: // %bb.0: +; SVE-NEXT: ldp d16, d18, [sp] +; SVE-NEXT: ptrue p0.d, vl2 +; SVE-NEXT: // kill: def $q3 killed $q3 def $z3 +; SVE-NEXT: // kill: def $q2 killed $q2 def $z2 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; 
SVE-NEXT: mov x8, #6 // =0x6 +; SVE-NEXT: mov x9, #4 // =0x4 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: mov b17, v16.b[4] +; SVE-NEXT: mov b19, v16.b[2] +; SVE-NEXT: mov b20, v16.b[6] +; SVE-NEXT: mov b21, v16.b[0] +; SVE-NEXT: mov b22, v18.b[4] +; SVE-NEXT: mov b23, v18.b[6] +; SVE-NEXT: mov b24, v18.b[0] +; SVE-NEXT: mov b25, v18.b[2] +; SVE-NEXT: mov v17.b[4], v16.b[5] +; SVE-NEXT: mov v19.b[4], v16.b[3] +; SVE-NEXT: mov v20.b[4], v16.b[7] +; SVE-NEXT: mov v21.b[4], v16.b[1] +; SVE-NEXT: mov v22.b[4], v18.b[5] +; SVE-NEXT: mov v23.b[4], v18.b[7] +; SVE-NEXT: mov v24.b[4], v18.b[1] +; SVE-NEXT: mov v25.b[4], v18.b[3] +; SVE-NEXT: ushll v17.2d, v17.2s, #0 +; SVE-NEXT: ushll v18.2d, v21.2s, #0 +; SVE-NEXT: ushll v21.2d, v24.2s, #0 +; SVE-NEXT: shl v16.2d, v17.2d, #63 +; SVE-NEXT: ushll v17.2d, v19.2s, #0 +; SVE-NEXT: ushll v19.2d, v20.2s, #0 +; SVE-NEXT: ushll v20.2d, v22.2s, #0 +; SVE-NEXT: shl v18.2d, v18.2d, #63 +; SVE-NEXT: ushll v22.2d, v25.2s, #0 +; SVE-NEXT: shl v21.2d, v21.2d, #63 +; SVE-NEXT: cmlt v16.2d, v16.2d, #0 +; SVE-NEXT: shl v17.2d, v17.2d, #63 +; SVE-NEXT: shl v19.2d, v19.2d, #63 +; SVE-NEXT: shl v20.2d, v20.2d, #63 +; SVE-NEXT: cmlt v18.2d, v18.2d, #0 +; SVE-NEXT: shl v22.2d, v22.2d, #63 +; SVE-NEXT: cmlt v21.2d, v21.2d, #0 +; SVE-NEXT: cmpne p1.d, p0/z, z16.d, #0 +; SVE-NEXT: ushll v16.2d, v23.2s, #0 +; SVE-NEXT: cmlt v17.2d, v17.2d, #0 +; SVE-NEXT: cmlt v19.2d, v19.2d, #0 +; SVE-NEXT: cmlt v20.2d, v20.2d, #0 +; SVE-NEXT: shl v16.2d, v16.2d, #63 +; SVE-NEXT: cmpne p2.d, p0/z, z17.d, #0 +; SVE-NEXT: cmpne p3.d, p0/z, z19.d, #0 +; SVE-NEXT: ldp q17, q19, [x1, #32] +; SVE-NEXT: cmpne p0.d, p0/z, z18.d, #0 +; SVE-NEXT: cmlt v16.2d, v16.2d, #0 +; SVE-NEXT: bif v6.16b, v17.16b, v20.16b +; SVE-NEXT: cmlt v20.2d, v22.2d, #0 +; SVE-NEXT: ldp q17, q18, [x1] +; SVE-NEXT: st1d { z2.d }, p1, [x0, x9, lsl #3] +; SVE-NEXT: mov v2.16b, v16.16b +; SVE-NEXT: st1d { z3.d }, p3, [x0, x8, lsl #3] +; SVE-NEXT: mov v3.16b, v21.16b +; SVE-NEXT: st1d { z0.d }, p0, [x0] +; SVE-NEXT: mov v0.16b, v20.16b +; SVE-NEXT: mov x9, #2 // =0x2 +; SVE-NEXT: st1d { z1.d }, p2, [x0, x9, lsl #3] +; SVE-NEXT: bsl v2.16b, v7.16b, v19.16b +; SVE-NEXT: bsl v3.16b, v4.16b, v17.16b +; SVE-NEXT: bsl v0.16b, v5.16b, v18.16b +; SVE-NEXT: stp q6, q2, [x1, #32] +; SVE-NEXT: stp q3, q0, [x1] +; SVE-NEXT: ret + %load = load <8 x i64>, ptr %ptr1, align 32 + %load2 = load <8 x i64>, ptr %ptr2, align 32 + %sel = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %load + %sel2 = select <8 x i1> %mask2, <8 x i64> %y, <8 x i64> %load2 + store <8 x i64> %sel, ptr %ptr1, align 32 + store <8 x i64> %sel2, ptr %ptr2, align 32 + ret void +} + +define void @test_masked_store_unaligned_v4i32(<4 x i32> %data, ptr %ptr, <4 x i1> %mask) { +; SVE-LABEL: test_masked_store_unaligned_v4i32: +; SVE: // %bb.0: +; SVE-NEXT: ushll v1.4s, v1.4h, #0 +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: add x8, x0, #1 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: shl v1.4s, v1.4s, #31 +; SVE-NEXT: cmlt v1.4s, v1.4s, #0 +; SVE-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; SVE-NEXT: st1w { z0.s }, p0, [x8] +; SVE-NEXT: ret + %ptr_i8 = getelementptr i8, ptr %ptr, i32 1 + %ptr_vec = bitcast ptr %ptr_i8 to ptr + %load = load <4 x i32>, ptr %ptr_vec, align 1 + %sel = select <4 x i1> %mask, <4 x i32> %data, <4 x i32> %load + store <4 x i32> %sel, ptr %ptr_vec, align 1 + ret void +} + +define void @test_masked_store_unaligned_v4i64(<4 x i64> %data, ptr %ptr, <4 x i1> %mask) { +; SVE-LABEL: test_masked_store_unaligned_v4i64: +; SVE: // %bb.0: +; 
SVE-NEXT: ushll v2.4s, v2.4h, #0 +; SVE-NEXT: ptrue p0.d, vl2 +; SVE-NEXT: add x8, x0, #17 +; SVE-NEXT: add x9, x0, #1 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: ushll2 v3.2d, v2.4s, #0 +; SVE-NEXT: ushll v2.2d, v2.2s, #0 +; SVE-NEXT: shl v3.2d, v3.2d, #63 +; SVE-NEXT: shl v2.2d, v2.2d, #63 +; SVE-NEXT: cmlt v3.2d, v3.2d, #0 +; SVE-NEXT: cmlt v2.2d, v2.2d, #0 +; SVE-NEXT: cmpne p1.d, p0/z, z3.d, #0 +; SVE-NEXT: cmpne p0.d, p0/z, z2.d, #0 +; SVE-NEXT: st1d { z1.d }, p1, [x8] +; SVE-NEXT: st1d { z0.d }, p0, [x9] +; SVE-NEXT: ret + %ptr_i8 = getelementptr i8, ptr %ptr, i64 1 + %ptr_vec = bitcast ptr %ptr_i8 to ptr + %load = load <4 x i64>, ptr %ptr_vec, align 1 + %sel = select <4 x i1> %mask, <4 x i64> %data, <4 x i64> %load + store <4 x i64> %sel, ptr %ptr_vec, align 1 + ret void +} + +define void @test_masked_store_unaligned_v8i32(<8 x i32> %data, ptr %ptr, <8 x i1> %mask) { +; SVE-LABEL: test_masked_store_unaligned_v8i32: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: zip1 v3.8b, v2.8b, v0.8b +; SVE-NEXT: zip2 v2.8b, v2.8b, v0.8b +; SVE-NEXT: add x8, x0, #1 +; SVE-NEXT: ptrue p0.s, vl4 +; SVE-NEXT: add x9, x0, #17 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: ushll v3.4s, v3.4h, #0 +; SVE-NEXT: ushll v2.4s, v2.4h, #0 +; SVE-NEXT: shl v3.4s, v3.4s, #31 +; SVE-NEXT: shl v2.4s, v2.4s, #31 +; SVE-NEXT: cmlt v3.4s, v3.4s, #0 +; SVE-NEXT: cmlt v2.4s, v2.4s, #0 +; SVE-NEXT: cmpne p1.s, p0/z, z3.s, #0 +; SVE-NEXT: cmpne p0.s, p0/z, z2.s, #0 +; SVE-NEXT: st1w { z0.s }, p1, [x8] +; SVE-NEXT: st1w { z1.s }, p0, [x9] +; SVE-NEXT: ret + %ptr_i8 = getelementptr i8, ptr %ptr, i32 1 + %ptr_vec = bitcast ptr %ptr_i8 to ptr + %load = load <8 x i32>, ptr %ptr_vec, align 1 + %sel = select <8 x i1> %mask, <8 x i32> %data, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr_vec, align 1 + ret void +} + +define void @test_masked_store_unaligned_v8i64(<8 x i64> %data, ptr %ptr, <8 x i1> %mask) { +; SVE-LABEL: test_masked_store_unaligned_v8i64: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $d4 killed $d4 def $q4 +; SVE-NEXT: mov b5, v4.b[4] +; SVE-NEXT: mov b6, v4.b[6] +; SVE-NEXT: add x8, x0, #33 +; SVE-NEXT: mov b7, v4.b[0] +; SVE-NEXT: mov b16, v4.b[2] +; SVE-NEXT: add x9, x0, #49 +; SVE-NEXT: ptrue p0.d, vl2 +; SVE-NEXT: // kill: def $q3 killed $q3 def $z3 +; SVE-NEXT: // kill: def $q2 killed $q2 def $z2 +; SVE-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE-NEXT: mov v5.b[4], v4.b[5] +; SVE-NEXT: mov v6.b[4], v4.b[7] +; SVE-NEXT: mov v7.b[4], v4.b[1] +; SVE-NEXT: mov v16.b[4], v4.b[3] +; SVE-NEXT: ushll v4.2d, v5.2s, #0 +; SVE-NEXT: ushll v5.2d, v6.2s, #0 +; SVE-NEXT: ushll v6.2d, v7.2s, #0 +; SVE-NEXT: ushll v7.2d, v16.2s, #0 +; SVE-NEXT: shl v4.2d, v4.2d, #63 +; SVE-NEXT: shl v5.2d, v5.2d, #63 +; SVE-NEXT: shl v6.2d, v6.2d, #63 +; SVE-NEXT: shl v7.2d, v7.2d, #63 +; SVE-NEXT: cmlt v4.2d, v4.2d, #0 +; SVE-NEXT: cmlt v5.2d, v5.2d, #0 +; SVE-NEXT: cmlt v6.2d, v6.2d, #0 +; SVE-NEXT: cmpne p1.d, p0/z, z4.d, #0 +; SVE-NEXT: cmlt v4.2d, v7.2d, #0 +; SVE-NEXT: cmpne p2.d, p0/z, z5.d, #0 +; SVE-NEXT: cmpne p3.d, p0/z, z6.d, #0 +; SVE-NEXT: cmpne p0.d, p0/z, z4.d, #0 +; SVE-NEXT: st1d { z2.d }, p1, [x8] +; SVE-NEXT: add x8, x0, #1 +; SVE-NEXT: st1d { z3.d }, p2, [x9] +; SVE-NEXT: add x9, x0, #17 +; SVE-NEXT: st1d { z0.d }, p3, [x8] +; SVE-NEXT: st1d { z1.d }, p0, [x9] +; SVE-NEXT: ret + %ptr_i8 = getelementptr i8, ptr %ptr, i64 1 + %ptr_vec = bitcast ptr 
%ptr_i8 to ptr + %load = load <8 x i64>, ptr %ptr_vec, align 1 + %sel = select <8 x i1> %mask, <8 x i64> %data, <8 x i64> %load + store <8 x i64> %sel, ptr %ptr_vec, align 1 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll index 9efe0b33910c8..122dc57d79473 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll @@ -36,9 +36,8 @@ define void @select_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: fcmne p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -57,12 +56,10 @@ define void @select_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h -; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z2.h, z3.h -; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z1.h -; VBITS_GE_256-NEXT: sel z1.h, p2, z2.h, z3.h -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: fcmne p1.h, p0/z, z0.h, z1.h +; VBITS_GE_256-NEXT: fcmne p0.h, p0/z, z2.h, z3.h +; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v32f16: @@ -70,9 +67,8 @@ define void @select_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h -; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: fcmne p0.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z1.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <32 x half>, ptr %a %op2 = load <32 x half>, ptr %b @@ -88,9 +84,8 @@ define void @select_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: fcmne p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x half>, ptr %a %op2 = load <64 x half>, ptr %b @@ -106,9 +101,8 @@ define void @select_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: fcmne p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x half>, ptr %a %op2 = load <128 x half>, ptr %b @@ -149,9 +143,8 @@ define void @select_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s -; CHECK-NEXT: 
st1w { z0.s }, p0, [x0] +; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -170,12 +163,10 @@ define void @select_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s -; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z2.s, z3.s -; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z1.s -; VBITS_GE_256-NEXT: sel z1.s, p2, z2.s, z3.s -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: fcmne p1.s, p0/z, z0.s, z1.s +; VBITS_GE_256-NEXT: fcmne p0.s, p0/z, z2.s, z3.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v16f32: @@ -183,9 +174,8 @@ define void @select_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s -; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: fcmne p0.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x float>, ptr %a %op2 = load <16 x float>, ptr %b @@ -201,9 +191,8 @@ define void @select_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x float>, ptr %a %op2 = load <32 x float>, ptr %b @@ -219,9 +208,8 @@ define void @select_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x float>, ptr %a %op2 = load <64 x float>, ptr %b @@ -263,9 +251,8 @@ define void @select_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: fcmne p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: st1d { z1.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -284,12 +271,10 @@ define void @select_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d -; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, z3.d -; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_256-NEXT: sel z1.d, p2, z2.d, z3.d -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: fcmne p1.d, p0/z, z0.d, z1.d +; 
VBITS_GE_256-NEXT: fcmne p0.d, p0/z, z2.d, z3.d +; VBITS_GE_256-NEXT: st1d { z1.d }, p1, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v8f64: @@ -297,9 +282,8 @@ define void @select_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d -; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: fcmne p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z1.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x double>, ptr %a %op2 = load <8 x double>, ptr %b @@ -315,9 +299,8 @@ define void @select_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: fcmne p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: st1d { z1.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x double>, ptr %a %op2 = load <16 x double>, ptr %b @@ -333,9 +316,8 @@ define void @select_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: fcmne p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: st1d { z1.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x double>, ptr %a %op2 = load <32 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll index 9cebbc4aab9b7..291cddf2b8912 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll @@ -35,9 +35,8 @@ define void @select_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b -; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: st1b { z1.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -56,12 +55,10 @@ define void @select_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8] ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b -; VBITS_GE_256-NEXT: cmpeq p2.b, p0/z, z2.b, z3.b -; VBITS_GE_256-NEXT: sel z0.b, p1, z0.b, z1.b -; VBITS_GE_256-NEXT: sel z1.b, p2, z2.b, z3.b -; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p1.b, p0/z, z0.b, z1.b +; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z2.b, z3.b +; VBITS_GE_256-NEXT: st1b { z1.b }, p1, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v64i8: @@ -69,9 +66,8 @@ define void @select_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b -; VBITS_GE_512-NEXT: sel z0.b, p1, 
z0.b, z1.b -; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: cmpne p0.b, p0/z, z0.b, z1.b +; VBITS_GE_512-NEXT: st1b { z1.b }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <64 x i8>, ptr %a %op2 = load <64 x i8>, ptr %b @@ -87,9 +83,8 @@ define void @select_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b -; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: st1b { z1.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a %op2 = load <128 x i8>, ptr %b @@ -105,9 +100,8 @@ define void @select_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b -; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: st1b { z1.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <256 x i8>, ptr %a %op2 = load <256 x i8>, ptr %b @@ -148,9 +142,8 @@ define void @select_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -169,12 +162,10 @@ define void @select_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h -; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z2.h, z3.h -; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z1.h -; VBITS_GE_256-NEXT: sel z1.h, p2, z2.h, z3.h -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p1.h, p0/z, z0.h, z1.h +; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z2.h, z3.h +; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v32i16: @@ -182,9 +173,8 @@ define void @select_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h -; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: cmpne p0.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z1.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <32 x i16>, ptr %a %op2 = load <32 x i16>, ptr %b @@ -200,9 +190,8 @@ define void @select_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i16>, ptr %a %op2 = load <64 x i16>, ptr %b @@ -218,9 +207,8 @@ define void @select_v128i16(ptr %a, ptr 
%b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i16>, ptr %a %op2 = load <128 x i16>, ptr %b @@ -261,9 +249,8 @@ define void @select_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -282,12 +269,10 @@ define void @select_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s -; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z2.s, z3.s -; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z1.s -; VBITS_GE_256-NEXT: sel z1.s, p2, z2.s, z3.s -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p1.s, p0/z, z0.s, z1.s +; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z2.s, z3.s +; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v16i32: @@ -295,9 +280,8 @@ define void @select_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s -; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: cmpne p0.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x i32>, ptr %a %op2 = load <16 x i32>, ptr %b @@ -313,9 +297,8 @@ define void @select_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i32>, ptr %a %op2 = load <32 x i32>, ptr %b @@ -331,9 +314,8 @@ define void @select_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s -; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i32>, ptr %a %op2 = load <64 x i32>, ptr %b @@ -375,9 +357,8 @@ define void @select_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, 
z1.d +; CHECK-NEXT: st1d { z1.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -396,12 +377,10 @@ define void @select_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d -; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z2.d, z3.d -; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_256-NEXT: sel z1.d, p2, z2.d, z3.d -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z0.d, z1.d +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z2.d, z3.d +; VBITS_GE_256-NEXT: st1d { z1.d }, p1, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v8i64: @@ -409,9 +388,8 @@ define void @select_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d -; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z1.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x i64>, ptr %a %op2 = load <8 x i64>, ptr %b @@ -427,9 +405,8 @@ define void @select_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: st1d { z1.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i64>, ptr %a %op2 = load <16 x i64>, ptr %b @@ -445,9 +422,8 @@ define void @select_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d -; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: st1d { z1.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i64>, ptr %a %op2 = load <32 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index c48ee3939bd2e..2eff6da0866f8 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -30,9 +30,7 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: // %bb.1: // %vector.body ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: ldr z4, [x0] -; CHECK-NEXT: ldr z5, [x0, #2, mul vl] -; CHECK-NEXT: ldr z6, [x0, #3, mul vl] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: umov w8, v0.b[8] ; CHECK-NEXT: mov v1.b[1], v0.b[1] ; CHECK-NEXT: fmov s2, w8 @@ -62,20 +60,20 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: lsl z0.s, z0.s, #31 -; CHECK-NEXT: bic z1.d, z4.d, z1.d +; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 ; CHECK-NEXT: lsl z2.s, z2.s, #31 -; CHECK-NEXT: ldr z4, [x0, #1, mul vl] +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; 
CHECK-NEXT: asr z0.s, z0.s, #31 -; CHECK-NEXT: str z1, [x0] ; CHECK-NEXT: lsl z3.s, z3.s, #31 ; CHECK-NEXT: asr z2.s, z2.s, #31 -; CHECK-NEXT: bic z0.d, z5.d, z0.d +; CHECK-NEXT: st1w { z1.s }, p1, [x0] +; CHECK-NEXT: cmpne p2.s, p0/z, z0.s, #0 ; CHECK-NEXT: asr z3.s, z3.s, #31 -; CHECK-NEXT: bic z1.d, z4.d, z2.d -; CHECK-NEXT: str z0, [x0, #2, mul vl] -; CHECK-NEXT: bic z3.d, z6.d, z3.d -; CHECK-NEXT: str z1, [x0, #1, mul vl] -; CHECK-NEXT: str z3, [x0, #3, mul vl] +; CHECK-NEXT: cmpne p3.s, p0/z, z3.s, #0 +; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0 +; CHECK-NEXT: st1w { z1.s }, p2, [x0, #2, mul vl] +; CHECK-NEXT: st1w { z1.s }, p3, [x0, #3, mul vl] +; CHECK-NEXT: st1w { z1.s }, p0, [x0, #1, mul vl] ; CHECK-NEXT: .LBB1_2: // %exit ; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll index ec0693a541e44..c43e929f47848 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -194,14 +194,14 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask define void @select_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h -; CHECK-NEXT: mov z0.h, p1/m, z1.h -; CHECK-NEXT: sel z1.h, p0, z2.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: mov x8, #8 // =0x8 +; CHECK-NEXT: fcmne p1.h, p0/z, z1.h, z0.h +; CHECK-NEXT: fcmne p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0, x8, lsl #1] +; CHECK-NEXT: st1h { z3.h }, p0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: select_v16f16: @@ -429,14 +429,14 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m define void @select_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z0.s -; CHECK-NEXT: fcmeq p0.s, p0/z, z2.s, z3.s -; CHECK-NEXT: mov z0.s, p1/m, z1.s -; CHECK-NEXT: sel z1.s, p0, z2.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: mov x8, #4 // =0x4 +; CHECK-NEXT: fcmne p1.s, p0/z, z1.s, z0.s +; CHECK-NEXT: fcmne p0.s, p0/z, z2.s, z3.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0, x8, lsl #2] +; CHECK-NEXT: st1w { z3.s }, p0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: select_v8f32: @@ -553,14 +553,14 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> define void @select_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z0.d -; CHECK-NEXT: fcmeq p0.d, p0/z, z2.d, z3.d -; CHECK-NEXT: mov z0.d, p1/m, z1.d -; CHECK-NEXT: sel z1.d, p0, z2.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: mov x8, #2 // =0x2 +; CHECK-NEXT: fcmne p1.d, p0/z, z1.d, z0.d +; CHECK-NEXT: fcmne p0.d, p0/z, z2.d, z3.d +; CHECK-NEXT: 
st1d { z0.d }, p1, [x0, x8, lsl #3] +; CHECK-NEXT: st1d { z3.d }, p0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: select_v4f64: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll index 39701131d7db6..3787b23547afc 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll @@ -288,14 +288,14 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) define void @select_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: cmpeq p1.b, p0/z, z1.b, z0.b -; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b -; CHECK-NEXT: mov z0.b, p1/m, z1.b -; CHECK-NEXT: sel z1.b, p0, z2.b, z3.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: mov w8, #16 // =0x10 +; CHECK-NEXT: cmpne p1.b, p0/z, z1.b, z0.b +; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, z3.b +; CHECK-NEXT: st1b { z0.b }, p1, [x0, x8] +; CHECK-NEXT: st1b { z3.b }, p0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: select_v32i8: @@ -692,14 +692,14 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) { define void @select_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: cmpeq p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z3.h -; CHECK-NEXT: mov z0.h, p1/m, z1.h -; CHECK-NEXT: sel z1.h, p0, z2.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: mov x8, #8 // =0x8 +; CHECK-NEXT: cmpne p1.h, p0/z, z1.h, z0.h +; CHECK-NEXT: cmpne p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0, x8, lsl #1] +; CHECK-NEXT: st1h { z3.h }, p0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: select_v16i16: @@ -906,14 +906,14 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) { define void @select_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z0.s -; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z3.s -; CHECK-NEXT: mov z0.s, p1/m, z1.s -; CHECK-NEXT: sel z1.s, p0, z2.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: mov x8, #4 // =0x4 +; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, z0.s +; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, z3.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0, x8, lsl #2] +; CHECK-NEXT: st1w { z3.s }, p0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: select_v8i32: @@ -1039,14 +1039,14 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) { define void @select_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z0.d -; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z3.d -; CHECK-NEXT: mov z0.d, p1/m, z1.d -; CHECK-NEXT: sel z1.d, p0, z2.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: mov x8, #2 // =0x2 +; CHECK-NEXT: cmpne p1.d, 
p0/z, z1.d, z0.d +; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, z3.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0, x8, lsl #3] +; CHECK-NEXT: st1d { z3.d }, p0, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: select_v4i64: diff --git a/llvm/test/CodeGen/RISCV/combine-storetomstore.ll b/llvm/test/CodeGen/RISCV/combine-storetomstore.ll new file mode 100644 index 0000000000000..c7d1f76e73cf2 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/combine-storetomstore.ll @@ -0,0 +1,684 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64-- -mattr=+m,+v,+f | FileCheck %s -check-prefix=RISCV + +define void @test_masked_store_success_v4i8(<4 x i8> %x, ptr %ptr, <4 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v4i8: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RISCV-NEXT: vse8.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <4 x i8>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i8> %x, <4 x i8> %load + store <4 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4i16(<4 x i16> %x, ptr %ptr, <4 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v4i16: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RISCV-NEXT: vse16.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <4 x i16>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i16> %x, <4 x i16> %load + store <4 x i16> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4i32(<4 x i32> %x, ptr %ptr, <4 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v4i32: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RISCV-NEXT: vse32.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <4 x i32>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %load + store <4 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4i64(<4 x i64> %x, ptr %ptr, <4 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v4i64: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RISCV-NEXT: vse64.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <4 x i64>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %load + store <4 x i64> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4f16(<4 x half> %x, ptr %ptr, <4 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v4f16: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RISCV-NEXT: vmv1r.v v9, v0 +; RISCV-NEXT: vfirst.m a3, v0 +; RISCV-NEXT: mv a2, a0 +; RISCV-NEXT: beqz a3, .LBB4_2 +; RISCV-NEXT: # %bb.1: +; RISCV-NEXT: mv a2, a1 +; RISCV-NEXT: .LBB4_2: +; RISCV-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RISCV-NEXT: vmv.v.i v8, 0 +; RISCV-NEXT: vmv1r.v v0, v9 +; RISCV-NEXT: vmerge.vim v8, v8, 1, v0 +; RISCV-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RISCV-NEXT: vslidedown.vi v8, v8, 2 +; RISCV-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RISCV-NEXT: vmsne.vi v8, v8, 0 +; RISCV-NEXT: vmv.v.i v10, 0 +; RISCV-NEXT: vmv1r.v v0, v8 +; RISCV-NEXT: vmerge.vim v11, v10, 1, v0 +; RISCV-NEXT: vslidedown.vi v11, v11, 1 +; RISCV-NEXT: vmv.x.s a3, v11 +; RISCV-NEXT: andi a3, a3, 1 +; RISCV-NEXT: bnez a3, .LBB4_4 +; RISCV-NEXT: # %bb.3: +; RISCV-NEXT: addi a3, a1, 6 +; RISCV-NEXT: j .LBB4_5 +; RISCV-NEXT: .LBB4_4: +; RISCV-NEXT: addi a3, a0, 24 +; RISCV-NEXT: .LBB4_5: +; RISCV-NEXT: vmv1r.v v0, v9 +; RISCV-NEXT: vmerge.vim v9, v10, 1, v0 +; RISCV-NEXT: 
vslidedown.vi v9, v9, 1 +; RISCV-NEXT: vmv.x.s a4, v9 +; RISCV-NEXT: andi a4, a4, 1 +; RISCV-NEXT: bnez a4, .LBB4_7 +; RISCV-NEXT: # %bb.6: +; RISCV-NEXT: addi a5, a1, 2 +; RISCV-NEXT: j .LBB4_8 +; RISCV-NEXT: .LBB4_7: +; RISCV-NEXT: addi a5, a0, 8 +; RISCV-NEXT: .LBB4_8: +; RISCV-NEXT: lh a4, 0(a2) +; RISCV-NEXT: lh a2, 0(a3) +; RISCV-NEXT: lh a3, 0(a5) +; RISCV-NEXT: vfirst.m a5, v8 +; RISCV-NEXT: beqz a5, .LBB4_10 +; RISCV-NEXT: # %bb.9: +; RISCV-NEXT: addi a0, a1, 4 +; RISCV-NEXT: j .LBB4_11 +; RISCV-NEXT: .LBB4_10: +; RISCV-NEXT: addi a0, a0, 16 +; RISCV-NEXT: .LBB4_11: +; RISCV-NEXT: lh a0, 0(a0) +; RISCV-NEXT: sh a4, 0(a1) +; RISCV-NEXT: sh a3, 2(a1) +; RISCV-NEXT: sh a0, 4(a1) +; RISCV-NEXT: sh a2, 6(a1) +; RISCV-NEXT: ret + %load = load <4 x half>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x half> %x, <4 x half> %load + store <4 x half> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4f32(<4 x float> %x, ptr %ptr, <4 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v4f32: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RISCV-NEXT: vse32.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <4 x float>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x float> %x, <4 x float> %load + store <4 x float> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4f64(<4 x double> %x, ptr %ptr, <4 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v4f64: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RISCV-NEXT: vse64.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <4 x double>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x double> %x, <4 x double> %load + store <4 x double> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8i8(<8 x i8> %x, ptr %ptr, <8 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v8i8: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RISCV-NEXT: vse8.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <8 x i8>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i8> %x, <8 x i8> %load + store <8 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8i16(<8 x i16> %x, ptr %ptr, <8 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v8i16: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RISCV-NEXT: vse16.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <8 x i16>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %load + store <8 x i16> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8i32(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v8i32: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RISCV-NEXT: vse32.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8i64(<8 x i64> %x, ptr %ptr, <8 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v8i64: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RISCV-NEXT: vse64.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <8 x i64>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %load + store <8 x i64> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8f16(<8 x half> 
%x, ptr %ptr, <8 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v8f16: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RISCV-NEXT: vmv1r.v v8, v0 +; RISCV-NEXT: vfirst.m a3, v0 +; RISCV-NEXT: mv a2, a0 +; RISCV-NEXT: beqz a3, .LBB11_2 +; RISCV-NEXT: # %bb.1: +; RISCV-NEXT: mv a2, a1 +; RISCV-NEXT: .LBB11_2: +; RISCV-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RISCV-NEXT: vmv.v.i v9, 0 +; RISCV-NEXT: vmv1r.v v0, v8 +; RISCV-NEXT: vmerge.vim v9, v9, 1, v0 +; RISCV-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RISCV-NEXT: vslidedown.vi v9, v9, 4 +; RISCV-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RISCV-NEXT: vmsne.vi v11, v9, 0 +; RISCV-NEXT: vmv.v.i v10, 0 +; RISCV-NEXT: vmv1r.v v0, v11 +; RISCV-NEXT: vmerge.vim v9, v10, 1, v0 +; RISCV-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RISCV-NEXT: vslidedown.vi v9, v9, 2 +; RISCV-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RISCV-NEXT: vmsne.vi v9, v9, 0 +; RISCV-NEXT: vmv.v.i v12, 0 +; RISCV-NEXT: vmv1r.v v0, v9 +; RISCV-NEXT: vmerge.vim v13, v12, 1, v0 +; RISCV-NEXT: vslidedown.vi v13, v13, 1 +; RISCV-NEXT: vmv.x.s a3, v13 +; RISCV-NEXT: andi a3, a3, 1 +; RISCV-NEXT: bnez a3, .LBB11_4 +; RISCV-NEXT: # %bb.3: +; RISCV-NEXT: addi a3, a1, 14 +; RISCV-NEXT: j .LBB11_5 +; RISCV-NEXT: .LBB11_4: +; RISCV-NEXT: addi a3, a0, 56 +; RISCV-NEXT: .LBB11_5: +; RISCV-NEXT: vmv1r.v v0, v8 +; RISCV-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RISCV-NEXT: vmerge.vim v10, v10, 1, v0 +; RISCV-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RISCV-NEXT: vslidedown.vi v10, v10, 2 +; RISCV-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RISCV-NEXT: vmsne.vi v10, v10, 0 +; RISCV-NEXT: vmv1r.v v0, v10 +; RISCV-NEXT: vmerge.vim v13, v12, 1, v0 +; RISCV-NEXT: vslidedown.vi v13, v13, 1 +; RISCV-NEXT: vmv.x.s a4, v13 +; RISCV-NEXT: andi a4, a4, 1 +; RISCV-NEXT: bnez a4, .LBB11_8 +; RISCV-NEXT: # %bb.6: +; RISCV-NEXT: addi a4, a1, 6 +; RISCV-NEXT: vfirst.m a5, v11 +; RISCV-NEXT: bnez a5, .LBB11_9 +; RISCV-NEXT: .LBB11_7: +; RISCV-NEXT: addi a5, a0, 32 +; RISCV-NEXT: j .LBB11_10 +; RISCV-NEXT: .LBB11_8: +; RISCV-NEXT: addi a4, a0, 24 +; RISCV-NEXT: vfirst.m a5, v11 +; RISCV-NEXT: beqz a5, .LBB11_7 +; RISCV-NEXT: .LBB11_9: +; RISCV-NEXT: addi a5, a1, 8 +; RISCV-NEXT: .LBB11_10: +; RISCV-NEXT: vmv1r.v v0, v11 +; RISCV-NEXT: vmerge.vim v11, v12, 1, v0 +; RISCV-NEXT: vslidedown.vi v11, v11, 1 +; RISCV-NEXT: vmv.x.s a6, v11 +; RISCV-NEXT: andi a6, a6, 1 +; RISCV-NEXT: bnez a6, .LBB11_14 +; RISCV-NEXT: # %bb.11: +; RISCV-NEXT: addi a6, a1, 10 +; RISCV-NEXT: vfirst.m a7, v9 +; RISCV-NEXT: bnez a7, .LBB11_15 +; RISCV-NEXT: .LBB11_12: +; RISCV-NEXT: addi a7, a0, 48 +; RISCV-NEXT: vfirst.m t0, v10 +; RISCV-NEXT: bnez t0, .LBB11_16 +; RISCV-NEXT: .LBB11_13: +; RISCV-NEXT: addi t1, a0, 16 +; RISCV-NEXT: j .LBB11_17 +; RISCV-NEXT: .LBB11_14: +; RISCV-NEXT: addi a6, a0, 40 +; RISCV-NEXT: vfirst.m a7, v9 +; RISCV-NEXT: beqz a7, .LBB11_12 +; RISCV-NEXT: .LBB11_15: +; RISCV-NEXT: addi a7, a1, 12 +; RISCV-NEXT: vfirst.m t0, v10 +; RISCV-NEXT: beqz t0, .LBB11_13 +; RISCV-NEXT: .LBB11_16: +; RISCV-NEXT: addi t1, a1, 4 +; RISCV-NEXT: .LBB11_17: +; RISCV-NEXT: vmv1r.v v0, v8 +; RISCV-NEXT: lh t0, 0(a2) +; RISCV-NEXT: lh a2, 0(a3) +; RISCV-NEXT: lh a3, 0(a4) +; RISCV-NEXT: lh a4, 0(a5) +; RISCV-NEXT: lh a5, 0(a6) +; RISCV-NEXT: lh a6, 0(a7) +; RISCV-NEXT: lh a7, 0(t1) +; RISCV-NEXT: vmerge.vim v8, v12, 1, v0 +; RISCV-NEXT: vslidedown.vi v8, v8, 1 +; RISCV-NEXT: vmv.x.s t1, v8 +; RISCV-NEXT: andi t1, t1, 1 +; RISCV-NEXT: bnez t1, .LBB11_19 +; RISCV-NEXT: # %bb.18: +; RISCV-NEXT: addi a0, 
a1, 2 +; RISCV-NEXT: j .LBB11_20 +; RISCV-NEXT: .LBB11_19: +; RISCV-NEXT: addi a0, a0, 8 +; RISCV-NEXT: .LBB11_20: +; RISCV-NEXT: lh a0, 0(a0) +; RISCV-NEXT: sh t0, 0(a1) +; RISCV-NEXT: sh a0, 2(a1) +; RISCV-NEXT: sh a7, 4(a1) +; RISCV-NEXT: sh a3, 6(a1) +; RISCV-NEXT: sh a4, 8(a1) +; RISCV-NEXT: sh a5, 10(a1) +; RISCV-NEXT: sh a6, 12(a1) +; RISCV-NEXT: sh a2, 14(a1) +; RISCV-NEXT: ret + %load = load <8 x half>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x half> %x, <8 x half> %load + store <8 x half> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8f32(<8 x float> %x, ptr %ptr, <8 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v8f32: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RISCV-NEXT: vse32.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <8 x float>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x float> %x, <8 x float> %load + store <8 x float> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8f64(<8 x double> %x, ptr %ptr, <8 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v8f64: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RISCV-NEXT: vse64.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <8 x double>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x double> %x, <8 x double> %load + store <8 x double> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v16i8(<16 x i8> %x, ptr %ptr, <16 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v16i8: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RISCV-NEXT: vse8.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <16 x i8>, ptr %ptr, align 32 + %sel = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %load + store <16 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v16i16(<16 x i16> %x, ptr %ptr, <16 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v16i16: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RISCV-NEXT: vse16.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <16 x i16>, ptr %ptr, align 32 + %sel = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %load + store <16 x i16> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v16i32(<16 x i32> %x, ptr %ptr, <16 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v16i32: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RISCV-NEXT: vse32.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <16 x i32>, ptr %ptr, align 32 + %sel = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %load + store <16 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v32i8(<32 x i8> %x, ptr %ptr, <32 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v32i8: +; RISCV: # %bb.0: +; RISCV-NEXT: li a1, 32 +; RISCV-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; RISCV-NEXT: vse8.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <32 x i8>, ptr %ptr, align 32 + %sel = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %load + store <32 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v32i16(<32 x i16> %x, ptr %ptr, <32 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v32i16: +; RISCV: # %bb.0: +; RISCV-NEXT: li a1, 32 +; RISCV-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; RISCV-NEXT: vse16.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <32 x i16>, ptr %ptr, align 32 + %sel = select <32 
x i1> %mask, <32 x i16> %x, <32 x i16> %load + store <32 x i16> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v64i8(<64 x i8> %x, ptr %ptr, <64 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_v64i8: +; RISCV: # %bb.0: +; RISCV-NEXT: li a1, 64 +; RISCV-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; RISCV-NEXT: vse8.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <64 x i8>, ptr %ptr, align 32 + %sel = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %load + store <64 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_invert_mask_v4i32(<4 x i32> %x, ptr %ptr, <4 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_invert_mask_v4i32: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RISCV-NEXT: vmnot.m v0, v0 +; RISCV-NEXT: vse32.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <4 x i32>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i32> %load, <4 x i32> %x + store <4 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_invert_mask_v8i32(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_invert_mask_v8i32: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RISCV-NEXT: vmnot.m v0, v0 +; RISCV-NEXT: vse32.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %load, <8 x i32> %x + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_invert_mask_v16i32(<16 x i32> %x, ptr %ptr, <16 x i1> %mask) { +; RISCV-LABEL: test_masked_store_success_invert_mask_v16i32: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RISCV-NEXT: vmnot.m v0, v0 +; RISCV-NEXT: vse32.v v8, (a0), v0.t +; RISCV-NEXT: ret + %load = load <16 x i32>, ptr %ptr, align 32 + %sel = select <16 x i1> %mask, <16 x i32> %load, <16 x i32> %x + store <16 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_zextload(<4 x i64> %x, ptr %ptr, <4 x i1> %mask) { +; RISCV-LABEL: test_masked_store_zextload: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RISCV-NEXT: vle32.v v12, (a0) +; RISCV-NEXT: vzext.vf2 v10, v12 +; RISCV-NEXT: vmerge.vvm v8, v10, v8, v0 +; RISCV-NEXT: vse64.v v8, (a0) +; RISCV-NEXT: ret + %load = load <4 x i32>, ptr %ptr, align 32 + %zext = zext <4 x i32> %load to <4 x i64> + %masked = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %zext + store <4 x i64> %masked, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { +; RISCV-LABEL: test_masked_store_volatile_load: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RISCV-NEXT: vle32.v v10, (a0) +; RISCV-NEXT: vmerge.vvm v8, v10, v8, v0 +; RISCV-NEXT: vse32.v v8, (a0) +; RISCV-NEXT: ret + %load = load volatile <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { +; RISCV-LABEL: test_masked_store_volatile_store: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RISCV-NEXT: vle32.v v10, (a0) +; RISCV-NEXT: vmerge.vvm v8, v10, v8, v0 +; RISCV-NEXT: vse32.v v8, (a0) +; RISCV-NEXT: ret + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + store volatile <8 x 
i32> %sel, ptr %ptr, align 32 + ret void +} + +declare void @use_vec(<8 x i32>) + +define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) nounwind { +; RISCV-LABEL: test_masked_store_intervening: +; RISCV: # %bb.0: +; RISCV-NEXT: addi sp, sp, -32 +; RISCV-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RISCV-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RISCV-NEXT: csrr a1, vlenb +; RISCV-NEXT: slli a2, a1, 2 +; RISCV-NEXT: add a1, a2, a1 +; RISCV-NEXT: sub sp, sp, a1 +; RISCV-NEXT: csrr a1, vlenb +; RISCV-NEXT: slli a1, a1, 2 +; RISCV-NEXT: add a1, sp, a1 +; RISCV-NEXT: addi a1, a1, 16 +; RISCV-NEXT: vs1r.v v0, (a1) # vscale x 8-byte Folded Spill +; RISCV-NEXT: mv s0, a0 +; RISCV-NEXT: csrr a1, vlenb +; RISCV-NEXT: slli a1, a1, 1 +; RISCV-NEXT: add a1, sp, a1 +; RISCV-NEXT: addi a1, a1, 16 +; RISCV-NEXT: vs2r.v v8, (a1) # vscale x 16-byte Folded Spill +; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RISCV-NEXT: vle32.v v8, (a0) +; RISCV-NEXT: addi a1, sp, 16 +; RISCV-NEXT: vs2r.v v8, (a1) # vscale x 16-byte Folded Spill +; RISCV-NEXT: vmv.v.i v8, 0 +; RISCV-NEXT: vse32.v v8, (a0) +; RISCV-NEXT: call use_vec +; RISCV-NEXT: csrr a0, vlenb +; RISCV-NEXT: slli a0, a0, 2 +; RISCV-NEXT: add a0, sp, a0 +; RISCV-NEXT: addi a0, a0, 16 +; RISCV-NEXT: vl1r.v v0, (a0) # vscale x 8-byte Folded Reload +; RISCV-NEXT: csrr a0, vlenb +; RISCV-NEXT: slli a0, a0, 1 +; RISCV-NEXT: add a0, sp, a0 +; RISCV-NEXT: addi a0, a0, 16 +; RISCV-NEXT: vl2r.v v8, (a0) # vscale x 16-byte Folded Reload +; RISCV-NEXT: addi a0, sp, 16 +; RISCV-NEXT: vl2r.v v10, (a0) # vscale x 16-byte Folded Reload +; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RISCV-NEXT: vmerge.vvm v8, v10, v8, v0 +; RISCV-NEXT: vse32.v v8, (s0) +; RISCV-NEXT: csrr a0, vlenb +; RISCV-NEXT: slli a1, a0, 2 +; RISCV-NEXT: add a0, a1, a0 +; RISCV-NEXT: add sp, sp, a0 +; RISCV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RISCV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RISCV-NEXT: addi sp, sp, 32 +; RISCV-NEXT: ret + %load = load <8 x i32>, ptr %ptr, align 32 + store <8 x i32> zeroinitializer, ptr %ptr, align 32 + %tmp = load <8 x i32>, ptr %ptr + call void @use_vec(<8 x i32> %tmp) + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + + +define void @test_masked_store_multiple_v8i32(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %mask, <8 x i1> %mask2) { +; RISCV-LABEL: test_masked_store_multiple_v8i32: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RISCV-NEXT: vmv1r.v v13, v0 +; RISCV-NEXT: vle32.v v14, (a1) +; RISCV-NEXT: vmv1r.v v0, v12 +; RISCV-NEXT: vmerge.vvm v10, v14, v10, v0 +; RISCV-NEXT: vmv1r.v v0, v13 +; RISCV-NEXT: vse32.v v8, (a0), v0.t +; RISCV-NEXT: vse32.v v10, (a1) +; RISCV-NEXT: ret + %load = load <8 x i32>, ptr %ptr1, align 32 + %load2 = load <8 x i32>, ptr %ptr2, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + %sel2 = select <8 x i1> %mask2, <8 x i32> %y, <8 x i32> %load2 + store <8 x i32> %sel, ptr %ptr1, align 32 + store <8 x i32> %sel2, ptr %ptr2, align 32 + ret void +} + +define void @test_masked_store_multiple_v8i64(<8 x i64> %x, <8 x i64> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %mask, <8 x i1> %mask2) { +; RISCV-LABEL: test_masked_store_multiple_v8i64: +; RISCV: # %bb.0: +; RISCV-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RISCV-NEXT: vmv1r.v v17, v0 +; RISCV-NEXT: vle64.v v20, (a1) +; RISCV-NEXT: vmv1r.v v0, v16 +; RISCV-NEXT: vmerge.vvm v12, v20, v12, v0 +; RISCV-NEXT: vmv1r.v 
v0, v17 +; RISCV-NEXT: vse64.v v8, (a0), v0.t +; RISCV-NEXT: vse64.v v12, (a1) +; RISCV-NEXT: ret + %load = load <8 x i64>, ptr %ptr1, align 32 + %load2 = load <8 x i64>, ptr %ptr2, align 32 + %sel = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %load + %sel2 = select <8 x i1> %mask2, <8 x i64> %y, <8 x i64> %load2 + store <8 x i64> %sel, ptr %ptr1, align 32 + store <8 x i64> %sel2, ptr %ptr2, align 32 + ret void +} + +define void @test_masked_store_unaligned_v4i32(<4 x i32> %data, ptr %ptr, <4 x i1> %mask) { +; RISCV-LABEL: test_masked_store_unaligned_v4i32: +; RISCV: # %bb.0: +; RISCV-NEXT: addi a0, a0, 1 +; RISCV-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RISCV-NEXT: vle8.v v9, (a0) +; RISCV-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RISCV-NEXT: vmerge.vvm v8, v9, v8, v0 +; RISCV-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RISCV-NEXT: vse8.v v8, (a0) +; RISCV-NEXT: ret + %ptr_i8 = getelementptr i8, ptr %ptr, i32 1 + %ptr_vec = bitcast ptr %ptr_i8 to ptr + %load = load <4 x i32>, ptr %ptr_vec, align 1 + %sel = select <4 x i1> %mask, <4 x i32> %data, <4 x i32> %load + store <4 x i32> %sel, ptr %ptr_vec, align 1 + ret void +} + +define void @test_masked_store_unaligned_v4i64(<4 x i64> %data, ptr %ptr, <4 x i1> %mask) { +; RISCV-LABEL: test_masked_store_unaligned_v4i64: +; RISCV: # %bb.0: +; RISCV-NEXT: addi a0, a0, 1 +; RISCV-NEXT: li a1, 32 +; RISCV-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; RISCV-NEXT: vle8.v v10, (a0) +; RISCV-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RISCV-NEXT: vmerge.vvm v8, v10, v8, v0 +; RISCV-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; RISCV-NEXT: vse8.v v8, (a0) +; RISCV-NEXT: ret + %ptr_i8 = getelementptr i8, ptr %ptr, i64 1 + %ptr_vec = bitcast ptr %ptr_i8 to ptr + %load = load <4 x i64>, ptr %ptr_vec, align 1 + %sel = select <4 x i1> %mask, <4 x i64> %data, <4 x i64> %load + store <4 x i64> %sel, ptr %ptr_vec, align 1 + ret void +} + +define void @test_masked_store_unaligned_v8i32(<8 x i32> %data, ptr %ptr, <8 x i1> %mask) { +; RISCV-LABEL: test_masked_store_unaligned_v8i32: +; RISCV: # %bb.0: +; RISCV-NEXT: addi a0, a0, 1 +; RISCV-NEXT: li a1, 32 +; RISCV-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; RISCV-NEXT: vle8.v v10, (a0) +; RISCV-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RISCV-NEXT: vmerge.vvm v8, v10, v8, v0 +; RISCV-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; RISCV-NEXT: vse8.v v8, (a0) +; RISCV-NEXT: ret + %ptr_i8 = getelementptr i8, ptr %ptr, i32 1 + %ptr_vec = bitcast ptr %ptr_i8 to ptr + %load = load <8 x i32>, ptr %ptr_vec, align 1 + %sel = select <8 x i1> %mask, <8 x i32> %data, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr_vec, align 1 + ret void +} + +define void @test_masked_store_unaligned_v8i64(<8 x i64> %data, ptr %ptr, <8 x i1> %mask) { +; RISCV-LABEL: test_masked_store_unaligned_v8i64: +; RISCV: # %bb.0: +; RISCV-NEXT: addi a0, a0, 1 +; RISCV-NEXT: li a1, 64 +; RISCV-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; RISCV-NEXT: vle8.v v12, (a0) +; RISCV-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RISCV-NEXT: vmerge.vvm v8, v12, v8, v0 +; RISCV-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; RISCV-NEXT: vse8.v v8, (a0) +; RISCV-NEXT: ret + %ptr_i8 = getelementptr i8, ptr %ptr, i64 1 + %ptr_vec = bitcast ptr %ptr_i8 to ptr + %load = load <8 x i64>, ptr %ptr_vec, align 1 + %sel = select <8 x i1> %mask, <8 x i64> %data, <8 x i64> %load + store <8 x i64> %sel, ptr %ptr_vec, align 1 + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll index 346e40ab0afe5..02825b2bda484 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -5427,18 +5427,18 @@ for.cond.cleanup: ; preds = %vector.body define void @sink_splat_select_op1(ptr nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_select_op1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: li a3, 42 +; CHECK-NEXT: lui a3, 1 +; CHECK-NEXT: li a2, 42 +; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: .LBB117_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmseq.vx v0, v8, a3 -; CHECK-NEXT: vmerge.vxm v8, v8, a1, v0 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vmseq.vx v0, v9, a2 +; CHECK-NEXT: vse32.v v8, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: bne a0, a2, .LBB117_1 +; CHECK-NEXT: bne a0, a3, .LBB117_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -5472,9 +5472,8 @@ define void @sink_splat_select_op2(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB118_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vmseq.vx v0, v9, a2 -; CHECK-NEXT: vmerge.vvm v9, v8, v9, v0 -; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: vmsne.vx v0, v9, a2 +; CHECK-NEXT: vse32.v v8, (a0), v0.t ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: bne a0, a1, .LBB118_1 ; CHECK-NEXT: # %bb.2: # %for.cond.cleanup diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll index ba2cacc087b36..2f86499a2df9e 100644 --- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll +++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll @@ -1974,9 +1974,8 @@ define void @bcast_unfold_fmax_v4f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB60_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vmaxps %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpnltps 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovups %xmm0, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB60_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2007,9 +2006,8 @@ define void @bcast_unfold_fmax_v8f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB61_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1 -; CHECK-NEXT: vmaxps %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpnltps 4096(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovups %ymm0, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB61_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2041,9 +2039,8 @@ define void @bcast_unfold_fmax_v16f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB62_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1 -; CHECK-NEXT: vmaxps %zmm0, %zmm1, %zmm1 -; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpnltps 4096(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovups %zmm0, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB62_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2076,9 +2073,8 @@ define void @bcast_unfold_fmax_v2f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB63_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), 
%xmm1 -; CHECK-NEXT: vmaxpd %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpnltpd 8192(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovupd %xmm0, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB63_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2109,9 +2105,8 @@ define void @bcast_unfold_fmax_v4f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB64_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vmaxpd %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpnltpd 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovupd %ymm0, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB64_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2143,9 +2138,8 @@ define void @bcast_unfold_fmax_v8f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB65_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1 -; CHECK-NEXT: vmaxpd %zmm0, %zmm1, %zmm1 -; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpnltpd 8192(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovupd %zmm0, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB65_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2177,9 +2171,8 @@ define void @bcast_unfold_fmin_v4f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB66_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vminps %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpngtps 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovups %xmm0, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB66_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2210,9 +2203,8 @@ define void @bcast_unfold_fmin_v8f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB67_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1 -; CHECK-NEXT: vminps %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpngtps 4096(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovups %ymm0, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB67_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2244,9 +2236,8 @@ define void @bcast_unfold_fmin_v16f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB68_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1 -; CHECK-NEXT: vminps %zmm0, %zmm1, %zmm1 -; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpngtps 4096(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovups %zmm0, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB68_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2279,9 +2270,8 @@ define void @bcast_unfold_fmin_v2f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB69_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1 -; CHECK-NEXT: vminpd %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpngtpd 8192(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovupd %xmm0, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB69_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2312,9 +2302,8 @@ define void @bcast_unfold_fmin_v4f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB70_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vminpd 
%ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpngtpd 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovupd %ymm0, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB70_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2346,9 +2335,8 @@ define void @bcast_unfold_fmin_v8f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB71_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1 -; CHECK-NEXT: vminpd %zmm0, %zmm1, %zmm1 -; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpngtpd 8192(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovupd %zmm0, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB71_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3161,13 +3149,12 @@ define void @bcast_unfold_pcmpgt_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB96_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpltd 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB96_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3195,13 +3182,12 @@ define void @bcast_unfold_pcmpgt_v8i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB97_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1 -; CHECK-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpltd 4096(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu32 %ymm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB97_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3230,13 +3216,12 @@ define void @bcast_unfold_pcmpgt_v16i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB98_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1 -; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpltd 4096(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovdqu32 %zmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB98_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3265,13 +3250,12 @@ define void @bcast_unfold_pcmpgt_v2i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB99_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1 -; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 -; CHECK-NEXT: 
vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] -; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpltq 8192(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu64 %xmm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB99_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3299,13 +3283,12 @@ define void @bcast_unfold_pcmpgt_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB100_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vpcmpgtq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpltq 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB100_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3334,13 +3317,12 @@ define void @bcast_unfold_pcmpgt_v8i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB101_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1 -; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpltq 8192(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB101_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3369,13 +3351,12 @@ define void @bcast_unfold_pcmpeq_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB102_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpeqd 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB102_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3403,13 +3384,12 @@ define void @bcast_unfold_pcmpeq_v8i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB103_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1 -; CHECK-NEXT: vpcmpeqd %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpeqd 4096(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu32 %ymm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB103_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3438,13 +3418,12 @@ define void @bcast_unfold_pcmpeq_v16i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; 
CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB104_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1 -; CHECK-NEXT: vpcmpeqd %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vpcmpeqd 4096(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovdqu32 %zmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB104_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3473,13 +3452,12 @@ define void @bcast_unfold_pcmpeq_v2i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB105_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1 -; CHECK-NEXT: vpcmpeqq %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] -; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpeqq 8192(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu64 %xmm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB105_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3507,13 +3485,12 @@ define void @bcast_unfold_pcmpeq_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB106_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vpcmpeqq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpeqq 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB106_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3542,13 +3519,12 @@ define void @bcast_unfold_pcmpeq_v8i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB107_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1 -; CHECK-NEXT: vpcmpeqq %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vpcmpeqq 8192(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB107_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -3577,13 +3553,12 @@ define void @bcast_unfold_pcmp_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB108_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 -; CHECK-NEXT: vpcmpltd %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpgtd (%rdi,%rax,4), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $4, %rax ; CHECK-NEXT: cmpq $1023, %rax # 
imm = 0x3FF ; CHECK-NEXT: jg .LBB108_1 @@ -3612,13 +3587,12 @@ define void @bcast_unfold_pcmp_v8i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB109_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1 -; CHECK-NEXT: vpcmpltd %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpgtd (%rdi,%rax,4), %ymm0, %k1 +; CHECK-NEXT: vmovdqu32 %ymm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $8, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: jg .LBB109_1 @@ -3648,13 +3622,12 @@ define void @bcast_unfold_pcmp_v16i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB110_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1 -; CHECK-NEXT: vpcmpltd %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpgtd (%rdi,%rax,4), %zmm0, %k1 +; CHECK-NEXT: vmovdqu32 %zmm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: jg .LBB110_1 @@ -3684,13 +3657,12 @@ define void @bcast_unfold_pcmp_v2i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB111_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1 -; CHECK-NEXT: vpcmpltq %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] -; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpgtq (%rdi,%rax,8), %xmm0, %k1 +; CHECK-NEXT: vmovdqu64 %xmm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $2, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: jg .LBB111_1 @@ -3719,13 +3691,12 @@ define void @bcast_unfold_pcmp_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB112_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1 -; CHECK-NEXT: vpcmpltq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpgtq (%rdi,%rax,8), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $4, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: jg .LBB112_1 @@ -3755,13 +3726,12 @@ define void @bcast_unfold_pcmp_v8i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB113_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1 -; CHECK-NEXT: vpcmpltq %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} 
= [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpgtq (%rdi,%rax,8), %zmm0, %k1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $8, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: jg .LBB113_1 @@ -3791,13 +3761,12 @@ define void @bcast_unfold_pcmpu_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB114_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 -; CHECK-NEXT: vpcmpltud %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpnleud (%rdi,%rax,4), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $4, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB114_1 @@ -3826,13 +3795,12 @@ define void @bcast_unfold_pcmpu_v8i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB115_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1 -; CHECK-NEXT: vpcmpltud %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpnleud (%rdi,%rax,4), %ymm0, %k1 +; CHECK-NEXT: vmovdqu32 %ymm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $8, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB115_1 @@ -3862,13 +3830,12 @@ define void @bcast_unfold_pcmpu_v16i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB116_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1 -; CHECK-NEXT: vpcmpltud %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) +; CHECK-NEXT: vpcmpnleud (%rdi,%rax,4), %zmm0, %k1 +; CHECK-NEXT: vmovdqu32 %zmm1, (%rdi,%rax,4) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB116_1 @@ -3898,13 +3865,12 @@ define void @bcast_unfold_pcmpu_v2i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,2] +; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB117_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1 -; CHECK-NEXT: vpcmpltuq %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3] -; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpnleuq (%rdi,%rax,8), %xmm0, %k1 +; CHECK-NEXT: vmovdqu64 %xmm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $2, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB117_1 @@ -3933,13 +3899,12 @@ define void @bcast_unfold_pcmpu_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: 
.p2align 4 ; CHECK-NEXT: .LBB118_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1 -; CHECK-NEXT: vpcmpltuq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpnleuq (%rdi,%rax,8), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $4, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB118_1 @@ -3969,13 +3934,12 @@ define void @bcast_unfold_pcmpu_v8i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2] +; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 = [3,3,3,3,3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB119_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1 -; CHECK-NEXT: vpcmpltuq %zmm0, %zmm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3] -; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) +; CHECK-NEXT: vpcmpnleuq (%rdi,%rax,8), %zmm0, %k1 +; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) {%k1} ; CHECK-NEXT: addq $8, %rax ; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF ; CHECK-NEXT: ja .LBB119_1 @@ -4009,10 +3973,8 @@ define void @bcast_unfold_cmp_v4f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB120_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm2 -; CHECK-NEXT: vcmpltps %xmm0, %xmm2, %k1 -; CHECK-NEXT: vblendmps %xmm2, %xmm1, %xmm2 {%k1} -; CHECK-NEXT: vmovups %xmm2, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpngtps 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB120_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4044,10 +4006,8 @@ define void @bcast_unfold_cmp_v8f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB121_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm2 -; CHECK-NEXT: vcmpltps %ymm0, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm2, %ymm1, %ymm2 {%k1} -; CHECK-NEXT: vmovups %ymm2, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpngtps 4096(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB121_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4080,10 +4040,8 @@ define void @bcast_unfold_cmp_v16f32(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB122_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm2 -; CHECK-NEXT: vcmpltps %zmm0, %zmm2, %k1 -; CHECK-NEXT: vblendmps %zmm2, %zmm1, %zmm2 {%k1} -; CHECK-NEXT: vmovups %zmm2, 4096(%rdi,%rax) +; CHECK-NEXT: vcmpngtps 4096(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB122_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4118,10 +4076,8 @@ define void @bcast_unfold_cmp_v2f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB123_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm2 -; CHECK-NEXT: vcmpltpd %xmm0, %xmm2, %k1 -; CHECK-NEXT: vblendmpd %xmm2, %xmm1, %xmm2 {%k1} -; CHECK-NEXT: vmovupd %xmm2, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpngtpd 8192(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB123_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4153,10 +4109,8 @@ define void @bcast_unfold_cmp_v4f64(ptr %arg) { ; 
CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB124_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm2 -; CHECK-NEXT: vcmpltpd %ymm0, %ymm2, %k1 -; CHECK-NEXT: vblendmpd %ymm2, %ymm1, %ymm2 {%k1} -; CHECK-NEXT: vmovupd %ymm2, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpngtpd 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB124_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4189,10 +4143,8 @@ define void @bcast_unfold_cmp_v8f64(ptr %arg) { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB125_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm2 -; CHECK-NEXT: vcmpltpd %zmm0, %zmm2, %k1 -; CHECK-NEXT: vblendmpd %zmm2, %zmm1, %zmm2 {%k1} -; CHECK-NEXT: vmovupd %zmm2, 8192(%rdi,%rax) +; CHECK-NEXT: vcmpngtpd 8192(%rdi,%rax), %zmm0, %k1 +; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB125_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4254,13 +4206,12 @@ define void @bcast_unfold_ptestm_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB127_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vptestmd %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vptestmd 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB127_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4289,13 +4240,12 @@ define void @bcast_unfold_ptestnm_v4i32(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB128_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vptestnmd %xmm0, %xmm1, %k1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vptestnmd 4096(%rdi,%rax), %xmm0, %k1 +; CHECK-NEXT: vmovdqu32 %xmm1, 4096(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB128_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4324,13 +4274,12 @@ define void @bcast_unfold_ptestm_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB129_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vptestmq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vptestmq 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB129_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -4360,13 +4309,12 @@ define void @bcast_unfold_ptestnm_v4i64(ptr %arg) { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 ; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] +; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] ; CHECK-NEXT: 
.p2align 4 ; CHECK-NEXT: .LBB130_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k1 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3] -; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vptestnmq 8192(%rdi,%rax), %ymm0, %k1 +; CHECK-NEXT: vmovdqu64 %ymm1, 8192(%rdi,%rax) {%k1} ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB130_1 ; CHECK-NEXT: # %bb.2: # %bb10 diff --git a/llvm/test/CodeGen/X86/combine-storetomstore.ll b/llvm/test/CodeGen/X86/combine-storetomstore.ll new file mode 100644 index 0000000000000..c18c89dfdf684 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-storetomstore.ll @@ -0,0 +1,1540 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s -check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s -check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s -check-prefix=AVX512 + +define void @test_masked_store_success_v4i8(<4 x i8> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovd %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovd %xmm1, (%rdi) +; AVX512-NEXT: retq + %load = load <4 x i8>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i8> %x, <4 x i8> %load + store <4 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4i16(<4 x i16> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,u,4,u,8,u,12,u,8,u,12,u,12,u,14,u] +; AVX-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,u,4,u,8,u,12,u,8,u,12,u,12,u,14,u] +; AVX2-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX2-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovq %xmm1, (%rdi) +; AVX512-NEXT: retq + %load = load <4 x i16>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i16> %x, <4 x i16> %load + store <4 x i16> %sel, ptr %ptr, align 32 + ret void 
+} + +define void @test_masked_store_success_v4i32(<4 x i32> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} +; AVX512-NEXT: retq + %load = load <4 x i32>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %load + store <4 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4i64(<4 x i64> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v4i64: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vpmovsxdq %xmm1, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vpmaskmovq %ymm0, %ymm1, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovdqa64 %ymm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <4 x i64>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %load + store <4 x i64> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4f16(<4 x half> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v4f16: +; AVX: # %bb.0: +; AVX-NEXT: vpsrlq $48, %xmm0, %xmm2 +; AVX-NEXT: vpextrw $0, %xmm2, %edx +; AVX-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX-NEXT: vpextrw $0, %xmm2, %ecx +; AVX-NEXT: movzwl 2(%rdi), %eax +; AVX-NEXT: vpextrb $4, %xmm1, %esi +; AVX-NEXT: testb $1, %sil +; AVX-NEXT: cmovnel %ecx, %eax +; AVX-NEXT: vpextrb $8, %xmm1, %ecx +; AVX-NEXT: testb $1, %cl +; AVX-NEXT: jne .LBB4_1 +; AVX-NEXT: # %bb.2: +; AVX-NEXT: movl 4(%rdi), %ecx +; AVX-NEXT: jmp .LBB4_3 +; AVX-NEXT: .LBB4_1: +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX-NEXT: vpextrw $0, %xmm2, %ecx +; AVX-NEXT: .LBB4_3: +; AVX-NEXT: movzwl 6(%rdi), %esi +; AVX-NEXT: vpextrb $12, %xmm1, %r8d +; AVX-NEXT: testb $1, %r8b +; AVX-NEXT: cmovnel %edx, %esi +; AVX-NEXT: vmovd %xmm1, %edx +; AVX-NEXT: testb $1, %dl +; AVX-NEXT: jne .LBB4_4 +; AVX-NEXT: # %bb.5: +; AVX-NEXT: movl (%rdi), %edx +; AVX-NEXT: jmp .LBB4_6 +; AVX-NEXT: .LBB4_4: +; AVX-NEXT: vpextrw $0, %xmm0, %edx +; AVX-NEXT: .LBB4_6: +; AVX-NEXT: movw %dx, (%rdi) +; AVX-NEXT: movw %si, 6(%rdi) +; AVX-NEXT: movw %cx, 4(%rdi) +; AVX-NEXT: movw %ax, 2(%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v4f16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm2 +; AVX2-NEXT: vpextrw $0, %xmm2, %edx +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX2-NEXT: vpextrw $0, %xmm2, %ecx +; AVX2-NEXT: movzwl 2(%rdi), %eax +; AVX2-NEXT: vpextrb $4, %xmm1, %esi +; AVX2-NEXT: testb $1, %sil +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: vpextrb $8, %xmm1, %ecx 
+; AVX2-NEXT: testb $1, %cl +; AVX2-NEXT: jne .LBB4_1 +; AVX2-NEXT: # %bb.2: +; AVX2-NEXT: movl 4(%rdi), %ecx +; AVX2-NEXT: jmp .LBB4_3 +; AVX2-NEXT: .LBB4_1: +; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX2-NEXT: vpextrw $0, %xmm2, %ecx +; AVX2-NEXT: .LBB4_3: +; AVX2-NEXT: movzwl 6(%rdi), %esi +; AVX2-NEXT: vpextrb $12, %xmm1, %r8d +; AVX2-NEXT: testb $1, %r8b +; AVX2-NEXT: cmovnel %edx, %esi +; AVX2-NEXT: vmovd %xmm1, %edx +; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: jne .LBB4_4 +; AVX2-NEXT: # %bb.5: +; AVX2-NEXT: movl (%rdi), %edx +; AVX2-NEXT: jmp .LBB4_6 +; AVX2-NEXT: .LBB4_4: +; AVX2-NEXT: vpextrw $0, %xmm0, %edx +; AVX2-NEXT: .LBB4_6: +; AVX2-NEXT: movw %dx, (%rdi) +; AVX2-NEXT: movw %si, 6(%rdi) +; AVX2-NEXT: movw %cx, 4(%rdi) +; AVX2-NEXT: movw %ax, 2(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v4f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovq %xmm1, (%rdi) +; AVX512-NEXT: retq + %load = load <4 x half>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x half> %x, <4 x half> %load + store <4 x half> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4f32(<4 x float> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v4f32: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovaps %xmm0, (%rdi) {%k1} +; AVX512-NEXT: retq + %load = load <4 x float>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x float> %x, <4 x float> %load + store <4 x float> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v4f64(<4 x double> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v4f64: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vpmovsxdq %xmm1, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vmaskmovpd %ymm0, %ymm1, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v4f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovapd %ymm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <4 x double>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x double> %x, <4 x double> %load + store <4 x double> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8i8(<8 x i8> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; 
AVX-NEXT: vmovq %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovq %xmm1, (%rdi) +; AVX512-NEXT: retq + %load = load <8 x i8>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i8> %x, <8 x i8> %load + store <8 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8i16(<8 x i16> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX2-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} +; AVX512-NEXT: retq + %load = load <8 x i16>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %load + store <8 x i16> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8i32(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovdqa32 %ymm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8i64(<8 x i64> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v8i64: +; AVX: # %bb.0: +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX-NEXT: vpmovsxdq %xmm3, %xmm4 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm3, %xmm3 +; AVX-NEXT: vinsertf128 
$1, %xmm3, %ymm4, %ymm3 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpmovsxdq %xmm2, %xmm4 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX-NEXT: vmaskmovpd %ymm0, %ymm2, (%rdi) +; AVX-NEXT: vmaskmovpd %ymm1, %ymm3, 32(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] +; AVX2-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2 +; AVX2-NEXT: vpmaskmovq %ymm0, %ymm2, (%rdi) +; AVX2-NEXT: vpmaskmovq %ymm1, %ymm3, 32(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i64>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %load + store <8 x i64> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8f16(<8 x half> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v8f16: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v8f16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX2-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} +; AVX512-NEXT: retq + %load = load <8 x half>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x half> %x, <8 x half> %load + store <8 x half> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8f32(<8 x float> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v8f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovaps %ymm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x 
float>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x float> %x, <8 x float> %load + store <8 x float> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v8f64(<8 x double> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v8f64: +; AVX: # %bb.0: +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX-NEXT: vpmovsxdq %xmm3, %xmm4 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm3, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpmovsxdq %xmm2, %xmm4 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX-NEXT: vmaskmovpd %ymm0, %ymm2, (%rdi) +; AVX-NEXT: vmaskmovpd %ymm1, %ymm3, 32(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v8f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] +; AVX2-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2 +; AVX2-NEXT: vmaskmovpd %ymm0, %ymm2, (%rdi) +; AVX2-NEXT: vmaskmovpd %ymm1, %ymm3, 32(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v8f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovupd %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x double>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x double> %x, <8 x double> %load + store <8 x double> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v16i8(<16 x i8> %x, ptr %ptr, <16 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm2 +; AVX-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX512-NEXT: vpmovb2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} +; AVX512-NEXT: retq + %load = load <16 x i8>, ptr %ptr, align 32 + %sel = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %load + store <16 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v16i16(<16 x i16> %x, ptr %ptr, <16 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v16i16: +; AVX: # %bb.0: +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpsllw $15, %xmm2, %xmm2 +; AVX-NEXT: vpsraw $15, %xmm2, %xmm2 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX-NEXT: vpsraw $15, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm3 +; AVX-NEXT: vpblendvb %xmm1, %xmm0, %xmm3, %xmm1 +; AVX-NEXT: 
vmovdqa 16(%rdi), %xmm3 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vmovdqa %xmm1, (%rdi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX2-NEXT: vpsllw $15, %ymm1, %ymm1 +; AVX2-NEXT: vpsraw $15, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX512-NEXT: vpmovb2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <16 x i16>, ptr %ptr, align 32 + %sel = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %load + store <16 x i16> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v16i32(<16 x i32> %x, ptr %ptr, <16 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v16i32: +; AVX: # %bb.0: +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) +; AVX-NEXT: vmaskmovps %ymm1, %ymm3, 32(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX2-NEXT: vpslld $31, %ymm3, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT: vpslld $31, %ymm2, %ymm2 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm2, (%rdi) +; AVX2-NEXT: vpmaskmovd %ymm1, %ymm3, 32(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX512-NEXT: vpmovb2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <16 x i32>, ptr %ptr, align 32 + %sel = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %load + store <16 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v32i8(<32 x i8> %x, ptr %ptr, <32 x i1> 
%mask) { +; AVX-LABEL: test_masked_store_success_v32i8: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $7, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-NEXT: vmovdqa (%rdi), %xmm4 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm5, %xmm1 +; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rdi) +; AVX-NEXT: vmovdqa %xmm1, 16(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $7, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $7, %ymm1, %ymm1 +; AVX512-NEXT: vpmovb2m %ymm1, %k1 +; AVX512-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <32 x i8>, ptr %ptr, align 32 + %sel = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %load + store <32 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v32i16(<32 x i16> %x, ptr %ptr, <32 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v32i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm3 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm7 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX-NEXT: vpsllw $15, %xmm8, %xmm8 +; AVX-NEXT: vpsraw $15, %xmm8, %xmm8 +; AVX-NEXT: vpblendvb %xmm8, %xmm1, %xmm5, %xmm5 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX-NEXT: vpsllw $15, %xmm8, %xmm8 +; AVX-NEXT: vpsraw $15, %xmm8, %xmm8 +; AVX-NEXT: vpblendvb %xmm8, %xmm0, %xmm3, %xmm3 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpsllw $15, %xmm7, %xmm7 +; AVX-NEXT: vpsraw $15, %xmm7, %xmm7 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm7, %xmm1, %xmm6, %xmm1 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpsllw $15, %xmm2, %xmm2 +; AVX-NEXT: vpsraw $15, %xmm2, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vmovdqa %xmm3, (%rdi) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdi) +; AVX-NEXT: vmovdqa %xmm5, 32(%rdi) +; AVX-NEXT: vmovdqa %xmm1, 48(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v32i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero +; AVX2-NEXT: vpsllw $15, %ymm5, %ymm5 +; AVX2-NEXT: vpsraw $15, %ymm5, %ymm5 +; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX2-NEXT: vpsllw $15, %ymm2, %ymm2 +; AVX2-NEXT: vpsraw $15, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rdi) +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $7, %ymm1, %ymm1 +; AVX512-NEXT: vpmovb2m %ymm1, %k1 +; AVX512-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <32 x i16>, ptr %ptr, align 32 + %sel = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %load + store <32 x i16> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_v64i8(<64 x i8> %x, ptr %ptr, <64 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_v64i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %esi, %xmm2 +; AVX-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-NEXT: vpsllw $7, %xmm2, %xmm2 +; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX-NEXT: vpsllw $7, %xmm3, %xmm3 +; AVX-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; AVX-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), 
%xmm4, %xmm4 +; AVX-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX-NEXT: vpsllw $7, %xmm4, %xmm4 +; AVX-NEXT: vmovd {{.*#+}} xmm5 = mem[0],zero,zero,zero +; AVX-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm5, %xmm5 +; AVX-NEXT: vpsllw $7, %xmm5, %xmm5 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX-NEXT: vmovdqa (%rdi), %xmm7 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX-NEXT: vpblendvb %xmm5, %xmm6, %xmm10, %xmm5 +; AVX-NEXT: vpblendvb %xmm4, %xmm1, %xmm9, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX-NEXT: vpblendvb %xmm3, %xmm4, %xmm8, %xmm3 +; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm7, %xmm0 +; AVX-NEXT: vmovdqa %xmm3, 16(%rdi) +; AVX-NEXT: vmovdqa %xmm1, 32(%rdi) +; AVX-NEXT: vmovdqa %xmm5, 48(%rdi) +; AVX-NEXT: vmovdqa %xmm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX2-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX2-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $13, 
{{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vpsllw $7, %ymm2, %ymm2 +; AVX2-NEXT: vmovd %esi, %xmm3 +; AVX2-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $3, %r8d, %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $4, %r9d, %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; AVX2-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero +; AVX2-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vpsllw $7, %ymm3, %ymm3 +; AVX2-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdi) +; AVX2-NEXT: vmovdqa %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $7, %zmm1, %zmm1 +; AVX512-NEXT: vpmovb2m %zmm1, %k1 +; AVX512-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <64 x i8>, ptr %ptr, align 32 + %sel = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %load + store <64 x i8> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_invert_mask_v4i32(<4 x i32> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_invert_mask_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_invert_mask_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_invert_mask_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; 
AVX512-NEXT: vpmovd2m %xmm1, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} +; AVX512-NEXT: retq + %load = load <4 x i32>, ptr %ptr, align 32 + %sel = select <4 x i1> %mask, <4 x i32> %load, <4 x i32> %x + store <4 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_invert_mask_v8i32(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_invert_mask_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_invert_mask_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_invert_mask_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k0 +; AVX512-NEXT: knotb %k0, %k1 +; AVX512-NEXT: vmovdqa32 %ymm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %load, <8 x i32> %x + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_success_invert_mask_v16i32(<16 x i32> %x, ptr %ptr, <16 x i1> %mask) { +; AVX-LABEL: test_masked_store_success_invert_mask_v16i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX-NEXT: vpxor %xmm5, %xmm2, %xmm2 +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; AVX-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX-NEXT: vmaskmovps %ymm1, %ymm2, 32(%rdi) +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 +; AVX-NEXT: vxorps %ymm1, %ymm3, %ymm1 +; AVX-NEXT: vpslld $31, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_success_invert_mask_v16i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = 
xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpslld $31, %ymm2, %ymm2 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm2, (%rdi) +; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm0 +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, 32(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_success_invert_mask_v16i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX512-NEXT: vpmovb2m %xmm1, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <16 x i32>, ptr %ptr, align 32 + %sel = select <16 x i1> %mask, <16 x i32> %load, <16 x i32> %x + store <16 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_zextload(<4 x i64> %x, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_zextload: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vpmovsxdq %xmm1, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovapd %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_zextload: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_zextload: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <4 x i32>, ptr %ptr, align 32 + %zext = zext <4 x i32> %load to <4 x i64> + %masked = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %zext + store <4 x i64> %masked, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_load(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_volatile_load: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmovaps (%rdi), %ymm2 +; AVX-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_volatile_load: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, 
%ymm1, %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_volatile_load: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load volatile <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +define void @test_masked_store_volatile_store(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_volatile_store: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmovaps (%rdi), %ymm2 +; AVX-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_volatile_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_volatile_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + store volatile <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + +declare void @use_vec(<8 x i32>) + +define void @test_masked_store_intervening(<8 x i32> %x, ptr %ptr, <8 x i1> %mask) nounwind { +; AVX-LABEL: test_masked_store_intervening: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $32, %rsp +; AVX-NEXT: movq %rdi, %rbx +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmovaps (%rdi), %ymm2 +; AVX-NEXT: vblendvps %ymm1, %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: callq use_vec@PLT +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, (%rbx) +; AVX-NEXT: addq $32, %rsp +; AVX-NEXT: popq %rbx +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_intervening: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $32, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-NEXT: vblendvps 
%ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: callq use_vec@PLT +; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, (%rbx) +; AVX2-NEXT: addq $32, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_intervening: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $80, %rsp +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm0 +; AVX512-NEXT: vpmovw2m %xmm0, %k1 +; AVX512-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512-NEXT: vmovaps (%rdi), %ymm0 +; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %ymm0, (%rdi) +; AVX512-NEXT: callq use_vec@PLT +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, (%rbx) +; AVX512-NEXT: addq $80, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr, align 32 + store <8 x i32> zeroinitializer, ptr %ptr, align 32 + %tmp = load <8 x i32>, ptr %ptr + call void @use_vec(<8 x i32> %tmp) + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr, align 32 + ret void +} + + +define void @test_masked_store_multiple_v8i32(<8 x i32> %x, <8 x i32> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %mask, <8 x i1> %mask2) { +; AVX-LABEL: test_masked_store_multiple_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX-NEXT: vmovaps (%rsi), %ymm4 +; AVX-NEXT: vblendvps %ymm3, %ymm1, %ymm4, %ymm1 +; AVX-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) +; AVX-NEXT: vmovaps %ymm1, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_multiple_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX2-NEXT: vpslld $31, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX2-NEXT: vpslld $31, %ymm3, %ymm3 +; AVX2-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-NEXT: vblendvps %ymm3, %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm2, (%rdi) +; AVX2-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_multiple_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm2, %xmm2 +; AVX512-NEXT: vpmovw2m %xmm2, %k1 +; AVX512-NEXT: vpsllw $15, %xmm3, %xmm2 +; AVX512-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512-NEXT: vpmovw2m %xmm2, %k2 +; 
AVX512-NEXT: vmovdqa32 %ymm1, %ymm3 {%k2} +; AVX512-NEXT: vmovdqa32 %ymm0, (%rdi) {%k1} +; AVX512-NEXT: vmovdqa %ymm3, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i32>, ptr %ptr1, align 32 + %load2 = load <8 x i32>, ptr %ptr2, align 32 + %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load + %sel2 = select <8 x i1> %mask2, <8 x i32> %y, <8 x i32> %load2 + store <8 x i32> %sel, ptr %ptr1, align 32 + store <8 x i32> %sel2, ptr %ptr2, align 32 + ret void +} + +define void @test_masked_store_multiple_v8i64(<8 x i64> %x, <8 x i64> %y, ptr %ptr1, ptr %ptr2, <8 x i1> %mask, <8 x i1> %mask2) { +; AVX-LABEL: test_masked_store_multiple_v8i64: +; AVX: # %bb.0: +; AVX-NEXT: vmovapd (%rsi), %ymm6 +; AVX-NEXT: vmovapd 32(%rsi), %ymm7 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm8, %xmm8 +; AVX-NEXT: vpmovsxdq %xmm8, %xmm9 +; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm8, %xmm8 +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX-NEXT: vpmovsxdq %xmm4, %xmm9 +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm4, %xmm4 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm9, %ymm4 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm5[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm9, %xmm9 +; AVX-NEXT: vpmovsxdq %xmm9, %xmm10 +; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm9, %xmm9 +; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX-NEXT: vblendvpd %ymm9, %ymm3, %ymm7, %ymm3 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX-NEXT: vpslld $31, %xmm5, %xmm5 +; AVX-NEXT: vpmovsxdq %xmm5, %xmm7 +; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm5, %xmm5 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX-NEXT: vblendvpd %ymm5, %ymm2, %ymm6, %ymm2 +; AVX-NEXT: vmaskmovpd %ymm0, %ymm4, (%rdi) +; AVX-NEXT: vmaskmovpd %ymm1, %ymm8, 32(%rdi) +; AVX-NEXT: vmovapd %ymm3, 32(%rsi) +; AVX-NEXT: vmovapd %ymm2, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_multiple_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovapd (%rsi), %ymm6 +; AVX2-NEXT: vmovapd 32(%rsi), %ymm7 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4,4,5,5,6,6,7,7] +; AVX2-NEXT: vpslld $31, %xmm8, %xmm8 +; AVX2-NEXT: vpmovsxdq %xmm8, %ymm8 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vpslld $31, %xmm4, %xmm4 +; AVX2-NEXT: vpmovsxdq %xmm4, %ymm4 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX2-NEXT: vpslld $31, %xmm9, %xmm9 +; AVX2-NEXT: vpmovsxdq %xmm9, %ymm9 +; AVX2-NEXT: vblendvpd %ymm9, %ymm2, %ymm6, %ymm2 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; AVX2-NEXT: vpslld $31, %xmm5, %xmm5 +; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5 +; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm7, %ymm3 +; AVX2-NEXT: vpmaskmovq %ymm0, %ymm4, (%rdi) +; AVX2-NEXT: vpmaskmovq %ymm1, %ymm8, 32(%rdi) +; AVX2-NEXT: vmovapd %ymm3, 32(%rsi) +; AVX2-NEXT: vmovapd %ymm2, (%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_multiple_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm2, %xmm2 +; AVX512-NEXT: vpmovw2m %xmm2, %k1 +; AVX512-NEXT: vpsllw $15, %xmm3, %xmm2 +; AVX512-NEXT: vmovdqu64 (%rsi), %zmm3 +; AVX512-NEXT: vpmovw2m 
%xmm2, %k2 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512-NEXT: vmovdqu64 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vmovdqu64 %zmm3, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %load = load <8 x i64>, ptr %ptr1, align 32 + %load2 = load <8 x i64>, ptr %ptr2, align 32 + %sel = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %load + %sel2 = select <8 x i1> %mask2, <8 x i64> %y, <8 x i64> %load2 + store <8 x i64> %sel, ptr %ptr1, align 32 + store <8 x i64> %sel2, ptr %ptr2, align 32 + ret void +} + +define void @test_masked_store_unaligned_v4i32(<4 x i32> %data, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_unaligned_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vmaskmovps %xmm0, %xmm1, 1(%rdi) +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_unaligned_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, 1(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_unaligned_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu32 %xmm0, 1(%rdi) {%k1} +; AVX512-NEXT: retq + %ptr_i8 = getelementptr i8, ptr %ptr, i32 1 + %ptr_vec = bitcast ptr %ptr_i8 to ptr + %load = load <4 x i32>, ptr %ptr_vec, align 1 + %sel = select <4 x i1> %mask, <4 x i32> %data, <4 x i32> %load + store <4 x i32> %sel, ptr %ptr_vec, align 1 + ret void +} + +define void @test_masked_store_unaligned_v4i64(<4 x i64> %data, ptr %ptr, <4 x i1> %mask) { +; AVX-LABEL: test_masked_store_unaligned_v4i64: +; AVX: # %bb.0: +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vpmovsxdq %xmm1, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovpd %ymm0, %ymm1, 1(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_unaligned_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 +; AVX2-NEXT: vpmaskmovq %ymm0, %ymm1, 1(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_unaligned_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpmovd2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu64 %ymm0, 1(%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %ptr_i8 = getelementptr i8, ptr %ptr, i64 1 + %ptr_vec = bitcast ptr %ptr_i8 to ptr + %load = load <4 x i64>, ptr %ptr_vec, align 1 + %sel = select <4 x i1> %mask, <4 x i64> %data, <4 x i64> %load + store <4 x i64> %sel, ptr %ptr_vec, align 1 + ret void +} + +define void @test_masked_store_unaligned_v8i32(<8 x i32> %data, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_unaligned_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmaskmovps %ymm0, %ymm1, 1(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_unaligned_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, 1(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: 
test_masked_store_unaligned_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu32 %ymm0, 1(%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %ptr_i8 = getelementptr i8, ptr %ptr, i32 1 + %ptr_vec = bitcast ptr %ptr_i8 to ptr + %load = load <8 x i32>, ptr %ptr_vec, align 1 + %sel = select <8 x i1> %mask, <8 x i32> %data, <8 x i32> %load + store <8 x i32> %sel, ptr %ptr_vec, align 1 + ret void +} + +define void @test_masked_store_unaligned_v8i64(<8 x i64> %data, ptr %ptr, <8 x i1> %mask) { +; AVX-LABEL: test_masked_store_unaligned_v8i64: +; AVX: # %bb.0: +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] +; AVX-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX-NEXT: vpmovsxdq %xmm3, %xmm4 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm3, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX-NEXT: vpmovsxdq %xmm2, %xmm4 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX-NEXT: vpmovsxdq %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX-NEXT: vmaskmovpd %ymm0, %ymm2, 1(%rdi) +; AVX-NEXT: vmaskmovpd %ymm1, %ymm3, 33(%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX2-LABEL: test_masked_store_unaligned_v8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7] +; AVX2-NEXT: vpslld $31, %xmm3, %xmm3 +; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2 +; AVX2-NEXT: vpmaskmovq %ymm0, %ymm2, 1(%rdi) +; AVX2-NEXT: vpmaskmovq %ymm1, %ymm3, 33(%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_masked_store_unaligned_v8i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpmovw2m %xmm1, %k1 +; AVX512-NEXT: vmovdqu64 %zmm0, 1(%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %ptr_i8 = getelementptr i8, ptr %ptr, i64 1 + %ptr_vec = bitcast ptr %ptr_i8 to ptr + %load = load <8 x i64>, ptr %ptr_vec, align 1 + %sel = select <8 x i1> %mask, <8 x i64> %data, <8 x i64> %load + store <8 x i64> %sel, ptr %ptr_vec, align 1 + ret void +} diff --git a/llvm/test/CodeGen/X86/pr30284.ll b/llvm/test/CodeGen/X86/pr30284.ll index f4fb1b3ce72e3..708f0f7ee72da 100644 --- a/llvm/test/CodeGen/X86/pr30284.ll +++ b/llvm/test/CodeGen/X86/pr30284.ll @@ -19,14 +19,12 @@ define void @f_f___un_3C_unf_3E_un_3C_unf_3E_(<16 x i1> %x) { ; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 ; CHECK-NEXT: vpslld $31, %zmm0, %zmm0 ; CHECK-NEXT: vpmovd2m %zmm0, %k1 -; CHECK-NEXT: vmovapd 0, %zmm0 -; CHECK-NEXT: vmovapd 64, %zmm1 -; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm2 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16] +; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16] +; CHECK-NEXT: vporq 64, %zmm0, %zmm1 +; CHECK-NEXT: vporq 0, %zmm0, %zmm0 ; CHECK-NEXT: kshiftrw $8, %k1, %k2 -; CHECK-NEXT: vorpd %zmm2, %zmm1, %zmm1 {%k2} -; CHECK-NEXT: vorpd %zmm2, %zmm0, %zmm0 {%k1} -; CHECK-NEXT: vmovapd %zmm0, 0 -; CHECK-NEXT: vmovapd %zmm1, 64 +; CHECK-NEXT: vmovdqa64 %zmm0, 0 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, 64 {%k2} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl %a_load22 = load <16 x i64>, ptr null, align 1