diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index a3675eecfea3f..08db31c63367d 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -1889,6 +1889,12 @@ LLVM_ABI SDValue peekThroughExtractSubvectors(SDValue V); /// If \p V is not a truncation, it is returned as-is. LLVM_ABI SDValue peekThroughTruncates(SDValue V); +/// Recursively peek through INSERT_VECTOR_ELT nodes, returning the source +/// vector operand of \p V, as long as \p V is an INSERT_VECTOR_ELT operation +/// that does not insert into any of the demanded vector elts. +LLVM_ABI SDValue peekThroughInsertVectorElt(SDValue V, + const APInt &DemandedElts); + /// Returns true if \p V is a bitwise not operation. Assumes that an all ones /// constant is canonicalized to be operand 1. LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs = false); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 586eb2f3cf45e..09c7476919e43 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -23182,6 +23182,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { auto *IndexC = dyn_cast<ConstantSDNode>(EltNo); // Insert into out-of-bounds element is undefined. + // The code below relies on this special case being handled early. if (IndexC && VT.isFixedLengthVector() && IndexC->getZExtValue() >= VT.getVectorNumElements()) return DAG.getUNDEF(VT); @@ -23192,14 +23193,28 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1)) return InVec; - if (!IndexC) { - // If this is variable insert to undef vector, it might be better to splat: - // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... > - if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) - return DAG.getSplat(VT, DL, InVal); - return SDValue(); + // If this is variable insert to undef vector, it might be better to splat: + // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... > + if (!IndexC && InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) + return DAG.getSplat(VT, DL, InVal); + + // Try to drop insert of UNDEF/POISON elements. This is also done in getNode, + // but we also do it as a DAG combine since, for example, simplifications into + // SPLAT_VECTOR/BUILD_VECTOR may turn poison elements into undef/zero etc., and + // then suddenly the InVec is guaranteed not to be poison. + if (InVal.isUndef()) { + if (IndexC && VT.isFixedLengthVector()) { + APInt EltMask = APInt::getOneBitSet(VT.getVectorNumElements(), + IndexC->getZExtValue()); + if (DAG.isGuaranteedNotToBePoison(InVec, EltMask)) + return InVec; + } + return DAG.getFreeze(InVec); } + if (!IndexC) + return SDValue(); + if (VT.isScalableVector()) return SDValue(); @@ -27639,18 +27654,42 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { SDValue N2 = N->getOperand(2); uint64_t InsIdx = N->getConstantOperandVal(2); - // If inserting an UNDEF, just return the original vector. - if (N1.isUndef()) - return N0; + // If inserting an UNDEF, just return the original vector (unless it makes the + // result more poisonous).
+ if (N1.isUndef()) { + if (N1.getOpcode() == ISD::POISON) + return N0; + if (VT.isFixedLengthVector()) { + unsigned SubVecNumElts = N1.getValueType().getVectorNumElements(); + APInt EltMask = APInt::getBitsSet(VT.getVectorNumElements(), InsIdx, + InsIdx + SubVecNumElts); + if (DAG.isGuaranteedNotToBePoison(N0, EltMask)) + return N0; + } + return DAG.getFreeze(N0); + } - // If this is an insert of an extracted vector into an undef vector, we can - // just use the input to the extract if the types match, and can simplify + // If this is an insert of an extracted vector into an undef/poison vector, we + // can just use the input to the extract if the types match, and can simplify // in some cases even if they don't. if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && N1.getOperand(1) == N2) { + EVT N1VT = N1.getValueType(); EVT SrcVT = N1.getOperand(0).getValueType(); - if (SrcVT == VT) - return N1.getOperand(0); + if (SrcVT == VT) { + // Need to ensure that the result isn't more poisonous if skipping both the + // extract+insert. + if (N0.getOpcode() == ISD::POISON) + return N1.getOperand(0); + if (VT.isFixedLengthVector() && N1VT.isFixedLengthVector()) { + unsigned SubVecNumElts = N1VT.getVectorNumElements(); + APInt EltMask = APInt::getBitsSet(VT.getVectorNumElements(), InsIdx, + InsIdx + SubVecNumElts); + if (DAG.isGuaranteedNotToBePoison(N1.getOperand(0), ~EltMask)) + return N1.getOperand(0); + } else if (DAG.isGuaranteedNotToBePoison(N1.getOperand(0))) + return N1.getOperand(0); + } // TODO: To remove the zero check, need to adjust the offset to // a multiple of the new src type. if (isNullConstant(N2)) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 6df21b624137f..17fe550d38c55 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5456,6 +5456,60 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op, } return true; + case ISD::INSERT_SUBVECTOR: { + if (Op.getValueType().isScalableVector()) + break; + SDValue Src = Op.getOperand(0); + SDValue Sub = Op.getOperand(1); + uint64_t Idx = Op.getConstantOperandVal(2); + unsigned NumSubElts = Sub.getValueType().getVectorNumElements(); + APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx); + APInt DemandedSrcElts = DemandedElts; + DemandedSrcElts.clearBits(Idx, Idx + NumSubElts); + + if (!!DemandedSubElts && !isGuaranteedNotToBeUndefOrPoison( + Sub, DemandedSubElts, PoisonOnly, Depth + 1)) + return false; + if (!!DemandedSrcElts && !isGuaranteedNotToBeUndefOrPoison( + Src, DemandedSrcElts, PoisonOnly, Depth + 1)) + return false; + return true; + } + + case ISD::INSERT_VECTOR_ELT: { + SDValue InVec = Op.getOperand(0); + SDValue InVal = Op.getOperand(1); + SDValue EltNo = Op.getOperand(2); + EVT VT = InVec.getValueType(); + auto *IndexC = dyn_cast<ConstantSDNode>(EltNo); + if (IndexC && VT.isFixedLengthVector() && + IndexC->getZExtValue() < VT.getVectorNumElements()) { + if (DemandedElts[IndexC->getZExtValue()] && + !isGuaranteedNotToBeUndefOrPoison(InVal, PoisonOnly, Depth + 1)) + return false; + APInt InVecDemandedElts = DemandedElts; + InVecDemandedElts.clearBit(IndexC->getZExtValue()); + if (!!InVecDemandedElts && + !isGuaranteedNotToBeUndefOrPoison( + peekThroughInsertVectorElt(InVec, InVecDemandedElts), + InVecDemandedElts, PoisonOnly, Depth + 1)) + return false; + return true; + } + break; + } + + case ISD::SCALAR_TO_VECTOR: + // Check upper (known undef) elements.
+ if (DemandedElts.ugt(1) && !PoisonOnly) + return false; + // Check element zero. + if (DemandedElts[0] && !isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), + PoisonOnly, + Depth + 1)) + return false; + return true; + case ISD::SPLAT_VECTOR: return isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), PoisonOnly, Depth + 1); @@ -7952,23 +8006,42 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF, except // for scalable vectors where we will generate appropriate code to // deal with out-of-bounds cases correctly. - if (N3C && N1.getValueType().isFixedLengthVector() && - N3C->getZExtValue() >= N1.getValueType().getVectorNumElements()) + if (N3C && VT.isFixedLengthVector() && + N3C->getZExtValue() >= VT.getVectorNumElements()) return getUNDEF(VT); // Undefined index can be assumed out-of-bounds, so that's UNDEF too. if (N3.isUndef()) return getUNDEF(VT); - // If the inserted element is an UNDEF, just use the input vector. - if (N2.isUndef()) + // If inserting poison, just use the input vector. + if (N2.getOpcode() == ISD::POISON) return N1; + // Inserting undef into undef/poison is still undef. + if (N2.getOpcode() == ISD::UNDEF && N1.isUndef()) + return getUNDEF(VT); + + // If the inserted element is an UNDEF, just use the input vector. + // But not if skipping the insert could make the result more poisonous. + if (N2.isUndef()) { + if (N3C && VT.isFixedLengthVector()) { + APInt EltMask = + APInt::getOneBitSet(VT.getVectorNumElements(), N3C->getZExtValue()); + if (isGuaranteedNotToBePoison(N1, EltMask)) + return N1; + } else if (isGuaranteedNotToBePoison(N1)) + return N1; + } break; } case ISD::INSERT_SUBVECTOR: { - // Inserting undef into undef is still undef. - if (N1.isUndef() && N2.isUndef()) + // If inserting poison, just use the input vector. + if (N2.getOpcode() == ISD::POISON) + return N1; + + // Inserting undef into undef/poison is still undef. + if (N2.getOpcode() == ISD::UNDEF && N1.isUndef()) return getUNDEF(VT); EVT N2VT = N2.getValueType(); @@ -7997,11 +8070,37 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (VT == N2VT) return N2; - // If this is an insert of an extracted vector into an undef vector, we - // can just use the input to the extract. + // If this is an insert of an extracted vector into an undef/poison vector, + // we can just use the input to the extract. But not if skipping the + // extract+insert could make the result more poisonous. if (N1.isUndef() && N2.getOpcode() == ISD::EXTRACT_SUBVECTOR && - N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT) - return N2.getOperand(0); + N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT) { + if (N1.getOpcode() == ISD::POISON) + return N2.getOperand(0); + if (VT.isFixedLengthVector() && N2VT.isFixedLengthVector()) { + unsigned LoBit = N3->getAsZExtVal(); + unsigned HiBit = LoBit + N2VT.getVectorNumElements(); + APInt EltMask = + APInt::getBitsSet(VT.getVectorNumElements(), LoBit, HiBit); + if (isGuaranteedNotToBePoison(N2.getOperand(0), ~EltMask)) + return N2.getOperand(0); + } else if (isGuaranteedNotToBePoison(N2.getOperand(0))) + return N2.getOperand(0); + } + + // If the inserted subvector is UNDEF, just use the input vector. + // But not if skipping the insert could make the result more poisonous.
+ if (N2.isUndef()) { + if (VT.isFixedLengthVector()) { + unsigned LoBit = N3->getAsZExtVal(); + unsigned HiBit = LoBit + N2VT.getVectorNumElements(); + APInt EltMask = + APInt::getBitsSet(VT.getVectorNumElements(), LoBit, HiBit); + if (isGuaranteedNotToBePoison(N1, EltMask)) + return N1; + } else if (isGuaranteedNotToBePoison(N1)) + return N1; + } break; } case ISD::BITCAST: @@ -12463,6 +12562,23 @@ SDValue llvm::peekThroughTruncates(SDValue V) { return V; } +SDValue llvm::peekThroughInsertVectorElt(SDValue V, const APInt &DemandedElts) { + while (V.getOpcode() == ISD::INSERT_VECTOR_ELT) { + SDValue InVec = V.getOperand(0); + SDValue EltNo = V.getOperand(2); + EVT VT = InVec.getValueType(); + auto *IndexC = dyn_cast<ConstantSDNode>(EltNo); + if (IndexC && VT.isFixedLengthVector() && + IndexC->getZExtValue() < VT.getVectorNumElements() && + !DemandedElts[IndexC->getZExtValue()]) { + V = InVec; + continue; + } + break; + } + return V; +} + bool llvm::isBitwiseNot(SDValue V, bool AllowUndefs) { if (V.getOpcode() != ISD::XOR) return false; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 000f8cc6786a5..4775b2501f8a5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3433,8 +3433,8 @@ bool TargetLowering::SimplifyDemandedVectorElts( break; } case ISD::INSERT_SUBVECTOR: { - // Demand any elements from the subvector and the remainder from the src its - // inserted into. + // Demand any elements from the subvector and the remainder from the src it + // is inserted into. SDValue Src = Op.getOperand(0); SDValue Sub = Op.getOperand(1); uint64_t Idx = Op.getConstantOperandVal(2); @@ -3443,6 +3443,10 @@ bool TargetLowering::SimplifyDemandedVectorElts( APInt DemandedSrcElts = DemandedElts; DemandedSrcElts.clearBits(Idx, Idx + NumSubElts); + // If none of the sub operand elements are demanded, bypass the insert.
+ if (!DemandedSubElts) + return TLO.CombineTo(Op, Src); + APInt SubUndef, SubZero; if (SimplifyDemandedVectorElts(Sub, DemandedSubElts, SubUndef, SubZero, TLO, Depth + 1)) diff --git a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll index 82802c79c7085..c6fff3e3d3181 100644 --- a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll +++ b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll @@ -57,8 +57,8 @@ define void @widen_f16_build_vector(ptr %addr) { ; CHECK-LABEL: widen_f16_build_vector: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #13294 // =0x33ee -; CHECK-NEXT: movk w8, #13294, lsl #16 -; CHECK-NEXT: str w8, [x0] +; CHECK-NEXT: dup v0.4h, w8 +; CHECK-NEXT: str s0, [x0] ; CHECK-NEXT: ret store <2 x half> , ptr %addr, align 2 ret void diff --git a/llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll b/llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll index 34899cb47dba3..545da98034527 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll @@ -94,16 +94,14 @@ define i32 @combine_undef_add_8xi32(i32 %a, i32 %b, i32 %c, i32 %d) local_unname ; CHECK-LABEL: combine_undef_add_8xi32: ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: dup v0.4s, w8 ; CHECK-NEXT: mov v1.s[1], w1 -; CHECK-NEXT: uhadd v0.4h, v0.4h, v0.4h ; CHECK-NEXT: mov v1.s[2], w2 ; CHECK-NEXT: mov v1.s[3], w3 -; CHECK-NEXT: xtn v2.4h, v1.4s -; CHECK-NEXT: shrn v1.4h, v1.4s, #16 -; CHECK-NEXT: uhadd v1.4h, v2.4h, v1.4h -; CHECK-NEXT: mov v1.d[1], v0.d[0] -; CHECK-NEXT: uaddlv s0, v1.8h +; CHECK-NEXT: uzp2 v2.8h, v1.8h, v0.8h +; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-NEXT: uhadd v0.8h, v0.8h, v2.8h +; CHECK-NEXT: uaddlv s0, v0.8h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %a1 = insertelement <8 x i32> poison, i32 %a, i32 0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll index 9efe0b33910c8..2905d707bdd09 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll @@ -37,6 +37,10 @@ define void @select_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -59,8 +63,15 @@ define void @select_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h ; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z2.h, z3.h -; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z1.h -; VBITS_GE_256-NEXT: sel z1.h, p2, z2.h, z3.h +; VBITS_GE_256-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: mov z5.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1 +; VBITS_GE_256-NEXT: and z5.h, z5.h, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.h, p1/z, z4.h, #0 +; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z5.h, #0 +; VBITS_GE_256-NEXT: sel z0.h, p2, z0.h, z1.h +; VBITS_GE_256-NEXT: sel z1.h, p1, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -71,6 +82,10 @@ 
define void @select_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: and z2.h, z2.h, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -89,6 +104,10 @@ define void @select_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -107,6 +126,10 @@ define void @select_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -150,6 +173,10 @@ define void @select_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -172,8 +199,15 @@ define void @select_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s ; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z2.s, z3.s -; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z1.s -; VBITS_GE_256-NEXT: sel z1.s, p2, z2.s, z3.s +; VBITS_GE_256-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: mov z5.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1 +; VBITS_GE_256-NEXT: and z5.s, z5.s, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z4.s, #0 +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z5.s, #0 +; VBITS_GE_256-NEXT: sel z0.s, p2, z0.s, z1.s +; VBITS_GE_256-NEXT: sel z1.s, p1, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -184,6 +218,10 @@ define void @select_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: and z2.s, z2.s, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -202,6 +240,10 @@ define void @select_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // 
=0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -220,6 +262,10 @@ define void @select_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -264,6 +310,10 @@ define void @select_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -286,8 +336,15 @@ define void @select_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d ; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, z3.d -; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_256-NEXT: sel z1.d, p2, z2.d, z3.d +; VBITS_GE_256-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: mov z5.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and z4.d, z4.d, #0x1 +; VBITS_GE_256-NEXT: and z5.d, z5.d, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: sel z0.d, p2, z0.d, z1.d +; VBITS_GE_256-NEXT: sel z1.d, p1, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -298,6 +355,10 @@ define void @select_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: and z2.d, z2.d, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -316,6 +377,10 @@ define void @select_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -334,6 +399,10 @@ define void @select_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll index 9cebbc4aab9b7..0e95da31c13cc 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll @@ -36,6 +36,10 @@ define void @select_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b +; CHECK-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: and z2.b, z2.b, #0x1 +; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0 ; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -58,8 +62,15 @@ define void @select_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b ; VBITS_GE_256-NEXT: cmpeq p2.b, p0/z, z2.b, z3.b -; VBITS_GE_256-NEXT: sel z0.b, p1, z0.b, z1.b -; VBITS_GE_256-NEXT: sel z1.b, p2, z2.b, z3.b +; VBITS_GE_256-NEXT: mov z4.b, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.b +; VBITS_GE_256-NEXT: mov z5.b, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and z4.b, z4.b, #0x1 +; VBITS_GE_256-NEXT: and z5.b, z5.b, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.b, p1/z, z4.b, #0 +; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z5.b, #0 +; VBITS_GE_256-NEXT: sel z0.b, p2, z0.b, z1.b +; VBITS_GE_256-NEXT: sel z1.b, p1, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -70,6 +81,10 @@ define void @select_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b +; VBITS_GE_512-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.b +; VBITS_GE_512-NEXT: and z2.b, z2.b, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.b, p1/z, z2.b, #0 ; VBITS_GE_512-NEXT: sel z0.b, p1, z0.b, z1.b ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -88,6 +103,10 @@ define void @select_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b +; CHECK-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: and z2.b, z2.b, #0x1 +; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0 ; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -106,6 +125,10 @@ define void @select_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b +; CHECK-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: and z2.b, z2.b, #0x1 +; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0 ; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -149,6 +172,10 @@ define void @select_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -171,8 +198,15 @@ define void @select_v32i16(ptr %a, ptr %b) #0 { ; 
VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h ; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z2.h, z3.h -; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z1.h -; VBITS_GE_256-NEXT: sel z1.h, p2, z2.h, z3.h +; VBITS_GE_256-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: mov z5.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1 +; VBITS_GE_256-NEXT: and z5.h, z5.h, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.h, p1/z, z4.h, #0 +; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z5.h, #0 +; VBITS_GE_256-NEXT: sel z0.h, p2, z0.h, z1.h +; VBITS_GE_256-NEXT: sel z1.h, p1, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -183,6 +217,10 @@ define void @select_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: and z2.h, z2.h, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -201,6 +239,10 @@ define void @select_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -219,6 +261,10 @@ define void @select_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -262,6 +308,10 @@ define void @select_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -284,8 +334,15 @@ define void @select_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s ; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z2.s, z3.s -; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z1.s -; VBITS_GE_256-NEXT: sel z1.s, p2, z2.s, z3.s +; VBITS_GE_256-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: mov z5.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1 +; VBITS_GE_256-NEXT: and z5.s, z5.s, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z4.s, #0 +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z5.s, #0 +; VBITS_GE_256-NEXT: sel z0.s, p2, z0.s, z1.s +; VBITS_GE_256-NEXT: sel z1.s, p1, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] ; 
VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -296,6 +353,10 @@ define void @select_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: and z2.s, z2.s, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -314,6 +375,10 @@ define void @select_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -332,6 +397,10 @@ define void @select_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -376,6 +445,10 @@ define void @select_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -398,8 +471,15 @@ define void @select_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d ; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z2.d, z3.d -; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_256-NEXT: sel z1.d, p2, z2.d, z3.d +; VBITS_GE_256-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: mov z5.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: and z4.d, z4.d, #0x1 +; VBITS_GE_256-NEXT: and z5.d, z5.d, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: sel z0.d, p2, z0.d, z1.d +; VBITS_GE_256-NEXT: sel z1.d, p1, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -410,6 +490,10 @@ define void @select_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: and z2.d, z2.d, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -428,6 +512,10 @@ define void @select_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; 
CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -446,6 +534,10 @@ define void @select_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index 093e6cd9328c8..ebd32c73ec65b 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -1198,11 +1198,15 @@ define void @masked_gather_passthru(ptr %a, ptr %b, ptr %c) vscale_range(16,0) # ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ptrue p2.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x2] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] ; CHECK-NEXT: punpklo p2.h, p1.b +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ld1w { z0.d }, p2/z, [z0.d] +; CHECK-NEXT: and z1.s, z1.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x2] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll index ec0693a541e44..8b845dff64ffe 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -199,6 +199,13 @@ define void @select_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: and z4.h, z4.h, #0x1 +; CHECK-NEXT: and z5.h, z5.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p0/z, z4.h, #0 +; CHECK-NEXT: cmpne p0.h, p0/z, z5.h, #0 ; CHECK-NEXT: mov z0.h, p1/m, z1.h ; CHECK-NEXT: sel z1.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -434,6 +441,13 @@ define void @select_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z0.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z2.s, z3.s +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z4.s, z4.s, #0x1 +; CHECK-NEXT: and z5.s, z5.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p0/z, z4.s, #0 +; CHECK-NEXT: cmpne p0.s, p0/z, z5.s, #0 ; CHECK-NEXT: mov z0.s, p1/m, z1.s ; CHECK-NEXT: sel z1.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -558,6 +572,13 @@ define void @select_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z0.d ; CHECK-NEXT: fcmeq p0.d, p0/z, z2.d, z3.d +; CHECK-NEXT: mov z4.d, p1/z, #-1 // 
=0xffffffffffffffff +; CHECK-NEXT: mov z5.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z4.d, z4.d, #0x1 +; CHECK-NEXT: and z5.d, z5.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p0/z, z4.d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z5.d, #0 ; CHECK-NEXT: mov z0.d, p1/m, z1.d ; CHECK-NEXT: sel z1.d, p0, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll index 39701131d7db6..12b7886d76c70 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll @@ -293,6 +293,13 @@ define void @select_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.b, p0/z, z1.b, z0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b +; CHECK-NEXT: mov z4.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: and z4.b, z4.b, #0x1 +; CHECK-NEXT: and z5.b, z5.b, #0x1 +; CHECK-NEXT: cmpne p1.b, p0/z, z4.b, #0 +; CHECK-NEXT: cmpne p0.b, p0/z, z5.b, #0 ; CHECK-NEXT: mov z0.b, p1/m, z1.b ; CHECK-NEXT: sel z1.b, p0, z2.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] @@ -697,6 +704,13 @@ define void @select_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: and z4.h, z4.h, #0x1 +; CHECK-NEXT: and z5.h, z5.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p0/z, z4.h, #0 +; CHECK-NEXT: cmpne p0.h, p0/z, z5.h, #0 ; CHECK-NEXT: mov z0.h, p1/m, z1.h ; CHECK-NEXT: sel z1.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -911,6 +925,13 @@ define void @select_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z3.s +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z4.s, z4.s, #0x1 +; CHECK-NEXT: and z5.s, z5.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p0/z, z4.s, #0 +; CHECK-NEXT: cmpne p0.s, p0/z, z5.s, #0 ; CHECK-NEXT: mov z0.s, p1/m, z1.s ; CHECK-NEXT: sel z1.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -1044,6 +1065,13 @@ define void @select_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z0.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z3.d +; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z4.d, z4.d, #0x1 +; CHECK-NEXT: and z5.d, z5.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p0/z, z4.d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z5.d, #0 ; CHECK-NEXT: mov z0.d, p1/m, z1.d ; CHECK-NEXT: sel z1.d, p0, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll index aba9056c78cda..5aa3a246d7616 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d 
-verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING -; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING,NO_FOLDING1 +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING,NO_FOLDING2 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFHMIN ; Check that the default value enables the web folding and @@ -8,20 +8,35 @@ ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a, <2 x half> %b, <2 x half> %b2) { -; NO_FOLDING-LABEL: vfwmul_v2f116_multiple_users: -; NO_FOLDING: # %bb.0: -; NO_FOLDING-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; NO_FOLDING-NEXT: vfwcvt.f.f.v v11, v8 -; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9 -; NO_FOLDING-NEXT: vfwcvt.f.f.v v9, v10 -; NO_FOLDING-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; NO_FOLDING-NEXT: vfmul.vv v10, v11, v8 -; NO_FOLDING-NEXT: vfadd.vv v11, v11, v9 -; NO_FOLDING-NEXT: vfsub.vv v8, v8, v9 -; NO_FOLDING-NEXT: vse32.v v10, (a0) -; NO_FOLDING-NEXT: vse32.v v11, (a1) -; NO_FOLDING-NEXT: vse32.v v8, (a2) -; NO_FOLDING-NEXT: ret +; NO_FOLDING1-LABEL: vfwmul_v2f116_multiple_users: +; NO_FOLDING1: # %bb.0: +; NO_FOLDING1-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; NO_FOLDING1-NEXT: vfwcvt.f.f.v v11, v8 +; NO_FOLDING1-NEXT: vfwcvt.f.f.v v8, v9 +; NO_FOLDING1-NEXT: vfwcvt.f.f.v v9, v10 +; NO_FOLDING1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; NO_FOLDING1-NEXT: vfmul.vv v10, v11, v8 +; NO_FOLDING1-NEXT: vfadd.vv v11, v11, v9 +; NO_FOLDING1-NEXT: vfsub.vv v8, v8, v9 +; NO_FOLDING1-NEXT: vse32.v v10, (a0) +; NO_FOLDING1-NEXT: vse32.v v11, (a1) +; NO_FOLDING1-NEXT: vse32.v v8, (a2) +; NO_FOLDING1-NEXT: ret +; +; NO_FOLDING2-LABEL: vfwmul_v2f116_multiple_users: +; NO_FOLDING2: # %bb.0: +; NO_FOLDING2-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; NO_FOLDING2-NEXT: vfwcvt.f.f.v v11, v8 +; NO_FOLDING2-NEXT: vfwcvt.f.f.v v8, v9 +; NO_FOLDING2-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; NO_FOLDING2-NEXT: vfmul.vv v9, v11, v8 +; NO_FOLDING2-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; NO_FOLDING2-NEXT: vfwadd.wv v11, v11, v10 +; NO_FOLDING2-NEXT: vfwsub.wv v8, v8, v10 +; NO_FOLDING2-NEXT: vse32.v v9, (a0) +; NO_FOLDING2-NEXT: vse32.v v11, (a1) +; NO_FOLDING2-NEXT: vse32.v v8, (a2) +; NO_FOLDING2-NEXT: ret ; ; ZVFH-LABEL: vfwmul_v2f116_multiple_users: ; ZVFH: # %bb.0: @@ -61,20 +76,35 @@ define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a, } define void @vfwmul_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a, <2 x float> %b, <2 x float> %b2) { -; NO_FOLDING-LABEL: vfwmul_v2f32_multiple_users: -; NO_FOLDING: # %bb.0: -; NO_FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; NO_FOLDING-NEXT: vfwcvt.f.f.v v11, v8 -; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9 -; NO_FOLDING-NEXT: vfwcvt.f.f.v v9, v10 -; NO_FOLDING-NEXT: vsetvli 
zero, zero, e64, m1, ta, ma -; NO_FOLDING-NEXT: vfmul.vv v10, v11, v8 -; NO_FOLDING-NEXT: vfadd.vv v11, v11, v9 -; NO_FOLDING-NEXT: vfsub.vv v8, v8, v9 -; NO_FOLDING-NEXT: vse64.v v10, (a0) -; NO_FOLDING-NEXT: vse64.v v11, (a1) -; NO_FOLDING-NEXT: vse64.v v8, (a2) -; NO_FOLDING-NEXT: ret +; NO_FOLDING1-LABEL: vfwmul_v2f32_multiple_users: +; NO_FOLDING1: # %bb.0: +; NO_FOLDING1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; NO_FOLDING1-NEXT: vfwcvt.f.f.v v11, v8 +; NO_FOLDING1-NEXT: vfwcvt.f.f.v v8, v9 +; NO_FOLDING1-NEXT: vfwcvt.f.f.v v9, v10 +; NO_FOLDING1-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; NO_FOLDING1-NEXT: vfmul.vv v10, v11, v8 +; NO_FOLDING1-NEXT: vfadd.vv v11, v11, v9 +; NO_FOLDING1-NEXT: vfsub.vv v8, v8, v9 +; NO_FOLDING1-NEXT: vse64.v v10, (a0) +; NO_FOLDING1-NEXT: vse64.v v11, (a1) +; NO_FOLDING1-NEXT: vse64.v v8, (a2) +; NO_FOLDING1-NEXT: ret +; +; NO_FOLDING2-LABEL: vfwmul_v2f32_multiple_users: +; NO_FOLDING2: # %bb.0: +; NO_FOLDING2-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; NO_FOLDING2-NEXT: vfwcvt.f.f.v v11, v8 +; NO_FOLDING2-NEXT: vfwcvt.f.f.v v8, v9 +; NO_FOLDING2-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; NO_FOLDING2-NEXT: vfmul.vv v9, v11, v8 +; NO_FOLDING2-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; NO_FOLDING2-NEXT: vfwadd.wv v11, v11, v10 +; NO_FOLDING2-NEXT: vfwsub.wv v8, v8, v10 +; NO_FOLDING2-NEXT: vse64.v v9, (a0) +; NO_FOLDING2-NEXT: vse64.v v11, (a1) +; NO_FOLDING2-NEXT: vse64.v v8, (a2) +; NO_FOLDING2-NEXT: ret ; ; FOLDING-LABEL: vfwmul_v2f32_multiple_users: ; FOLDING: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll index 227a428831b60..b093e9e35edad 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING1 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING1 +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING2 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING2 ; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING ; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING ; Check that the default value enables the web folding and @@ -16,21 +16,38 @@ ; We need the web size to be at least 3 for the folding 
to happen, because ; %c has 3 uses. define <2 x i16> @vwmul_v2i16_multiple_users(ptr %x, ptr %y, ptr %z) { -; NO_FOLDING-LABEL: vwmul_v2i16_multiple_users: -; NO_FOLDING: # %bb.0: -; NO_FOLDING-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; NO_FOLDING-NEXT: vle8.v v8, (a0) -; NO_FOLDING-NEXT: vle8.v v9, (a1) -; NO_FOLDING-NEXT: vle8.v v10, (a2) -; NO_FOLDING-NEXT: vsext.vf2 v11, v8 -; NO_FOLDING-NEXT: vsext.vf2 v8, v9 -; NO_FOLDING-NEXT: vsext.vf2 v9, v10 -; NO_FOLDING-NEXT: vmul.vv v8, v11, v8 -; NO_FOLDING-NEXT: vadd.vv v10, v11, v9 -; NO_FOLDING-NEXT: vsub.vv v9, v11, v9 -; NO_FOLDING-NEXT: vor.vv v8, v8, v10 -; NO_FOLDING-NEXT: vor.vv v8, v8, v9 -; NO_FOLDING-NEXT: ret +; NO_FOLDING1-LABEL: vwmul_v2i16_multiple_users: +; NO_FOLDING1: # %bb.0: +; NO_FOLDING1-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; NO_FOLDING1-NEXT: vle8.v v8, (a0) +; NO_FOLDING1-NEXT: vle8.v v9, (a1) +; NO_FOLDING1-NEXT: vle8.v v10, (a2) +; NO_FOLDING1-NEXT: vsext.vf2 v11, v8 +; NO_FOLDING1-NEXT: vsext.vf2 v8, v9 +; NO_FOLDING1-NEXT: vsext.vf2 v9, v10 +; NO_FOLDING1-NEXT: vmul.vv v8, v11, v8 +; NO_FOLDING1-NEXT: vadd.vv v10, v11, v9 +; NO_FOLDING1-NEXT: vsub.vv v9, v11, v9 +; NO_FOLDING1-NEXT: vor.vv v8, v8, v10 +; NO_FOLDING1-NEXT: vor.vv v8, v8, v9 +; NO_FOLDING1-NEXT: ret +; +; NO_FOLDING2-LABEL: vwmul_v2i16_multiple_users: +; NO_FOLDING2: # %bb.0: +; NO_FOLDING2-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; NO_FOLDING2-NEXT: vle8.v v8, (a0) +; NO_FOLDING2-NEXT: vle8.v v9, (a1) +; NO_FOLDING2-NEXT: vle8.v v10, (a2) +; NO_FOLDING2-NEXT: vsext.vf2 v11, v8 +; NO_FOLDING2-NEXT: vsext.vf2 v8, v9 +; NO_FOLDING2-NEXT: vmul.vv v8, v11, v8 +; NO_FOLDING2-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; NO_FOLDING2-NEXT: vwadd.wv v9, v11, v10 +; NO_FOLDING2-NEXT: vwsub.wv v11, v11, v10 +; NO_FOLDING2-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; NO_FOLDING2-NEXT: vor.vv v8, v8, v9 +; NO_FOLDING2-NEXT: vor.vv v8, v8, v11 +; NO_FOLDING2-NEXT: ret ; ; FOLDING-LABEL: vwmul_v2i16_multiple_users: ; FOLDING: # %bb.0: diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll index 2d1b7fcbf0239..ce03f8fad4a19 100644 --- a/llvm/test/CodeGen/X86/pr62286.ll +++ b/llvm/test/CodeGen/X86/pr62286.ll @@ -28,8 +28,9 @@ define i64 @PR62286(i32 %a) { ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 @@ -58,12 +59,13 @@ define i64 @PR62286(i32 %a) { ; AVX512-LABEL: PR62286: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovd %edi, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] -; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512-NEXT: movw $4369, %ax # imm = 0x1111 +; AVX512-NEXT: movb $8, %al ; AVX512-NEXT: kmovd %eax, %k1 -; AVX512-NEXT: vpaddd %zmm0, %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vpmovsxdq %ymm1, %zmm0 +; AVX512-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, 
%zmm0, %ymm1 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index efa6c16fbf4eb..91d3bdc68434a 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3572,68 +3572,78 @@ define void @SpinningCube() { ; SSE2-LABEL: SpinningCube: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; SSE2-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0] -; SSE2-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSE2-NEXT: movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u] -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE2-NEXT: xorps %xmm3, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] -; SSE2-NEXT: addps %xmm3, %xmm1 -; SSE2-NEXT: movaps %xmm1, (%rax) -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: addps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, (%rax) +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE2-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; SSE2-NEXT: movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u] +; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] +; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0] +; SSE2-NEXT: addps %xmm0, %xmm3 +; SSE2-NEXT: movaps %xmm3, (%rax) +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: addps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, (%rax) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: SpinningCube: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0] -; SSSE3-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSSE3-NEXT: movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u] -; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSSE3-NEXT: xorps %xmm3, %xmm3 -; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] -; SSSE3-NEXT: addps %xmm3, %xmm1 -; SSSE3-NEXT: movaps %xmm1, (%rax) -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2] -; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: addps %xmm0, %xmm1 -; SSSE3-NEXT: movaps %xmm1, (%rax) +; SSSE3-NEXT: xorps %xmm0, %xmm0 +; SSSE3-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1] +; SSSE3-NEXT: xorps %xmm2, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSSE3-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; SSSE3-NEXT: movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u] +; SSSE3-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0] +; SSSE3-NEXT: addps %xmm0, %xmm3 +; SSSE3-NEXT: 
movaps %xmm3, (%rax) +; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: addps %xmm2, %xmm0 +; SSSE3-NEXT: movaps %xmm0, (%rax) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: SpinningCube: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0] -; SSE41-NEXT: movaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u] -; SSE41-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSE41-NEXT: movaps %xmm1, %xmm3 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0] -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3] -; SSE41-NEXT: addps %xmm3, %xmm4 -; SSE41-NEXT: movaps %xmm4, (%rax) -; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,2] -; SSE41-NEXT: mulps %xmm1, %xmm2 -; SSE41-NEXT: addps %xmm0, %xmm2 -; SSE41-NEXT: movaps %xmm2, (%rax) +; SSE41-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] +; SSE41-NEXT: insertps {{.*#+}} xmm1 = zero,zero,zero,xmm0[0] +; SSE41-NEXT: movaps {{.*#+}} xmm2 = [0.0E+0,0.0E+0,-2.0E+0,u] +; SSE41-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; SSE41-NEXT: movaps %xmm2, %xmm4 +; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[0] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; SSE41-NEXT: addps %xmm4, %xmm0 +; SSE41-NEXT: movaps %xmm0, (%rax) +; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; SSE41-NEXT: mulps %xmm2, %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm0, (%rax) ; SSE41-NEXT: retq ; ; AVX-LABEL: SpinningCube: ; AVX: # %bb.0: # %entry ; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0.0E+0,0.0E+0,0.0E+0,1.0E+0] ; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u] ; AVX-NEXT: vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0] -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1,2,3] +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0],xmm2[0],zero,xmm4[3] ; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vmovaps %xmm2, (%rax) ; AVX-NEXT: vbroadcastss (%rax), %xmm2