@@ -10913,9 +10913,48 @@ SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
                      Cmp.getValue(1));
 }
 
+/// Emit a vector comparison for floating-point values, producing a mask.
+static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
+                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
+                                    const SDLoc &dl, SelectionDAG &DAG) {
+  EVT SrcVT = LHS.getValueType();
+  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
+         "function only supposed to emit natural comparisons");
+
+  switch (CC) {
+  default:
+    return SDValue();
+  case AArch64CC::NE: {
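+    // There is no vector FCMNE; emit FCMEQ and invert the mask. The
+    // inversion also makes unordered (NaN) operands compare as not-equal.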
+    SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
+    return DAG.getNOT(dl, Fcmeq, VT);
+  }
+  case AArch64CC::EQ:
+    return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
+  case AArch64CC::GE:
+    return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
+  case AArch64CC::GT:
+    return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
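+  // With FP flags, LE and LT are true on unordered operands, while the
+  // swapped FCMGE/FCMGT used for LS/MI are ordered comparisons, so the
+  // substitution below is only valid when NaNs can be ignored.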
+  case AArch64CC::LE:
+    if (!NoNans)
+      return SDValue();
+    // If we ignore NaNs then we can use the LS implementation.
+    [[fallthrough]];
+  case AArch64CC::LS:
+    return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
+  case AArch64CC::LT:
+    if (!NoNans)
+      return SDValue();
+    // If we ignore NaNs then we can use the MI implementation.
+    [[fallthrough]];
+  case AArch64CC::MI:
+    return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
+  }
+}
+
 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
                                               SDValue RHS, SDValue TVal,
-                                              SDValue FVal, const SDLoc &dl,
+                                              SDValue FVal, bool HasNoNaNs,
+                                              const SDLoc &dl,
                                               SelectionDAG &DAG) const {
   // Handle f128 first, because it will result in a comparison of some RTLIB
   // call result against zero.
@@ -11099,6 +11138,29 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
          LHS.getValueType() == MVT::f64);
   assert(LHS.getValueType() == RHS.getValueType());
   EVT VT = TVal.getValueType();
+
+  // If the purpose of the comparison is to select between all ones
+  // or all zeros, use a vector comparison because the operands are already
+  // stored in SIMD registers.
+  auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
+  auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
+  if (Subtarget->isNeonAvailable() &&
+      (VT.getSizeInBits() == LHS.getValueType().getSizeInBits()) && CTVal &&
+      CFVal &&
+      ((CTVal->isAllOnes() && CFVal->isZero()) ||
+       (CTVal->isZero() && CFVal->isAllOnes()))) {
+    AArch64CC::CondCode CC1;
+    AArch64CC::CondCode CC2;
+    bool ShouldInvert = false;
+    changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
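+    // The comparison produces the CC1 mask, which is the complement of the
+    // CC mask when ShouldInvert is set. If that polarity disagrees with
+    // which constant is all-ones, swap the select values to compensate.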
+    if (CTVal->isZero() ^ ShouldInvert)
+      std::swap(TVal, FVal);
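+    // Either the global no-NaNs option or this node's fast-math flag lets
+    // us ignore NaNs, enabling the LE/LT lowerings above.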
+    bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs;
+    SDValue Res = EmitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, dl, DAG);
+    if (Res)
+      return Res;
+  }
+
   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
 
   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
@@ -11185,15 +11247,17 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
   SDValue RHS = Op.getOperand(1);
   SDValue TVal = Op.getOperand(2);
   SDValue FVal = Op.getOperand(3);
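+  // Carry the select node's no-NaNs fast-math flag into the lowering.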
+  bool HasNoNans = Op->getFlags().hasNoNaNs();
   SDLoc DL(Op);
-  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
+  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, HasNoNans, DL, DAG);
 }
 
 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
                                            SelectionDAG &DAG) const {
   SDValue CCVal = Op->getOperand(0);
   SDValue TVal = Op->getOperand(1);
   SDValue FVal = Op->getOperand(2);
+  bool HasNoNans = Op->getFlags().hasNoNaNs();
   SDLoc DL(Op);
 
   EVT Ty = Op.getValueType();
@@ -11260,7 +11324,7 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
                                      DAG.getUNDEF(MVT::f32), FVal);
   }
 
-  SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
+  SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, HasNoNans, DL, DAG);
 
   if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
     return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
@@ -15513,47 +15577,6 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
1551315577 llvm_unreachable("unexpected shift opcode");
1551415578}
1551515579
15516- static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
15517- AArch64CC::CondCode CC, bool NoNans, EVT VT,
15518- const SDLoc &dl, SelectionDAG &DAG) {
15519- EVT SrcVT = LHS.getValueType();
15520- assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
15521- "function only supposed to emit natural comparisons");
15522-
15523- if (SrcVT.getVectorElementType().isFloatingPoint()) {
15524- switch (CC) {
15525- default:
15526- return SDValue();
15527- case AArch64CC::NE: {
15528- SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
15529- return DAG.getNOT(dl, Fcmeq, VT);
15530- }
15531- case AArch64CC::EQ:
15532- return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
15533- case AArch64CC::GE:
15534- return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
15535- case AArch64CC::GT:
15536- return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
15537- case AArch64CC::LE:
15538- if (!NoNans)
15539- return SDValue();
15540- // If we ignore NaNs then we can use to the LS implementation.
15541- [[fallthrough]];
15542- case AArch64CC::LS:
15543- return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
15544- case AArch64CC::LT:
15545- if (!NoNans)
15546- return SDValue();
15547- // If we ignore NaNs then we can use to the MI implementation.
15548- [[fallthrough]];
15549- case AArch64CC::MI:
15550- return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
15551- }
15552- }
15553-
15554- return SDValue();
15555- }
15556-
1555715580SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
1555815581 SelectionDAG &DAG) const {
1555915582 if (Op.getValueType().isScalableVector())
@@ -25374,6 +25397,28 @@ static SDValue performDUPCombine(SDNode *N,
   }
 
   if (N->getOpcode() == AArch64ISD::DUP) {
+    // If the instruction is known to produce a scalar in SIMD registers, we
+    // can duplicate it across the vector lanes using DUPLANE instead of
+    // moving it to a GPR first. For example, this allows us to handle:
+    //   v4i32 = DUP (i32 (FCMGT (f32, f32)))
+    SDValue Op = N->getOperand(0);
+    // FIXME: Ideally, we should be able to handle all instructions that
+    // produce a scalar value in FPRs.
+    if (Op.getOpcode() == AArch64ISD::FCMEQ ||
+        Op.getOpcode() == AArch64ISD::FCMGE ||
+        Op.getOpcode() == AArch64ISD::FCMGT) {
+      EVT ElemVT = VT.getVectorElementType();
+      EVT ExpandedVT = VT;
+      // Insert into a 128-bit vector to match DUPLANE's pattern.
+      if (VT.getSizeInBits() != 128)
+        ExpandedVT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
+                                      128 / ElemVT.getSizeInBits());
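+      // Put the scalar into lane 0 of an undef vector and broadcast that
+      // lane, so the value never leaves the SIMD register file.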
+      SDValue Zero = DCI.DAG.getConstant(0, DL, MVT::i64);
+      SDValue Vec = DCI.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpandedVT,
+                                    DCI.DAG.getUNDEF(ExpandedVT), Op, Zero);
+      return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, Vec, Zero);
+    }
+
     if (DCI.isAfterLegalizeDAG()) {
       // If scalar dup's operand is extract_vector_elt, try to combine them into
       // duplane. For example,