Skip to content

Commit 8623154

Browse files
committed
[AArch64] Spare N2I roundtrip when splatting float comparison
Transform `select_cc t1, t2, -1, 0` for floats into a vector comparison that generates a mask; the mask is later combined with any vectorized DUPs.
1 parent 6410658 commit 8623154

File tree

4 files changed

+125
-49
lines changed

4 files changed

+125
-49
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 89 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -10913,9 +10913,48 @@ SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
1091310913
Cmp.getValue(1));
1091410914
}
1091510915

10916+
/// Emit a floating-point comparison of LHS and RHS as a mask-producing
/// AArch64ISD::FCMEQ / FCMGE / FCMGT node of type VT.
///
/// NE is built as NOT(FCMEQ). LS and MI are built as FCMGE / FCMGT with the
/// operands swapped; LE and LT reuse those two forms, but only when NoNans is
/// set. Returns an empty SDValue for every other condition code, and for
/// LE / LT when NoNans is false, so callers must check the result.
/// VT must have the same size in bits as the type of LHS (asserted).
10917+
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
10918+
AArch64CC::CondCode CC, bool NoNans, EVT VT,
10919+
const SDLoc &dl, SelectionDAG &DAG) {
10920+
EVT SrcVT = LHS.getValueType();
10921+
assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
10922+
"function only supposed to emit natural comparisons");
10923+
10924+
switch (CC) {
10925+
default:
10926+
return SDValue();
10927+
case AArch64CC::NE: {
10928+
SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
10929+
return DAG.getNOT(dl, Fcmeq, VT);
10930+
}
10931+
case AArch64CC::EQ:
10932+
return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
10933+
case AArch64CC::GE:
10934+
return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
10935+
case AArch64CC::GT:
10936+
return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
10937+
case AArch64CC::LE:
10938+
if (!NoNans)
10939+
return SDValue();
10940+
// If we ignore NaNs then we can fall through to the LS implementation.
10941+
[[fallthrough]];
10942+
case AArch64CC::LS:
10943+
return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
10944+
case AArch64CC::LT:
10945+
if (!NoNans)
10946+
return SDValue();
10947+
// If we ignore NaNs then we can fall through to the MI implementation.
10948+
[[fallthrough]];
10949+
case AArch64CC::MI:
10950+
return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
10951+
}
10952+
}
10953+
1091610954
SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
1091710955
SDValue RHS, SDValue TVal,
10918-
SDValue FVal, const SDLoc &dl,
10956+
SDValue FVal, bool HasNoNaNs,
10957+
const SDLoc &dl,
1091910958
SelectionDAG &DAG) const {
1092010959
// Handle f128 first, because it will result in a comparison of some RTLIB
1092110960
// call result against zero.
@@ -11099,6 +11138,29 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
1109911138
LHS.getValueType() == MVT::f64);
1110011139
assert(LHS.getValueType() == RHS.getValueType());
1110111140
EVT VT = TVal.getValueType();
11141+
11142+
// If the purpose of the comparison is to select between all ones
11143+
// or all zeros, use a vector comparison because the operands are already
11144+
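// stored in SIMD registers. NOTE(review): in this path CC2 is never read and ShouldInvert only swaps TVal/FVal, which the returned mask does not depend on — confirm inverted and two-comparison predicates are lowered correctly.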
11145+
auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
11146+
auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
11147+
if (Subtarget->isNeonAvailable() &&
11148+
(VT.getSizeInBits() == LHS.getValueType().getSizeInBits()) && CTVal &&
11149+
CFVal &&
11150+
((CTVal->isAllOnes() && CFVal->isZero()) ||
11151+
((CTVal->isZero()) && CFVal->isAllOnes()))) {
11152+
AArch64CC::CondCode CC1;
11153+
AArch64CC::CondCode CC2;
11154+
bool ShouldInvert = false;
11155+
changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
11156+
if (CTVal->isZero() ^ ShouldInvert)
11157+
std::swap(TVal, FVal);
11158+
bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs;
11159+
SDValue Res = EmitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, dl, DAG);
11160+
if (Res)
11161+
return Res;
11162+
}
11163+
1110211164
SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
1110311165

1110411166
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
@@ -11185,15 +11247,17 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
1118511247
SDValue RHS = Op.getOperand(1);
1118611248
SDValue TVal = Op.getOperand(2);
1118711249
SDValue FVal = Op.getOperand(3);
11250+
bool HasNoNans = Op->getFlags().hasNoNaNs();
1118811251
SDLoc DL(Op);
11189-
return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
11252+
return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, HasNoNans, DL, DAG);
1119011253
}
1119111254

1119211255
SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
1119311256
SelectionDAG &DAG) const {
1119411257
SDValue CCVal = Op->getOperand(0);
1119511258
SDValue TVal = Op->getOperand(1);
1119611259
SDValue FVal = Op->getOperand(2);
11260+
bool HasNoNans = Op->getFlags().hasNoNaNs();
1119711261
SDLoc DL(Op);
1119811262

1119911263
EVT Ty = Op.getValueType();
@@ -11260,7 +11324,7 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
1126011324
DAG.getUNDEF(MVT::f32), FVal);
1126111325
}
1126211326

11263-
SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
11327+
SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, HasNoNans, DL, DAG);
1126411328

1126511329
if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
1126611330
return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
@@ -15513,47 +15577,6 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
1551315577
llvm_unreachable("unexpected shift opcode");
1551415578
}
1551515579

15516-
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
15517-
AArch64CC::CondCode CC, bool NoNans, EVT VT,
15518-
const SDLoc &dl, SelectionDAG &DAG) {
15519-
EVT SrcVT = LHS.getValueType();
15520-
assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
15521-
"function only supposed to emit natural comparisons");
15522-
15523-
if (SrcVT.getVectorElementType().isFloatingPoint()) {
15524-
switch (CC) {
15525-
default:
15526-
return SDValue();
15527-
case AArch64CC::NE: {
15528-
SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
15529-
return DAG.getNOT(dl, Fcmeq, VT);
15530-
}
15531-
case AArch64CC::EQ:
15532-
return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
15533-
case AArch64CC::GE:
15534-
return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
15535-
case AArch64CC::GT:
15536-
return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
15537-
case AArch64CC::LE:
15538-
if (!NoNans)
15539-
return SDValue();
15540-
// If we ignore NaNs then we can use to the LS implementation.
15541-
[[fallthrough]];
15542-
case AArch64CC::LS:
15543-
return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
15544-
case AArch64CC::LT:
15545-
if (!NoNans)
15546-
return SDValue();
15547-
// If we ignore NaNs then we can use to the MI implementation.
15548-
[[fallthrough]];
15549-
case AArch64CC::MI:
15550-
return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
15551-
}
15552-
}
15553-
15554-
return SDValue();
15555-
}
15556-
1555715580
SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
1555815581
SelectionDAG &DAG) const {
1555915582
if (Op.getValueType().isScalableVector())
@@ -25374,6 +25397,28 @@ static SDValue performDUPCombine(SDNode *N,
2537425397
}
2537525398

2537625399
if (N->getOpcode() == AArch64ISD::DUP) {
25400+
// If the instruction is known to produce a scalar in SIMD registers, we can
25401+
// duplicate it across the vector lanes using DUPLANE instead of moving
25402+
// it to a GPR first. For example, this allows us to handle:
25403+
// v4i32 = DUP (i32 (FCMGT (f32, f32)))
25404+
SDValue Op = N->getOperand(0);
25405+
// FIXME: Ideally, we should be able to handle all instructions that
25406+
// produce a scalar value in FPRs.
25407+
if (Op.getOpcode() == AArch64ISD::FCMEQ ||
25408+
Op.getOpcode() == AArch64ISD::FCMGE ||
25409+
Op.getOpcode() == AArch64ISD::FCMGT) {
25410+
EVT ElemVT = VT.getVectorElementType();
25411+
EVT ExpandedVT = VT;
25412+
// Insert into a 128-bit vector to match DUPLANE's pattern.
25413+
if (VT.getSizeInBits() != 128)
25414+
ExpandedVT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
25415+
128 / ElemVT.getSizeInBits());
25416+
SDValue Zero = DCI.DAG.getConstant(0, DL, MVT::i64);
25417+
SDValue Vec = DCI.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpandedVT,
25418+
DCI.DAG.getUNDEF(ExpandedVT), Op, Zero);
25419+
return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, Vec, Zero);
25420+
}
25421+
2537725422
if (DCI.isAfterLegalizeDAG()) {
2537825423
// If scalar dup's operand is extract_vector_elt, try to combine them into
2537925424
// duplane. For example,

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -646,8 +646,8 @@ class AArch64TargetLowering : public TargetLowering {
646646
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
647647
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
648648
SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
649-
SDValue TVal, SDValue FVal, const SDLoc &dl,
650-
SelectionDAG &DAG) const;
649+
SDValue TVal, SDValue FVal, bool HasNoNans,
650+
const SDLoc &dl, SelectionDAG &DAG) const;
651651
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
652652
SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
653653
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;

llvm/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -174,9 +174,8 @@ define <1 x i16> @test_select_f16_i16(half %i105, half %in, <1 x i16> %x, <1 x i
174174
; CHECK-LABEL: test_select_f16_i16:
175175
; CHECK: // %bb.0:
176176
; CHECK-NEXT: fcvt s0, h0
177-
; CHECK-NEXT: fcmp s0, s0
178-
; CHECK-NEXT: csetm w8, vs
179-
; CHECK-NEXT: dup v0.4h, w8
177+
; CHECK-NEXT: fcmgt s0, s0, s0
178+
; CHECK-NEXT: dup v0.4h, v0.h[0]
180179
; CHECK-NEXT: bsl v0.8b, v2.8b, v3.8b
181180
; CHECK-NEXT: ret
182181
%i179 = fcmp uno half %i105, zeroinitializer
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
3+
4+
define <4 x float> @dup32(float %a, float %b) {
5+
; CHECK-LABEL: dup32:
6+
; CHECK: // %bb.0: // %entry
7+
; CHECK-NEXT: fcmgt s0, s0, s1
8+
; CHECK-NEXT: dup v0.4s, v0.s[0]
9+
; CHECK-NEXT: ret
10+
entry:
11+
%0 = fcmp ogt float %a, %b
12+
%vcmpd.i = sext i1 %0 to i32
13+
%vecinit.i = insertelement <4 x i32> poison, i32 %vcmpd.i, i64 0
14+
%1 = bitcast <4 x i32> %vecinit.i to <4 x float>
15+
%2 = shufflevector <4 x float> %1, <4 x float> poison, <4 x i32> zeroinitializer
16+
ret <4 x float> %2
17+
}
18+
19+
define <2 x double> @dup64(double %a, double %b) {
20+
; CHECK-LABEL: dup64:
21+
; CHECK: // %bb.0: // %entry
22+
; CHECK-NEXT: fcmgt d0, d0, d1
23+
; CHECK-NEXT: dup v0.2d, v0.d[0]
24+
; CHECK-NEXT: ret
25+
entry:
26+
%0 = fcmp ogt double %a, %b
27+
%vcmpd.i = sext i1 %0 to i64
28+
%vecinit.i = insertelement <2 x i64> poison, i64 %vcmpd.i, i64 0
29+
%1 = bitcast <2 x i64> %vecinit.i to <2 x double>
30+
%2 = shufflevector <2 x double> %1, <2 x double> poison, <2 x i32> zeroinitializer
31+
ret <2 x double> %2
32+
}

0 commit comments

Comments
 (0)