Skip to content

Commit c68c342

Browse files
committed
[AArch64] Improve lowering of scalar abs(sub(a, b)).
This patch avoids a comparison against zero when lowering abs(sub(a, b)) patterns, instead reusing the condition codes generated by a subs of the operands directly. For example, currently:

```
sxtb w8, w0
sub w8, w8, w1, sxtb
cmp w8, #0
cneg w0, w8, pl
```

becomes:

```
sxtb w8, w1
sxtb w9, w0
subs w8, w9, w8
cneg w0, w8, pl
```

Whilst this doesn't decrease the number of instructions used, the new version exposes more ILP and uses "cheaper" instructions, which typically have lower latency and/or higher throughput.

This patch also includes a somewhat orthogonal change in performNegCSelCombine, which I included to avoid a code generation regression in CodeGen/AArch64/abd[su]-neg.ll: without it, the combine negates the operands of the csel, leading to an extra sub instruction. If preferable, I can open a separate PR for this.
1 parent fc59761 commit c68c342

File tree

5 files changed

+128
-113
lines changed

5 files changed

+128
-113
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7118,12 +7118,21 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
71187118
return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
71197119

71207120
SDLoc DL(Op);
7121-
SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
7122-
Op.getOperand(0));
7123-
// Generate SUBS & CSEL.
7124-
SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
7125-
Op.getOperand(0), DAG.getConstant(0, DL, VT));
7126-
return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7121+
SDValue Val = Op.getOperand(0);
7122+
SDValue Neg = DAG.getNegative(Val, DL, VT);
7123+
SDValue Cmp;
7124+
7125+
// For abs(sub(lhs, rhs)), we can compare lhs and rhs directly. This allows
7126+
// reusing the subs operation for the calculation and comparison.
7127+
if (Val.getOpcode() == ISD::SUB)
7128+
Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
7129+
Val.getOperand(0), Val.getOperand(1));
7130+
else
7131+
// Otherwise, compare with zero.
7132+
Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Val,
7133+
DAG.getConstant(0, DL, VT));
7134+
7135+
return DAG.getNode(AArch64ISD::CSEL, DL, VT, Val, Neg,
71277136
DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
71287137
Cmp.getValue(1));
71297138
}
@@ -20851,11 +20860,17 @@ static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
2085120860
if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
2085220861
return SDValue();
2085320862

20854-
SDValue N0N = getNegatedInteger(N0, DAG);
20855-
SDValue N1N = getNegatedInteger(N1, DAG);
20856-
2085720863
SDLoc DL(N);
2085820864
EVT VT = CSel.getValueType();
20865+
20866+
// If the operands are negations of each other, reverse them.
20867+
if ((isNegatedInteger(N0) && N0.getOperand(1) == N1) ||
20868+
(isNegatedInteger(N1) && N1.getOperand(1) == N0))
20869+
return DAG.getNode(AArch64ISD::CSEL, DL, VT, N1, N0, CSel.getOperand(2),
20870+
CSel.getOperand(3));
20871+
20872+
SDValue N0N = getNegatedInteger(N0, DAG);
20873+
SDValue N1N = getNegatedInteger(N1, DAG);
2085920874
return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
2086020875
CSel.getOperand(3));
2086120876
}

llvm/test/CodeGen/AArch64/abds-neg.ll

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
99
; CHECK-LABEL: abd_ext_i8:
1010
; CHECK: // %bb.0:
11-
; CHECK-NEXT: sxtb w8, w0
12-
; CHECK-NEXT: sub w8, w8, w1, sxtb
13-
; CHECK-NEXT: cmp w8, #0
11+
; CHECK-NEXT: sxtb w8, w1
12+
; CHECK-NEXT: sxtb w9, w0
13+
; CHECK-NEXT: subs w8, w9, w8
1414
; CHECK-NEXT: cneg w0, w8, pl
1515
; CHECK-NEXT: ret
1616
%aext = sext i8 %a to i64
@@ -25,9 +25,9 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
2525
define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
2626
; CHECK-LABEL: abd_ext_i8_i16:
2727
; CHECK: // %bb.0:
28-
; CHECK-NEXT: sxtb w8, w0
29-
; CHECK-NEXT: sub w8, w8, w1, sxth
30-
; CHECK-NEXT: cmp w8, #0
28+
; CHECK-NEXT: sxth w8, w1
29+
; CHECK-NEXT: sxtb w9, w0
30+
; CHECK-NEXT: subs w8, w9, w8
3131
; CHECK-NEXT: cneg w0, w8, pl
3232
; CHECK-NEXT: ret
3333
%aext = sext i8 %a to i64
@@ -42,9 +42,9 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
4242
define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
4343
; CHECK-LABEL: abd_ext_i8_undef:
4444
; CHECK: // %bb.0:
45-
; CHECK-NEXT: sxtb w8, w0
46-
; CHECK-NEXT: sub w8, w8, w1, sxtb
47-
; CHECK-NEXT: cmp w8, #0
45+
; CHECK-NEXT: sxtb w8, w1
46+
; CHECK-NEXT: sxtb w9, w0
47+
; CHECK-NEXT: subs w8, w9, w8
4848
; CHECK-NEXT: cneg w0, w8, pl
4949
; CHECK-NEXT: ret
5050
%aext = sext i8 %a to i64
@@ -59,9 +59,9 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
5959
define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
6060
; CHECK-LABEL: abd_ext_i16:
6161
; CHECK: // %bb.0:
62-
; CHECK-NEXT: sxth w8, w0
63-
; CHECK-NEXT: sub w8, w8, w1, sxth
64-
; CHECK-NEXT: cmp w8, #0
62+
; CHECK-NEXT: sxth w8, w1
63+
; CHECK-NEXT: sxth w9, w0
64+
; CHECK-NEXT: subs w8, w9, w8
6565
; CHECK-NEXT: cneg w0, w8, pl
6666
; CHECK-NEXT: ret
6767
%aext = sext i16 %a to i64
@@ -92,9 +92,9 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind {
9292
define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
9393
; CHECK-LABEL: abd_ext_i16_undef:
9494
; CHECK: // %bb.0:
95-
; CHECK-NEXT: sxth w8, w0
96-
; CHECK-NEXT: sub w8, w8, w1, sxth
97-
; CHECK-NEXT: cmp w8, #0
95+
; CHECK-NEXT: sxth w8, w1
96+
; CHECK-NEXT: sxth w9, w0
97+
; CHECK-NEXT: subs w8, w9, w8
9898
; CHECK-NEXT: cneg w0, w8, pl
9999
; CHECK-NEXT: ret
100100
%aext = sext i16 %a to i64

llvm/test/CodeGen/AArch64/abds.ll

Lines changed: 37 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
99
; CHECK-LABEL: abd_ext_i8:
1010
; CHECK: // %bb.0:
11-
; CHECK-NEXT: sxtb w8, w0
12-
; CHECK-NEXT: sub w8, w8, w1, sxtb
13-
; CHECK-NEXT: cmp w8, #0
11+
; CHECK-NEXT: sxtb w8, w1
12+
; CHECK-NEXT: sxtb w9, w0
13+
; CHECK-NEXT: subs w8, w9, w8
1414
; CHECK-NEXT: cneg w0, w8, mi
1515
; CHECK-NEXT: ret
1616
%aext = sext i8 %a to i64
@@ -24,9 +24,9 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
2424
define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
2525
; CHECK-LABEL: abd_ext_i8_i16:
2626
; CHECK: // %bb.0:
27-
; CHECK-NEXT: sxtb w8, w0
28-
; CHECK-NEXT: sub w8, w8, w1, sxth
29-
; CHECK-NEXT: cmp w8, #0
27+
; CHECK-NEXT: sxth w8, w1
28+
; CHECK-NEXT: sxtb w9, w0
29+
; CHECK-NEXT: subs w8, w9, w8
3030
; CHECK-NEXT: cneg w0, w8, mi
3131
; CHECK-NEXT: ret
3232
%aext = sext i8 %a to i64
@@ -40,9 +40,9 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
4040
define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
4141
; CHECK-LABEL: abd_ext_i8_undef:
4242
; CHECK: // %bb.0:
43-
; CHECK-NEXT: sxtb w8, w0
44-
; CHECK-NEXT: sub w8, w8, w1, sxtb
45-
; CHECK-NEXT: cmp w8, #0
43+
; CHECK-NEXT: sxtb w8, w1
44+
; CHECK-NEXT: sxtb w9, w0
45+
; CHECK-NEXT: subs w8, w9, w8
4646
; CHECK-NEXT: cneg w0, w8, mi
4747
; CHECK-NEXT: ret
4848
%aext = sext i8 %a to i64
@@ -56,9 +56,9 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
5656
define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
5757
; CHECK-LABEL: abd_ext_i16:
5858
; CHECK: // %bb.0:
59-
; CHECK-NEXT: sxth w8, w0
60-
; CHECK-NEXT: sub w8, w8, w1, sxth
61-
; CHECK-NEXT: cmp w8, #0
59+
; CHECK-NEXT: sxth w8, w1
60+
; CHECK-NEXT: sxth w9, w0
61+
; CHECK-NEXT: subs w8, w9, w8
6262
; CHECK-NEXT: cneg w0, w8, mi
6363
; CHECK-NEXT: ret
6464
%aext = sext i16 %a to i64
@@ -87,9 +87,9 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind {
8787
define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
8888
; CHECK-LABEL: abd_ext_i16_undef:
8989
; CHECK: // %bb.0:
90-
; CHECK-NEXT: sxth w8, w0
91-
; CHECK-NEXT: sub w8, w8, w1, sxth
92-
; CHECK-NEXT: cmp w8, #0
90+
; CHECK-NEXT: sxth w8, w1
91+
; CHECK-NEXT: sxth w9, w0
92+
; CHECK-NEXT: subs w8, w9, w8
9393
; CHECK-NEXT: cneg w0, w8, mi
9494
; CHECK-NEXT: ret
9595
%aext = sext i16 %a to i64
@@ -214,9 +214,9 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
214214
define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
215215
; CHECK-LABEL: abd_minmax_i8:
216216
; CHECK: // %bb.0:
217-
; CHECK-NEXT: sxtb w8, w0
218-
; CHECK-NEXT: sub w8, w8, w1, sxtb
219-
; CHECK-NEXT: cmp w8, #0
217+
; CHECK-NEXT: sxtb w8, w1
218+
; CHECK-NEXT: sxtb w9, w0
219+
; CHECK-NEXT: subs w8, w9, w8
220220
; CHECK-NEXT: cneg w0, w8, mi
221221
; CHECK-NEXT: ret
222222
%min = call i8 @llvm.smin.i8(i8 %a, i8 %b)
@@ -228,9 +228,9 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
228228
define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind {
229229
; CHECK-LABEL: abd_minmax_i16:
230230
; CHECK: // %bb.0:
231-
; CHECK-NEXT: sxth w8, w0
232-
; CHECK-NEXT: sub w8, w8, w1, sxth
233-
; CHECK-NEXT: cmp w8, #0
231+
; CHECK-NEXT: sxth w8, w1
232+
; CHECK-NEXT: sxth w9, w0
233+
; CHECK-NEXT: subs w8, w9, w8
234234
; CHECK-NEXT: cneg w0, w8, mi
235235
; CHECK-NEXT: ret
236236
%min = call i16 @llvm.smin.i16(i16 %a, i16 %b)
@@ -286,9 +286,9 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
286286
define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
287287
; CHECK-LABEL: abd_cmp_i8:
288288
; CHECK: // %bb.0:
289-
; CHECK-NEXT: sxtb w8, w0
290-
; CHECK-NEXT: sub w8, w8, w1, sxtb
291-
; CHECK-NEXT: cmp w8, #0
289+
; CHECK-NEXT: sxtb w8, w1
290+
; CHECK-NEXT: sxtb w9, w0
291+
; CHECK-NEXT: subs w8, w9, w8
292292
; CHECK-NEXT: cneg w0, w8, mi
293293
; CHECK-NEXT: ret
294294
%cmp = icmp sgt i8 %a, %b
@@ -301,9 +301,9 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
301301
define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind {
302302
; CHECK-LABEL: abd_cmp_i16:
303303
; CHECK: // %bb.0:
304-
; CHECK-NEXT: sxth w8, w0
305-
; CHECK-NEXT: sub w8, w8, w1, sxth
306-
; CHECK-NEXT: cmp w8, #0
304+
; CHECK-NEXT: sxth w8, w1
305+
; CHECK-NEXT: sxth w9, w0
306+
; CHECK-NEXT: subs w8, w9, w8
307307
; CHECK-NEXT: cneg w0, w8, mi
308308
; CHECK-NEXT: ret
309309
%cmp = icmp sge i16 %a, %b
@@ -507,11 +507,11 @@ define i64 @vector_legalized(i16 %a, i16 %b) {
507507
; CHECK-LABEL: vector_legalized:
508508
; CHECK: // %bb.0:
509509
; CHECK-NEXT: movi v0.2d, #0000000000000000
510-
; CHECK-NEXT: sxth w8, w0
511-
; CHECK-NEXT: sub w8, w8, w1, sxth
512-
; CHECK-NEXT: addp d0, v0.2d
513-
; CHECK-NEXT: cmp w8, #0
510+
; CHECK-NEXT: sxth w8, w1
511+
; CHECK-NEXT: sxth w9, w0
512+
; CHECK-NEXT: subs w8, w9, w8
514513
; CHECK-NEXT: cneg w8, w8, mi
514+
; CHECK-NEXT: addp d0, v0.2d
515515
; CHECK-NEXT: fmov x9, d0
516516
; CHECK-NEXT: add x0, x9, x8
517517
; CHECK-NEXT: ret
@@ -532,9 +532,9 @@ define i64 @vector_legalized(i16 %a, i16 %b) {
532532
define i8 @abd_select_i8(i8 %a, i8 %b) nounwind {
533533
; CHECK-LABEL: abd_select_i8:
534534
; CHECK: // %bb.0:
535-
; CHECK-NEXT: sxtb w8, w0
536-
; CHECK-NEXT: sub w8, w8, w1, sxtb
537-
; CHECK-NEXT: cmp w8, #0
535+
; CHECK-NEXT: sxtb w8, w1
536+
; CHECK-NEXT: sxtb w9, w0
537+
; CHECK-NEXT: subs w8, w9, w8
538538
; CHECK-NEXT: cneg w0, w8, mi
539539
; CHECK-NEXT: ret
540540
%cmp = icmp slt i8 %a, %b
@@ -547,9 +547,9 @@ define i8 @abd_select_i8(i8 %a, i8 %b) nounwind {
547547
define i16 @abd_select_i16(i16 %a, i16 %b) nounwind {
548548
; CHECK-LABEL: abd_select_i16:
549549
; CHECK: // %bb.0:
550-
; CHECK-NEXT: sxth w8, w0
551-
; CHECK-NEXT: sub w8, w8, w1, sxth
552-
; CHECK-NEXT: cmp w8, #0
550+
; CHECK-NEXT: sxth w8, w1
551+
; CHECK-NEXT: sxth w9, w0
552+
; CHECK-NEXT: subs w8, w9, w8
553553
; CHECK-NEXT: cneg w0, w8, mi
554554
; CHECK-NEXT: ret
555555
%cmp = icmp sle i16 %a, %b

llvm/test/CodeGen/AArch64/abdu-neg.ll

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
99
; CHECK-LABEL: abd_ext_i8:
1010
; CHECK: // %bb.0:
11-
; CHECK-NEXT: and w8, w0, #0xff
12-
; CHECK-NEXT: sub w8, w8, w1, uxtb
13-
; CHECK-NEXT: cmp w8, #0
11+
; CHECK-NEXT: and w8, w1, #0xff
12+
; CHECK-NEXT: and w9, w0, #0xff
13+
; CHECK-NEXT: subs w8, w9, w8
1414
; CHECK-NEXT: cneg w0, w8, pl
1515
; CHECK-NEXT: ret
1616
%aext = zext i8 %a to i64
@@ -25,9 +25,9 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
2525
define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
2626
; CHECK-LABEL: abd_ext_i8_i16:
2727
; CHECK: // %bb.0:
28-
; CHECK-NEXT: and w8, w0, #0xff
29-
; CHECK-NEXT: sub w8, w8, w1, uxth
30-
; CHECK-NEXT: cmp w8, #0
28+
; CHECK-NEXT: and w8, w1, #0xffff
29+
; CHECK-NEXT: and w9, w0, #0xff
30+
; CHECK-NEXT: subs w8, w9, w8
3131
; CHECK-NEXT: cneg w0, w8, pl
3232
; CHECK-NEXT: ret
3333
%aext = zext i8 %a to i64
@@ -42,9 +42,9 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
4242
define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
4343
; CHECK-LABEL: abd_ext_i8_undef:
4444
; CHECK: // %bb.0:
45-
; CHECK-NEXT: and w8, w0, #0xff
46-
; CHECK-NEXT: sub w8, w8, w1, uxtb
47-
; CHECK-NEXT: cmp w8, #0
45+
; CHECK-NEXT: and w8, w1, #0xff
46+
; CHECK-NEXT: and w9, w0, #0xff
47+
; CHECK-NEXT: subs w8, w9, w8
4848
; CHECK-NEXT: cneg w0, w8, pl
4949
; CHECK-NEXT: ret
5050
%aext = zext i8 %a to i64
@@ -59,9 +59,9 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
5959
define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
6060
; CHECK-LABEL: abd_ext_i16:
6161
; CHECK: // %bb.0:
62-
; CHECK-NEXT: and w8, w0, #0xffff
63-
; CHECK-NEXT: sub w8, w8, w1, uxth
64-
; CHECK-NEXT: cmp w8, #0
62+
; CHECK-NEXT: and w8, w1, #0xffff
63+
; CHECK-NEXT: and w9, w0, #0xffff
64+
; CHECK-NEXT: subs w8, w9, w8
6565
; CHECK-NEXT: cneg w0, w8, pl
6666
; CHECK-NEXT: ret
6767
%aext = zext i16 %a to i64
@@ -92,9 +92,9 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind {
9292
define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
9393
; CHECK-LABEL: abd_ext_i16_undef:
9494
; CHECK: // %bb.0:
95-
; CHECK-NEXT: and w8, w0, #0xffff
96-
; CHECK-NEXT: sub w8, w8, w1, uxth
97-
; CHECK-NEXT: cmp w8, #0
95+
; CHECK-NEXT: and w8, w1, #0xffff
96+
; CHECK-NEXT: and w9, w0, #0xffff
97+
; CHECK-NEXT: subs w8, w9, w8
9898
; CHECK-NEXT: cneg w0, w8, pl
9999
; CHECK-NEXT: ret
100100
%aext = zext i16 %a to i64

0 commit comments

Comments
 (0)