Skip to content

Commit 4c853be

Browse files
authored
[AArch64] Replace uaddlv with addv for popcount operation (#121934)
Replace `uaddlv` with `addv` for the popcount operation, as it is a simpler operation. On certain platforms like Cortex-A510, `addv` has a latency of 3 cycles whereas `uaddlv` has a latency of 4 cycles. GCC generates `addv` as well: https://godbolt.org/z/MnYG9jcEo
1 parent eb63cd6 commit 4c853be

File tree

5 files changed

+66
-69
lines changed

5 files changed

+66
-69
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 11 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10764,37 +10764,30 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
1076410764
// FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
1076510765
// CNT V0.8B, V0.8B // 8xbyte pop-counts
1076610766
// ADDV B0, V0.8B // sum 8xbyte pop-counts
10767-
// UMOV X0, V0.B[0] // copy byte result back to integer reg
10767+
// FMOV X0, D0 // copy result back to integer reg
1076810768
if (VT == MVT::i32 || VT == MVT::i64) {
1076910769
if (VT == MVT::i32)
1077010770
Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
1077110771
Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
1077210772

1077310773
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
10774-
SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
10775-
UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
10774+
SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
10775+
if (VT == MVT::i32)
10776+
AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
1077610777
DAG.getConstant(0, DL, MVT::i64));
10777-
10778+
AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
1077810779
if (IsParity)
10779-
UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
10780-
DAG.getConstant(1, DL, MVT::i32));
10781-
10782-
if (VT == MVT::i64)
10783-
UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
10784-
return UaddLV;
10780+
AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
10781+
return AddV;
1078510782
} else if (VT == MVT::i128) {
1078610783
Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
1078710784

1078810785
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
10789-
SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
10790-
UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
10791-
DAG.getConstant(0, DL, MVT::i64));
10792-
10786+
SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
10787+
AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
1079310788
if (IsParity)
10794-
UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
10795-
DAG.getConstant(1, DL, MVT::i32));
10796-
10797-
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
10789+
AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
10790+
return AddV;
1079810791
}
1079910792

1080010793
assert(!IsParity && "ISD::PARITY of vector types not supported");

llvm/test/CodeGen/AArch64/arm64-popcnt.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
88
; CHECK: // %bb.0:
99
; CHECK-NEXT: fmov s0, w0
1010
; CHECK-NEXT: cnt.8b v0, v0
11-
; CHECK-NEXT: uaddlv.8b h0, v0
11+
; CHECK-NEXT: addv.8b b0, v0
1212
; CHECK-NEXT: fmov w0, s0
1313
; CHECK-NEXT: ret
1414
;
@@ -43,7 +43,7 @@ define i32 @cnt32_advsimd_2(<2 x i32> %x) {
4343
; CHECK-NEXT: fmov w8, s0
4444
; CHECK-NEXT: fmov s0, w8
4545
; CHECK-NEXT: cnt.8b v0, v0
46-
; CHECK-NEXT: uaddlv.8b h0, v0
46+
; CHECK-NEXT: addv.8b b0, v0
4747
; CHECK-NEXT: fmov w0, s0
4848
; CHECK-NEXT: ret
4949
;
@@ -79,8 +79,8 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
7979
; CHECK: // %bb.0:
8080
; CHECK-NEXT: fmov d0, x0
8181
; CHECK-NEXT: cnt.8b v0, v0
82-
; CHECK-NEXT: uaddlv.8b h0, v0
83-
; CHECK-NEXT: fmov w0, s0
82+
; CHECK-NEXT: addv.8b b0, v0
83+
; CHECK-NEXT: fmov x0, d0
8484
; CHECK-NEXT: ret
8585
;
8686
; CHECK-NONEON-LABEL: cnt64_advsimd:

llvm/test/CodeGen/AArch64/dp1.ll

Lines changed: 36 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -197,52 +197,58 @@ define void @cttz_zeroundef_i64() {
197197
}
198198

199199
define void @ctpop_i32() {
200-
; CHECK-LABEL: ctpop_i32:
201-
; CHECK: // %bb.0:
202-
; CHECK-NEXT: adrp x8, :got:var32
203-
; CHECK-NEXT: ldr x8, [x8, :got_lo12:var32]
204-
; CHECK-NEXT: ldr w9, [x8]
205-
; CHECK-NEXT: fmov d0, x9
206-
; CHECK-NEXT: cnt v0.8b, v0.8b
207-
; CHECK-NEXT: uaddlv h0, v0.8b
208-
; CHECK-NEXT: str s0, [x8]
209-
; CHECK-NEXT: ret
200+
; CHECK-SDAG-LABEL: ctpop_i32:
201+
; CHECK-SDAG: // %bb.0:
202+
; CHECK-SDAG-NEXT: adrp x8, :got:var32
203+
; CHECK-SDAG-NEXT: ldr x8, [x8, :got_lo12:var32]
204+
; CHECK-SDAG-NEXT: ldr w9, [x8]
205+
; CHECK-SDAG-NEXT: fmov d0, x9
206+
; CHECK-SDAG-NEXT: cnt v0.8b, v0.8b
207+
; CHECK-SDAG-NEXT: addv b0, v0.8b
208+
; CHECK-SDAG-NEXT: fmov w9, s0
209+
; CHECK-SDAG-NEXT: str w9, [x8]
210+
; CHECK-SDAG-NEXT: ret
211+
;
212+
; CHECK-GISEL-LABEL: ctpop_i32:
213+
; CHECK-GISEL: // %bb.0:
214+
; CHECK-GISEL-NEXT: adrp x8, :got:var32
215+
; CHECK-GISEL-NEXT: ldr x8, [x8, :got_lo12:var32]
216+
; CHECK-GISEL-NEXT: ldr w9, [x8]
217+
; CHECK-GISEL-NEXT: fmov d0, x9
218+
; CHECK-GISEL-NEXT: cnt v0.8b, v0.8b
219+
; CHECK-GISEL-NEXT: uaddlv h0, v0.8b
220+
; CHECK-GISEL-NEXT: str s0, [x8]
221+
; CHECK-GISEL-NEXT: ret
210222
%val0_tmp = load i32, ptr @var32
211223
%val4_tmp = call i32 @llvm.ctpop.i32(i32 %val0_tmp)
212224
store volatile i32 %val4_tmp, ptr @var32
213225
ret void
214226
}
215227

216-
define void @ctpop_i64() {
217-
; CHECK-SDAG-LABEL: ctpop_i64:
228+
define i64 @popcnt(i64 %a, ptr %p) {
229+
; CHECK-SDAG-LABEL: popcnt:
218230
; CHECK-SDAG: // %bb.0:
219-
; CHECK-SDAG-NEXT: adrp x8, :got:var64
220-
; CHECK-SDAG-NEXT: ldr x8, [x8, :got_lo12:var64]
221-
; CHECK-SDAG-NEXT: ldr d0, [x8]
231+
; CHECK-SDAG-NEXT: fmov d0, x0
232+
; CHECK-SDAG-NEXT: mov x0, xzr
222233
; CHECK-SDAG-NEXT: cnt v0.8b, v0.8b
223-
; CHECK-SDAG-NEXT: uaddlv h0, v0.8b
224-
; CHECK-SDAG-NEXT: fmov w9, s0
225-
; CHECK-SDAG-NEXT: str x9, [x8]
234+
; CHECK-SDAG-NEXT: addv b0, v0.8b
235+
; CHECK-SDAG-NEXT: str d0, [x1]
226236
; CHECK-SDAG-NEXT: ret
227237
;
228-
; CHECK-GISEL-LABEL: ctpop_i64:
238+
; CHECK-GISEL-LABEL: popcnt:
229239
; CHECK-GISEL: // %bb.0:
230-
; CHECK-GISEL-NEXT: adrp x8, :got:var64
231-
; CHECK-GISEL-NEXT: ldr x8, [x8, :got_lo12:var64]
232-
; CHECK-GISEL-NEXT: ldr x9, [x8]
233-
; CHECK-GISEL-NEXT: fmov d0, x9
240+
; CHECK-GISEL-NEXT: fmov d0, x0
241+
; CHECK-GISEL-NEXT: mov x0, xzr
234242
; CHECK-GISEL-NEXT: cnt v0.8b, v0.8b
235243
; CHECK-GISEL-NEXT: uaddlv h0, v0.8b
236-
; CHECK-GISEL-NEXT: mov w9, v0.s[0]
237-
; CHECK-GISEL-NEXT: str x9, [x8]
244+
; CHECK-GISEL-NEXT: mov w8, v0.s[0]
245+
; CHECK-GISEL-NEXT: str x8, [x1]
238246
; CHECK-GISEL-NEXT: ret
239-
%val0_tmp = load i64, ptr @var64
240-
%val4_tmp = call i64 @llvm.ctpop.i64(i64 %val0_tmp)
241-
store volatile i64 %val4_tmp, ptr @var64
242-
ret void
247+
%2 = call i64 @llvm.ctpop(i64 %a)
248+
store i64 %2, ptr %p
249+
ret i64 0
243250
}
244251

245-
246252
declare i32 @llvm.bswap.i32(i32)
247253
declare i64 @llvm.bswap.i64(i64)
248254
declare i32 @llvm.ctlz.i32 (i32, i1)

llvm/test/CodeGen/AArch64/parity.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,9 @@ define i64 @parity_64(i64 %x) {
114114
; CHECK: // %bb.0:
115115
; CHECK-NEXT: fmov d0, x0
116116
; CHECK-NEXT: cnt v0.8b, v0.8b
117-
; CHECK-NEXT: uaddlv h0, v0.8b
118-
; CHECK-NEXT: fmov w8, s0
119-
; CHECK-NEXT: and w0, w8, #0x1
117+
; CHECK-NEXT: addv b0, v0.8b
118+
; CHECK-NEXT: fmov x8, d0
119+
; CHECK-NEXT: and x0, x8, #0x1
120120
; CHECK-NEXT: ret
121121
;
122122
; CHECK-CSSC-LABEL: parity_64:
@@ -136,9 +136,9 @@ define i128 @parity_128(i128 %x) {
136136
; CHECK-NEXT: mov v0.d[1], x1
137137
; CHECK-NEXT: mov x1, xzr
138138
; CHECK-NEXT: cnt v0.16b, v0.16b
139-
; CHECK-NEXT: uaddlv h0, v0.16b
140-
; CHECK-NEXT: fmov w8, s0
141-
; CHECK-NEXT: and w0, w8, #0x1
139+
; CHECK-NEXT: addv b0, v0.16b
140+
; CHECK-NEXT: fmov x8, d0
141+
; CHECK-NEXT: and x0, x8, #0x1
142142
; CHECK-NEXT: ret
143143
;
144144
; CHECK-CSSC-LABEL: parity_128:
@@ -158,8 +158,8 @@ define i32 @parity_64_trunc(i64 %x) {
158158
; CHECK: // %bb.0:
159159
; CHECK-NEXT: fmov d0, x0
160160
; CHECK-NEXT: cnt v0.8b, v0.8b
161-
; CHECK-NEXT: uaddlv h0, v0.8b
162-
; CHECK-NEXT: fmov w8, s0
161+
; CHECK-NEXT: addv b0, v0.8b
162+
; CHECK-NEXT: fmov x8, d0
163163
; CHECK-NEXT: and w0, w8, #0x1
164164
; CHECK-NEXT: ret
165165
;

llvm/test/CodeGen/AArch64/popcount.ll

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) {
2828
; CHECK-NEXT: add x8, x0, #8
2929
; CHECK-NEXT: ld1 { v0.d }[1], [x8]
3030
; CHECK-NEXT: cnt v0.16b, v0.16b
31-
; CHECK-NEXT: uaddlv h0, v0.16b
31+
; CHECK-NEXT: addv b0, v0.16b
3232
; CHECK-NEXT: fmov w0, s0
3333
; CHECK-NEXT: ret
3434
;
@@ -104,8 +104,8 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
104104
; CHECK-NEXT: ld1 { v1.d }[1], [x8]
105105
; CHECK-NEXT: cnt v0.16b, v0.16b
106106
; CHECK-NEXT: cnt v1.16b, v1.16b
107-
; CHECK-NEXT: uaddlv h0, v0.16b
108-
; CHECK-NEXT: uaddlv h1, v1.16b
107+
; CHECK-NEXT: addv b0, v0.16b
108+
; CHECK-NEXT: addv b1, v1.16b
109109
; CHECK-NEXT: fmov w8, s0
110110
; CHECK-NEXT: fmov w9, s1
111111
; CHECK-NEXT: add w0, w9, w8
@@ -191,12 +191,10 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
191191
;
192192
; CHECK-LABEL: popcount1x128:
193193
; CHECK: // %bb.0: // %Entry
194-
; CHECK-NEXT: fmov d1, x0
195-
; CHECK-NEXT: movi v0.2d, #0000000000000000
196-
; CHECK-NEXT: mov v1.d[1], x1
197-
; CHECK-NEXT: cnt v1.16b, v1.16b
198-
; CHECK-NEXT: uaddlv h1, v1.16b
199-
; CHECK-NEXT: mov v0.s[0], v1.s[0]
194+
; CHECK-NEXT: fmov d0, x0
195+
; CHECK-NEXT: mov v0.d[1], x1
196+
; CHECK-NEXT: cnt v0.16b, v0.16b
197+
; CHECK-NEXT: addv b0, v0.16b
200198
; CHECK-NEXT: mov x1, v0.d[1]
201199
; CHECK-NEXT: fmov x0, d0
202200
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)