Skip to content

Commit 71bccbd

Browse files
committed
Replace uaddlv with addv when lowering the CTPOP operation: addv is a simpler
operation and also saves a few cycles on certain platforms such as Cortex-A510.
1 parent 93220e7 commit 71bccbd

File tree

5 files changed

+66
-69
lines changed

5 files changed

+66
-69
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 11 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -10700,37 +10700,30 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
1070010700
// FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
1070110701
// CNT V0.8B, V0.8B // 8xbyte pop-counts
1070210702
// ADDV B0, V0.8B // sum 8xbyte pop-counts
10703-
// UMOV X0, V0.B[0] // copy byte result back to integer reg
10703+
// FMOV X0, D0 // copy result back to integer reg
1070410704
if (VT == MVT::i32 || VT == MVT::i64) {
1070510705
if (VT == MVT::i32)
1070610706
Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
1070710707
Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
1070810708

1070910709
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
10710-
SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
10711-
UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
10710+
SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
10711+
if (VT == MVT::i32)
10712+
AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
1071210713
DAG.getConstant(0, DL, MVT::i64));
10713-
10714+
AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
1071410715
if (IsParity)
10715-
UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
10716-
DAG.getConstant(1, DL, MVT::i32));
10717-
10718-
if (VT == MVT::i64)
10719-
UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
10720-
return UaddLV;
10716+
AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
10717+
return AddV;
1072110718
} else if (VT == MVT::i128) {
1072210719
Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
1072310720

1072410721
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
10725-
SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
10726-
UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
10727-
DAG.getConstant(0, DL, MVT::i64));
10728-
10722+
SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
10723+
AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
1072910724
if (IsParity)
10730-
UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
10731-
DAG.getConstant(1, DL, MVT::i32));
10732-
10733-
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
10725+
AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
10726+
return AddV;
1073410727
}
1073510728

1073610729
assert(!IsParity && "ISD::PARITY of vector types not supported");

llvm/test/CodeGen/AArch64/arm64-popcnt.ll

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
88
; CHECK: // %bb.0:
99
; CHECK-NEXT: fmov s0, w0
1010
; CHECK-NEXT: cnt.8b v0, v0
11-
; CHECK-NEXT: uaddlv.8b h0, v0
11+
; CHECK-NEXT: addv.8b b0, v0
1212
; CHECK-NEXT: fmov w0, s0
1313
; CHECK-NEXT: ret
1414
;
@@ -43,7 +43,7 @@ define i32 @cnt32_advsimd_2(<2 x i32> %x) {
4343
; CHECK-NEXT: fmov w8, s0
4444
; CHECK-NEXT: fmov s0, w8
4545
; CHECK-NEXT: cnt.8b v0, v0
46-
; CHECK-NEXT: uaddlv.8b h0, v0
46+
; CHECK-NEXT: addv.8b b0, v0
4747
; CHECK-NEXT: fmov w0, s0
4848
; CHECK-NEXT: ret
4949
;
@@ -79,8 +79,8 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
7979
; CHECK: // %bb.0:
8080
; CHECK-NEXT: fmov d0, x0
8181
; CHECK-NEXT: cnt.8b v0, v0
82-
; CHECK-NEXT: uaddlv.8b h0, v0
83-
; CHECK-NEXT: fmov w0, s0
82+
; CHECK-NEXT: addv.8b b0, v0
83+
; CHECK-NEXT: fmov x0, d0
8484
; CHECK-NEXT: ret
8585
;
8686
; CHECK-NONEON-LABEL: cnt64_advsimd:

llvm/test/CodeGen/AArch64/dp1.ll

Lines changed: 36 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -197,52 +197,58 @@ define void @cttz_zeroundef_i64() {
197197
}
198198

199199
define void @ctpop_i32() {
200-
; CHECK-LABEL: ctpop_i32:
201-
; CHECK: // %bb.0:
202-
; CHECK-NEXT: adrp x8, :got:var32
203-
; CHECK-NEXT: ldr x8, [x8, :got_lo12:var32]
204-
; CHECK-NEXT: ldr w9, [x8]
205-
; CHECK-NEXT: fmov d0, x9
206-
; CHECK-NEXT: cnt v0.8b, v0.8b
207-
; CHECK-NEXT: uaddlv h0, v0.8b
208-
; CHECK-NEXT: str s0, [x8]
209-
; CHECK-NEXT: ret
200+
; CHECK-SDAG-LABEL: ctpop_i32:
201+
; CHECK-SDAG: // %bb.0:
202+
; CHECK-SDAG-NEXT: adrp x8, :got:var32
203+
; CHECK-SDAG-NEXT: ldr x8, [x8, :got_lo12:var32]
204+
; CHECK-SDAG-NEXT: ldr w9, [x8]
205+
; CHECK-SDAG-NEXT: fmov d0, x9
206+
; CHECK-SDAG-NEXT: cnt v0.8b, v0.8b
207+
; CHECK-SDAG-NEXT: addv b0, v0.8b
208+
; CHECK-SDAG-NEXT: fmov w9, s0
209+
; CHECK-SDAG-NEXT: str w9, [x8]
210+
; CHECK-SDAG-NEXT: ret
211+
;
212+
; CHECK-GISEL-LABEL: ctpop_i32:
213+
; CHECK-GISEL: // %bb.0:
214+
; CHECK-GISEL-NEXT: adrp x8, :got:var32
215+
; CHECK-GISEL-NEXT: ldr x8, [x8, :got_lo12:var32]
216+
; CHECK-GISEL-NEXT: ldr w9, [x8]
217+
; CHECK-GISEL-NEXT: fmov d0, x9
218+
; CHECK-GISEL-NEXT: cnt v0.8b, v0.8b
219+
; CHECK-GISEL-NEXT: uaddlv h0, v0.8b
220+
; CHECK-GISEL-NEXT: str s0, [x8]
221+
; CHECK-GISEL-NEXT: ret
210222
%val0_tmp = load i32, ptr @var32
211223
%val4_tmp = call i32 @llvm.ctpop.i32(i32 %val0_tmp)
212224
store volatile i32 %val4_tmp, ptr @var32
213225
ret void
214226
}
215227

216-
define void @ctpop_i64() {
217-
; CHECK-SDAG-LABEL: ctpop_i64:
228+
define i64 @popcnt(i64 %a, ptr %p) {
229+
; CHECK-SDAG-LABEL: popcnt:
218230
; CHECK-SDAG: // %bb.0:
219-
; CHECK-SDAG-NEXT: adrp x8, :got:var64
220-
; CHECK-SDAG-NEXT: ldr x8, [x8, :got_lo12:var64]
221-
; CHECK-SDAG-NEXT: ldr d0, [x8]
231+
; CHECK-SDAG-NEXT: fmov d0, x0
232+
; CHECK-SDAG-NEXT: mov x0, xzr
222233
; CHECK-SDAG-NEXT: cnt v0.8b, v0.8b
223-
; CHECK-SDAG-NEXT: uaddlv h0, v0.8b
224-
; CHECK-SDAG-NEXT: fmov w9, s0
225-
; CHECK-SDAG-NEXT: str x9, [x8]
234+
; CHECK-SDAG-NEXT: addv b0, v0.8b
235+
; CHECK-SDAG-NEXT: str d0, [x1]
226236
; CHECK-SDAG-NEXT: ret
227237
;
228-
; CHECK-GISEL-LABEL: ctpop_i64:
238+
; CHECK-GISEL-LABEL: popcnt:
229239
; CHECK-GISEL: // %bb.0:
230-
; CHECK-GISEL-NEXT: adrp x8, :got:var64
231-
; CHECK-GISEL-NEXT: ldr x8, [x8, :got_lo12:var64]
232-
; CHECK-GISEL-NEXT: ldr x9, [x8]
233-
; CHECK-GISEL-NEXT: fmov d0, x9
240+
; CHECK-GISEL-NEXT: fmov d0, x0
241+
; CHECK-GISEL-NEXT: mov x0, xzr
234242
; CHECK-GISEL-NEXT: cnt v0.8b, v0.8b
235243
; CHECK-GISEL-NEXT: uaddlv h0, v0.8b
236-
; CHECK-GISEL-NEXT: mov w9, v0.s[0]
237-
; CHECK-GISEL-NEXT: str x9, [x8]
244+
; CHECK-GISEL-NEXT: mov w8, v0.s[0]
245+
; CHECK-GISEL-NEXT: str x8, [x1]
238246
; CHECK-GISEL-NEXT: ret
239-
%val0_tmp = load i64, ptr @var64
240-
%val4_tmp = call i64 @llvm.ctpop.i64(i64 %val0_tmp)
241-
store volatile i64 %val4_tmp, ptr @var64
242-
ret void
247+
%2 = call i64 @llvm.ctpop(i64 %a)
248+
store i64 %2, ptr %p
249+
ret i64 0
243250
}
244251

245-
246252
declare i32 @llvm.bswap.i32(i32)
247253
declare i64 @llvm.bswap.i64(i64)
248254
declare i32 @llvm.ctlz.i32 (i32, i1)

llvm/test/CodeGen/AArch64/parity.ll

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -114,9 +114,9 @@ define i64 @parity_64(i64 %x) {
114114
; CHECK: // %bb.0:
115115
; CHECK-NEXT: fmov d0, x0
116116
; CHECK-NEXT: cnt v0.8b, v0.8b
117-
; CHECK-NEXT: uaddlv h0, v0.8b
118-
; CHECK-NEXT: fmov w8, s0
119-
; CHECK-NEXT: and w0, w8, #0x1
117+
; CHECK-NEXT: addv b0, v0.8b
118+
; CHECK-NEXT: fmov x8, d0
119+
; CHECK-NEXT: and x0, x8, #0x1
120120
; CHECK-NEXT: ret
121121
;
122122
; CHECK-CSSC-LABEL: parity_64:
@@ -136,9 +136,9 @@ define i128 @parity_128(i128 %x) {
136136
; CHECK-NEXT: mov v0.d[1], x1
137137
; CHECK-NEXT: mov x1, xzr
138138
; CHECK-NEXT: cnt v0.16b, v0.16b
139-
; CHECK-NEXT: uaddlv h0, v0.16b
140-
; CHECK-NEXT: fmov w8, s0
141-
; CHECK-NEXT: and w0, w8, #0x1
139+
; CHECK-NEXT: addv b0, v0.16b
140+
; CHECK-NEXT: fmov x8, d0
141+
; CHECK-NEXT: and x0, x8, #0x1
142142
; CHECK-NEXT: ret
143143
;
144144
; CHECK-CSSC-LABEL: parity_128:
@@ -158,8 +158,8 @@ define i32 @parity_64_trunc(i64 %x) {
158158
; CHECK: // %bb.0:
159159
; CHECK-NEXT: fmov d0, x0
160160
; CHECK-NEXT: cnt v0.8b, v0.8b
161-
; CHECK-NEXT: uaddlv h0, v0.8b
162-
; CHECK-NEXT: fmov w8, s0
161+
; CHECK-NEXT: addv b0, v0.8b
162+
; CHECK-NEXT: fmov x8, d0
163163
; CHECK-NEXT: and w0, w8, #0x1
164164
; CHECK-NEXT: ret
165165
;

llvm/test/CodeGen/AArch64/popcount.ll

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) {
2828
; CHECK-NEXT: add x8, x0, #8
2929
; CHECK-NEXT: ld1 { v0.d }[1], [x8]
3030
; CHECK-NEXT: cnt v0.16b, v0.16b
31-
; CHECK-NEXT: uaddlv h0, v0.16b
31+
; CHECK-NEXT: addv b0, v0.16b
3232
; CHECK-NEXT: fmov w0, s0
3333
; CHECK-NEXT: ret
3434
;
@@ -104,8 +104,8 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
104104
; CHECK-NEXT: ld1 { v1.d }[1], [x8]
105105
; CHECK-NEXT: cnt v0.16b, v0.16b
106106
; CHECK-NEXT: cnt v1.16b, v1.16b
107-
; CHECK-NEXT: uaddlv h0, v0.16b
108-
; CHECK-NEXT: uaddlv h1, v1.16b
107+
; CHECK-NEXT: addv b0, v0.16b
108+
; CHECK-NEXT: addv b1, v1.16b
109109
; CHECK-NEXT: fmov w8, s0
110110
; CHECK-NEXT: fmov w9, s1
111111
; CHECK-NEXT: add w0, w9, w8
@@ -191,12 +191,10 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
191191
;
192192
; CHECK-LABEL: popcount1x128:
193193
; CHECK: // %bb.0: // %Entry
194-
; CHECK-NEXT: fmov d1, x0
195-
; CHECK-NEXT: movi v0.2d, #0000000000000000
196-
; CHECK-NEXT: mov v1.d[1], x1
197-
; CHECK-NEXT: cnt v1.16b, v1.16b
198-
; CHECK-NEXT: uaddlv h1, v1.16b
199-
; CHECK-NEXT: mov v0.s[0], v1.s[0]
194+
; CHECK-NEXT: fmov d0, x0
195+
; CHECK-NEXT: mov v0.d[1], x1
196+
; CHECK-NEXT: cnt v0.16b, v0.16b
197+
; CHECK-NEXT: addv b0, v0.16b
200198
; CHECK-NEXT: mov x1, v0.d[1]
201199
; CHECK-NEXT: fmov x0, d0
202200
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)