-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AArch64] Use i32 extract from UADDV in popcount lowering. #140718
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
We need the top bits to be zeroes, but a v8i8->i32 EXTRACT_VECTOR_ELT will anyext into the top bits. The instruction we create (UADDV) is known to produce zeroes in the upper bits, so we can convert to a larger v2i32 vector and extract from there, similar to the operation currently performed for i64 types.
|
@llvm/pr-subscribers-backend-aarch64 Author: David Green (davemgreen) Changes: We need the top bits to be zeroes, but a v8i8->i32 EXTRACT_VECTOR_ELT will anyext into the top bits. Fixes #140707. Full diff: https://github.com/llvm/llvm-project/pull/140718.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 293292d47dd48..64a422a195437 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10852,13 +10852,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
- if (VT == MVT::i32)
- AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
- DAG.getConstant(0, DL, MVT::i64));
- else
- AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
- DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, AddV),
- DAG.getConstant(0, DL, MVT::i64));
+ AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
+ VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
+ AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
+ DAG.getConstant(0, DL, MVT::i64));
if (IsParity)
AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
return AddV;
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index e664e73594923..61f221988777f 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -648,4 +648,113 @@ Entry:
ret <4 x i16> %1
}
+define i32 @ctpop_into_extract(ptr %p) {
+; CHECKO0-LABEL: ctpop_into_extract:
+; CHECKO0: // %bb.0:
+; CHECKO0-NEXT: mov w8, #-1 // =0xffffffff
+; CHECKO0-NEXT: // implicit-def: $d1
+; CHECKO0-NEXT: // implicit-def: $q0
+; CHECKO0-NEXT: fmov d0, d1
+; CHECKO0-NEXT: mov v0.s[0], w8
+; CHECKO0-NEXT: fmov d2, d0
+; CHECKO0-NEXT: ldr d0, [x0]
+; CHECKO0-NEXT: fmov s1, s0
+; CHECKO0-NEXT: fmov w8, s1
+; CHECKO0-NEXT: fmov s1, w8
+; CHECKO0-NEXT: // kill: def $d1 killed $s1
+; CHECKO0-NEXT: cnt v1.8b, v1.8b
+; CHECKO0-NEXT: uaddlv h1, v1.8b
+; CHECKO0-NEXT: // kill: def $q1 killed $h1
+; CHECKO0-NEXT: // kill: def $s1 killed $s1 killed $q1
+; CHECKO0-NEXT: fmov w8, s1
+; CHECKO0-NEXT: // implicit-def: $q1
+; CHECKO0-NEXT: fmov d1, d2
+; CHECKO0-NEXT: mov v1.s[1], w8
+; CHECKO0-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECKO0-NEXT: sub v0.2s, v0.2s, v1.2s
+; CHECKO0-NEXT: str d0, [x0]
+; CHECKO0-NEXT: mov w0, wzr
+; CHECKO0-NEXT: ret
+;
+; CHECK-LABEL: ctpop_into_extract:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: movi v2.2d, #0xffffffffffffffff
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: cnt v1.8b, v1.8b
+; CHECK-NEXT: addv b1, v1.8b
+; CHECK-NEXT: mov v2.s[1], v1.s[0]
+; CHECK-NEXT: sub v0.2s, v0.2s, v2.2s
+; CHECK-NEXT: str d0, [x8]
+; CHECK-NEXT: ret
+;
+; BE-LABEL: ctpop_into_extract:
+; BE: // %bb.0:
+; BE-NEXT: ld1 { v0.2s }, [x0]
+; BE-NEXT: movi v2.2d, #0xffffffffffffffff
+; BE-NEXT: mov x8, x0
+; BE-NEXT: mov w0, wzr
+; BE-NEXT: fmov w9, s0
+; BE-NEXT: fmov s1, w9
+; BE-NEXT: cnt v1.8b, v1.8b
+; BE-NEXT: addv b1, v1.8b
+; BE-NEXT: mov v2.s[1], v1.s[0]
+; BE-NEXT: sub v0.2s, v0.2s, v2.2s
+; BE-NEXT: st1 { v0.2s }, [x8]
+; BE-NEXT: ret
+;
+; GISEL-LABEL: ctpop_into_extract:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr d0, [x0]
+; GISEL-NEXT: mov w9, #-1 // =0xffffffff
+; GISEL-NEXT: mov x8, x0
+; GISEL-NEXT: mov v2.s[0], w9
+; GISEL-NEXT: mov w0, wzr
+; GISEL-NEXT: fmov w10, s0
+; GISEL-NEXT: fmov s1, w10
+; GISEL-NEXT: cnt v1.8b, v1.8b
+; GISEL-NEXT: uaddlv h1, v1.8b
+; GISEL-NEXT: mov v2.s[1], v1.s[0]
+; GISEL-NEXT: sub v0.2s, v0.2s, v2.2s
+; GISEL-NEXT: str d0, [x8]
+; GISEL-NEXT: ret
+;
+; GISELO0-LABEL: ctpop_into_extract:
+; GISELO0: // %bb.0:
+; GISELO0-NEXT: mov w8, #-1 // =0xffffffff
+; GISELO0-NEXT: // implicit-def: $d1
+; GISELO0-NEXT: // implicit-def: $q0
+; GISELO0-NEXT: fmov d0, d1
+; GISELO0-NEXT: mov v0.s[0], w8
+; GISELO0-NEXT: fmov d2, d0
+; GISELO0-NEXT: ldr d0, [x0]
+; GISELO0-NEXT: fmov s1, s0
+; GISELO0-NEXT: fmov w8, s1
+; GISELO0-NEXT: fmov s1, w8
+; GISELO0-NEXT: // kill: def $d1 killed $s1
+; GISELO0-NEXT: cnt v1.8b, v1.8b
+; GISELO0-NEXT: uaddlv h1, v1.8b
+; GISELO0-NEXT: // kill: def $q1 killed $h1
+; GISELO0-NEXT: // kill: def $s1 killed $s1 killed $q1
+; GISELO0-NEXT: fmov w8, s1
+; GISELO0-NEXT: // implicit-def: $q1
+; GISELO0-NEXT: fmov d1, d2
+; GISELO0-NEXT: mov v1.s[1], w8
+; GISELO0-NEXT: // kill: def $d1 killed $d1 killed $q1
+; GISELO0-NEXT: sub v0.2s, v0.2s, v1.2s
+; GISELO0-NEXT: str d0, [x0]
+; GISELO0-NEXT: mov w0, wzr
+; GISELO0-NEXT: ret
+ %1 = load <2 x i32>, ptr %p, align 4
+ %2 = extractelement <2 x i32> %1, i64 0
+ %3 = call i32 @llvm.ctpop.i32(i32 %2)
+ %4 = insertelement <2 x i32> <i32 -1, i32 poison>, i32 %3, i64 1
+ %5 = sub <2 x i32> %1, %4
+ store <2 x i32> %5, ptr %p, align 4
+ ret i32 0
+}
+
declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>)
|
usha1830
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, Thanks!
|
Thanks |
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/162/builds/22791. Here is the relevant piece of the build log for reference: |
We need the top bits to be zeroes, but a v8i8->i32 EXTRACT_VECTOR_ELT will
anyext into the top bits. The instruction we create (UADDV) is known to produce
zeroes in the upper bits, so we can convert to a larger v2i32 vector and
extract from there, similar to the operation currently performed for i64 types.
Fixes #140707