[DAG] Fold trunc(abdu(x,y)) and trunc(abds(x,y)) if they have sufficient leading zero/sign bits #151471
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: woruyu (woruyu)

Changes

Summary: This PR resolves #147683. It folds trunc(abdu(x, y)) and trunc(abds(x, y)) to abdu/abds on the truncated type when the wide result is known to have enough leading zero bits (unsigned) or sign bits (signed) that the truncation is lossless, letting the absolute difference be computed directly in the narrow type. An annotated IR sketch of the unsigned case follows the diff.

Full diff: https://github.com/llvm/llvm-project/pull/151471.diff

3 Files Affected:
- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
- llvm/test/CodeGen/AArch64/abd-combine.ll
- llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a43020ee62281..6aa2997b7b823 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -607,6 +607,7 @@ namespace {
SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
const SDLoc &DL);
SDValue foldSubToUSubSat(EVT DstVT, SDNode *N, const SDLoc &DL);
+ SDValue foldAbdToNarrowType(EVT VT, SDNode *N, const SDLoc &DL);
SDValue foldABSToABD(SDNode *N, const SDLoc &DL);
SDValue foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True,
SDValue False, ISD::CondCode CC, const SDLoc &DL);
@@ -3925,6 +3926,38 @@ SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N, const SDLoc &DL) {
return SDValue();
}
+// trunc(ABDU/S(A, B)) -> ABDU/S(trunc(A), trunc(B))
+SDValue DAGCombiner::foldAbdToNarrowType(EVT VT, SDNode *N, const SDLoc &DL) {
+ SDValue Op = N->getOperand(0);
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode != ISD::ABDU && Opcode != ISD::ABDS)
+ return SDValue();
+
+ EVT SrcVT = Op.getValueType();
+ EVT TruncVT = N->getValueType(0);
+ unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
+ unsigned NumTruncBits = TruncVT.getScalarSizeInBits();
+ unsigned NeededBits = NumSrcBits - NumTruncBits;
+
+ bool CanFold = false;
+
+ if (Opcode == ISD::ABDU) {
+ KnownBits Known = DAG.computeKnownBits(Op);
+ CanFold = Known.countMinLeadingZeros() >= NeededBits;
+ } else {
+ unsigned SignBits = DAG.ComputeNumSignBits(Op);
+ CanFold = SignBits >= NeededBits;
+ }
+
+ if (CanFold) {
+ SDValue NewOp0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Op.getOperand(0));
+ SDValue NewOp1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Op.getOperand(1));
+ return DAG.getNode(Opcode, DL, TruncVT, NewOp0, NewOp1);
+ }
+
+ return SDValue();
+}
+
// Refinement of DAG/Type Legalisation (promotion) when CTLZ is used for
// counting leading ones. Broadly, it replaces the subtraction with a left
// shift.
@@ -16275,6 +16308,10 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
return NewVSel;
+ // fold trunc(ABDU/S(A, B)) -> ABDU/S(trunc(A), trunc(B))
+ if (SDValue V = foldAbdToNarrowType(VT, N, SDLoc(N)))
+ return V;
+
// Narrow a suitable binary operation with a non-opaque constant operand by
// moving it ahead of the truncate. This is limited to pre-legalization
// because targets may prefer a wider type during later combines and invert
diff --git a/llvm/test/CodeGen/AArch64/abd-combine.ll b/llvm/test/CodeGen/AArch64/abd-combine.ll
index d0257890d2c43..843a459beecf8 100644
--- a/llvm/test/CodeGen/AArch64/abd-combine.ll
+++ b/llvm/test/CodeGen/AArch64/abd-combine.ll
@@ -17,12 +17,9 @@ define <8 x i16> @abdu_base(<8 x i16> %src1, <8 x i16> %src2) {
define <8 x i16> @abdu_const(<8 x i16> %src1) {
; CHECK-LABEL: abdu_const:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ushll v2.4s, v0.4h, #0
-; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: uabd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: uabd v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: movi v1.4h, #1
+; CHECK-NEXT: mov v1.d[1], v1.d[0]
+; CHECK-NEXT: sabd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
%sub = sub <8 x i32> %zextsrc1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -34,12 +31,9 @@ define <8 x i16> @abdu_const(<8 x i16> %src1) {
define <8 x i16> @abdu_const_lhs(<8 x i16> %src1) {
; CHECK-LABEL: abdu_const_lhs:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: ushll v2.4s, v0.4h, #0
-; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: uabd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: uabd v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: movi v1.4h, #1
+; CHECK-NEXT: mov v1.d[1], v1.d[0]
+; CHECK-NEXT: sabd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
%sub = sub <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %zextsrc1
@@ -51,6 +45,10 @@ define <8 x i16> @abdu_const_lhs(<8 x i16> %src1) {
define <8 x i16> @abdu_const_zero(<8 x i16> %src1) {
; CHECK-LABEL: abdu_const_zero:
; CHECK: // %bb.0:
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: abs v0.4h, v0.4h
+; CHECK-NEXT: abs v1.4h, v1.4h
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
%zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
%sub = sub <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, %zextsrc1
@@ -318,12 +316,9 @@ define <8 x i16> @abds_base(<8 x i16> %src1, <8 x i16> %src2) {
define <8 x i16> @abds_const(<8 x i16> %src1) {
; CHECK-LABEL: abds_const:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: sshll v2.4s, v0.4h, #0
-; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: sabd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: sabd v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: movi v1.4h, #1
+; CHECK-NEXT: mov v1.d[1], v1.d[0]
+; CHECK-NEXT: sabd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
%sub = sub <8 x i32> %zextsrc1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -335,12 +330,9 @@ define <8 x i16> @abds_const(<8 x i16> %src1) {
define <8 x i16> @abds_const_lhs(<8 x i16> %src1) {
; CHECK-LABEL: abds_const_lhs:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: sshll v2.4s, v0.4h, #0
-; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: sabd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: sabd v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: movi v1.4h, #1
+; CHECK-NEXT: mov v1.d[1], v1.d[0]
+; CHECK-NEXT: sabd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
%sub = sub <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %zextsrc1
@@ -352,11 +344,10 @@ define <8 x i16> @abds_const_lhs(<8 x i16> %src1) {
define <8 x i16> @abds_const_zero(<8 x i16> %src1) {
; CHECK-LABEL: abds_const_zero:
; CHECK: // %bb.0:
-; CHECK-NEXT: sshll v1.4s, v0.4h, #0
-; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0
-; CHECK-NEXT: abs v0.4s, v0.4s
-; CHECK-NEXT: abs v1.4s, v1.4s
-; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: abs v0.4h, v0.4h
+; CHECK-NEXT: abs v1.4h, v1.4h
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
%zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
%sub = sub <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, %zextsrc1
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll b/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
index 6c7ddd916abdf..ccd1917ae3d85 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
@@ -575,3 +575,69 @@ define <4 x i32> @knownbits_sabd_and_mul_mask(<4 x i32> %a0, <4 x i32> %a1) {
%6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 3, i32 3>
ret <4 x i32> %6
}
+
+define <4 x i16> @trunc_abdu_foldable(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-SD-LABEL: trunc_abdu_foldable:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: uabd v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: trunc_abdu_foldable:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: uabd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: ret
+ %ext_a = zext <4 x i16> %a to <4 x i32>
+ %ext_b = zext <4 x i16> %b to <4 x i32>
+ %abd = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %ext_a, <4 x i32> %ext_b)
+ %trunc = trunc <4 x i32> %abd to <4 x i16>
+ ret <4 x i16> %trunc
+}
+
+define <4 x i16> @trunc_abds_foldable(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-SD-LABEL: trunc_abds_foldable:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sabd v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: trunc_abds_foldable:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: sabd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: ret
+ %a32 = sext <4 x i16> %a to <4 x i32>
+ %b32 = sext <4 x i16> %b to <4 x i32>
+ %abd32 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %a32, <4 x i32> %b32)
+ %res16 = trunc <4 x i32> %abd32 to <4 x i16>
+ ret <4 x i16> %res16
+}
+
+define <4 x i16> @trunc_abdu_not_foldable(<4 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: trunc_abdu_not_foldable:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: uabd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %ext_a = zext <4 x i16> %a to <4 x i32>
+ %abd = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %ext_a, <4 x i32> %b)
+ %trunc = trunc <4 x i32> %abd to <4 x i16>
+ ret <4 x i16> %trunc
+}
+
+define <4 x i16> @truncate_abds_testcase1(<4 x i16> %a, <4 x i32> %b) {
+; CHECK-LABEL: truncate_abds_testcase1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: sabd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %a32 = sext <4 x i16> %a to <4 x i32>
+ %abd32 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %a32, <4 x i32> %b)
+ %res16 = trunc <4 x i32> %abd32 to <4 x i16>
+ ret <4 x i16> %res16
+}
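
To make the fold concrete, here is an annotated sketch of the unsigned case, distilled from the trunc_abdu_foldable test in the diff above (the function name is illustrative); the signed case is analogous, with sext and ComputeNumSignBits in place of zext and known leading zeros:

define <4 x i16> @trunc_abdu_sketch(<4 x i16> %a, <4 x i16> %b) {
  ; Both operands are zero-extended, so computeKnownBits proves the wide
  ; uabd result has at least 16 leading zeros and the trunc below is lossless.
  %ext_a = zext <4 x i16> %a to <4 x i32>
  %ext_b = zext <4 x i16> %b to <4 x i32>
  %abd = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %ext_a, <4 x i32> %ext_b)
  ; With this patch, SelectionDAG emits a single "uabd v0.4h, v0.4h, v1.4h"
  ; here, rather than the ushll/uabd.4s/xtn sequence still seen on the
  ; GlobalISel path in the CHECK-GI lines.
  %trunc = trunc <4 x i32> %abd to <4 x i16>
  ret <4 x i16> %trunc
}
declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>)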
@llvm/pr-subscribers-llvm-selectiondag (same author, summary, and diff as above)
✅ With the latest revision this PR passed the C/C++ code formatter.
RKSimon left a comment:
It's very likely that this patch and #152273 can share the same code inside visitTRUNCATE.
RKSimon left a comment:
LGTM