Implement hasOrNot #163995
Open: AZero13 wants to merge 1 commit into llvm:main from AZero13:subs-dead
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-arm @llvm/pr-subscribers-backend-aarch64

Author: AZero13 (AZero13)

Full diff: https://github.com/llvm/llvm-project/pull/163995.diff

7 Files Affected:
- llvm/include/llvm/CodeGen/TargetLowering.h
- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
- llvm/lib/Target/AArch64/AArch64ISelLowering.h
- llvm/lib/Target/ARM/ARMISelLowering.cpp
- llvm/lib/Target/ARM/ARMISelLowering.h
- llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll (new)
- llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll (new)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 73f2c55a71125..322c7b9068255 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -827,6 +827,11 @@ class LLVM_ABI TargetLoweringBase {
return hasAndNotCompare(X);
}
+ /// Return true if the target has a bitwise or-not operation: X = Y | (~Z).
+ virtual bool hasOrNot(SDValue X) const {
+ return false;
+ }
+
/// Return true if the target has a bit-test instruction:
/// (X & (1 << Y)) ==/!= 0
/// This knowledge can be used to prevent breaking the pattern,
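For context, the hook advertises a fused "or with complement", x | ~y, which is a single instruction where it exists (scalar orn on AArch64 and Thumb2, vector orn/vorn on NEON and MVE, as the later hunks show). A minimal stand-alone model of the operation, outside LLVM and assuming plain 32-bit unsigned arithmetic:

    // C++ model of the or-not operation the hook describes; on targets where
    // hasOrNot() returns true this maps to a single instruction.
    #include <cassert>
    #include <cstdint>

    uint32_t orNot(uint32_t x, uint32_t y) { return x | ~y; }

    int main() { assert(orNot(0x0F0F0F0Fu, 0x00FF00FFu) == 0xFF0FFF0Fu); }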
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c97300d64d455..737471bc9d9e1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12257,9 +12257,16 @@ static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::AND, DL, VT, Not, DAG.getFreeze(N2));
}
- // TODO: There's another pattern in this family, but it may require
- // implementing hasOrNot() to check for profitability:
- // (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
+ // If we have to invert the sign bit mask, only do that transform if the
+ // target has a bitwise 'or not' operation (the invert is free).
+ // (Cond0 s> -1) ? -1 : N2, normalized by the operand swap above to:
+ // (Cond0 s< 0) ? N1 : -1 --> ~(Cond0 s>> BW-1) | freeze(N1)
+ if (isAllOnesOrAllOnesSplat(N2) && TLI.hasOrNot(N2)) {
+ SDLoc DL(N);
+ SDValue ShiftAmt = DAG.getShiftAmountConstant(EltSizeInBits - 1, VT, DL);
+ SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
+ SDValue Not = DAG.getNOT(DL, Sra, VT);
+ return DAG.getNode(ISD::OR, DL, VT, Not, DAG.getFreeze(N1));
+ }
return SDValue();
}
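To see why the rewrite is sound: when Cond0 is negative, Cond0 s>> BW-1 is all-ones, its inverse is zero, and the OR passes the other operand through; when Cond0 is non-negative, the inverse is all-ones and the result is -1. A scalar cross-check of both sides, assuming 32-bit ints with arithmetic right shift (implementation-defined before C++20, universal in practice):

    // Not LLVM code: checks the select form against the sign-splat mask form.
    #include <cassert>
    #include <cstdint>

    int32_t selectForm(int32_t x, int32_t y) { return x > -1 ? -1 : y; }
    int32_t maskForm(int32_t x, int32_t y) { return ~(x >> 31) | y; }

    int main() {
      for (int32_t x : {INT32_MIN, -5, -1, 0, 1, INT32_MAX})
        for (int32_t y : {-7, 0, 42})
          assert(selectForm(x, y) == maskForm(x, y));
    }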
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 00956fdc8e48e..3488f361f37e5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -414,6 +414,18 @@ class AArch64TargetLowering : public TargetLowering {
return VT.getFixedSizeInBits() >= 64; // vector 'bic'
}
+ bool hasOrNot(SDValue X) const override {
+ EVT VT = X.getValueType();
+
+ if (!VT.isVector())
+ return VT == MVT::i32 || VT == MVT::i64; // scalar 'orn'
+
+ if (VT.isScalableVector())
+ return true;
+
+ return VT.getFixedSizeInBits() >= 64; // vector 'orn'
+ }
+
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
unsigned OldShiftOpcode, unsigned NewShiftOpcode,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 67ea2dd3df792..1092d0117a58a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -17788,6 +17788,16 @@ static SDValue PerformShiftCombine(SDNode *N,
return SDValue();
}
+bool ARMTargetLowering::hasOrNot(SDValue Y) const {
+  // Thumb2 has a scalar 'orn'; NEON and MVE have a vector 'vorn'.
+ EVT VT = Y.getValueType();
+ if (!VT.isVector())
+ return Subtarget->isThumb2() && VT == MVT::i32; // scalar 'orn'
+ if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps())
+ return VT.getFixedSizeInBits() >= 64; // vector 'orn'
+ return false;
+}
+
// Look for a sign/zero/fpextend extend of a larger than legal load. This can be
// split into multiple extending loads, which are simpler to deal with than an
// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 70aa001a41885..993b63f21c07a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -614,6 +614,8 @@ class VectorType;
return V.getValueType().isScalarInteger();
}
+ bool hasOrNot(SDValue Y) const override;
+
bool
isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
diff --git a/llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll b/llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll
new file mode 100644
index 0000000000000..c60d140cd6049
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+; Test for the optimization: (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
+; AArch64 implements hasOrNot, so the combine fires; later folds may simplify
+; the result further (e.g. cmge + orr in the vector cases below)
+
+define <4 x i32> @vselect_signbit_orn_scalar(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v0.4s, v0.4s, #0
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %y
+ ret <4 x i32> %sel
+}
+
+define <2 x i64> @vselect_signbit_orn_scalar64(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v0.2d, v0.2d, #0
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %cmp = icmp sgt <2 x i64> %x, <i64 -1, i64 -1>
+ %sel = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %y
+ ret <2 x i64> %sel
+}
+
+define <4 x i32> @vselect_signbit_orn_vector(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_orn_vector:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v0.4s, v0.4s, #0
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %y
+ ret <4 x i32> %sel
+}
+
+define <2 x i64> @vselect_signbit_orn_vector64(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: vselect_signbit_orn_vector64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v0.2d, v0.2d, #0
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %cmp = icmp sgt <2 x i64> %x, <i64 -1, i64 -1>
+ %sel = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %y
+ ret <2 x i64> %sel
+}
+
+; Test with different constant values for N2
+define <4 x i32> @vselect_signbit_orn_scalar_const(<4 x i32> %x) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_const:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v0.4s, v0.4s, #0
+; CHECK-NEXT: orr v0.4s, #42
+; CHECK-NEXT: ret
+ %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
+ ret <4 x i32> %sel
+}
+
+; Test the inverse pattern to ensure it doesn't get optimized (should use different instruction)
+define <4 x i32> @vselect_signbit_not_orn(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_not_orn:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %sel = select <4 x i1> %cmp, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> %y
+ ret <4 x i32> %sel
+}
+
+define <4 x i32> @test_orn_instruction_direct(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_orn_instruction_direct:
+; CHECK: // %bb.0:
+; CHECK-NEXT: orn v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %not_y = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %result = or <4 x i32> %x, %not_y
+ ret <4 x i32> %result
+}
+
+; Scalar versions of the same tests
+define i32 @vselect_signbit_orn_scalar_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmn w0, #1
+; CHECK-NEXT: csinv w0, w1, wzr, le
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i32 %x, -1
+ %sel = select i1 %cmp, i32 -1, i32 %y
+ ret i32 %sel
+}
+
+define i64 @vselect_signbit_orn_scalar_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmn x0, #1
+; CHECK-NEXT: csinv x0, x1, xzr, le
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i64 %x, -1
+ %sel = select i1 %cmp, i64 -1, i64 %y
+ ret i64 %sel
+}
+
+define i32 @test_orn_instruction_scalar_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: test_orn_instruction_scalar_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: orn w0, w0, w1
+; CHECK-NEXT: ret
+ %not_y = xor i32 %y, -1
+ %result = or i32 %x, %not_y
+ ret i32 %result
+}
+
+define i64 @test_orn_instruction_scalar_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: test_orn_instruction_scalar_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: orn x0, x0, x1
+; CHECK-NEXT: ret
+ %not_y = xor i64 %y, -1
+ %result = or i64 %x, %not_y
+ ret i64 %result
+}
+
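If a later change shifts the AArch64 codegen, the CHECK lines can be regenerated with the script named in the NOTE line above; a typical invocation, with an illustrative local build path for llc:

    llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
        llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll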
diff --git a/llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll b/llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll
new file mode 100644
index 0000000000000..1a0d63745dfa9
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=armv7-unknown-unknown | FileCheck %s
+
+; Test for the optimization: (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
+; The DAGCombiner optimization transforms the select into the expected pattern,
+; but further combines fold it into a more compact sequence (vshr + vorn below)
+
+define <4 x i32> @vselect_signbit_orn_vector(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_orn_vector:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT: vshr.s32 q8, q8, #31
+; CHECK-NEXT: vorn q8, q9, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: bx lr
+ %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %y
+ ret <4 x i32> %sel
+}
+
+define <2 x i64> @vselect_signbit_orn_vector64(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: vselect_signbit_orn_vector64:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT: vshr.s64 q8, q8, #63
+; CHECK-NEXT: vorn q8, q9, q8
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: bx lr
+ %cmp = icmp sgt <2 x i64> %x, <i64 -1, i64 -1>
+ %sel = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %y
+ ret <2 x i64> %sel
+}
+
+; Test with different constant values for N2
+define <4 x i32> @vselect_signbit_orn_const(<4 x i32> %x) {
+; CHECK-LABEL: vselect_signbit_orn_const:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: vshr.s32 q8, q8, #31
+; CHECK-NEXT: vmvn q8, q8
+; CHECK-NEXT: vorr.i32 q8, #0x2a
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: bx lr
+ %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
+ ret <4 x i32> %sel
+}
+
+; Test the inverse pattern to ensure it doesn't get optimized (should use different instruction)
+define <4 x i32> @vselect_signbit_not_orn(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_not_orn:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT: vshr.s32 q8, q8, #31
+; CHECK-NEXT: vand q8, q8, q9
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: bx lr
+ %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %sel = select <4 x i1> %cmp, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> %y
+ ret <4 x i32> %sel
+}
+
+; Test to demonstrate that orn instruction is available when the pattern matches directly
+define <4 x i32> @test_orn_instruction_direct(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_orn_instruction_direct:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vmov d17, r2, r3
+; CHECK-NEXT: vmov d16, r0, r1
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT: vorn q8, q8, q9
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: bx lr
+ %not_y = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %result = or <4 x i32> %x, %not_y
+ ret <4 x i32> %result
+}
+
+; Scalar versions of the same tests
+define i32 @vselect_signbit_orn_scalar_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: cmn r0, #1
+; CHECK-NEXT: mvngt r1, #0
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
+ %cmp = icmp sgt i32 %x, -1
+ %sel = select i1 %cmp, i32 -1, i32 %y
+ ret i32 %sel
+}
+
+define i64 @vselect_signbit_orn_scalar_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_i64:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: cmn r1, #1
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: mvngt r3, #0
+; CHECK-NEXT: mvngt r0, #0
+; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: bx lr
+ %cmp = icmp sgt i64 %x, -1
+ %sel = select i1 %cmp, i64 -1, i64 %y
+ ret i64 %sel
+}
+
+define i32 @test_orn_instruction_scalar_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: test_orn_instruction_scalar_i32:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mvn r1, r1
+; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: bx lr
+ %not_y = xor i32 %y, -1
+ %result = or i32 %x, %not_y
+ ret i32 %result
+}
+
+define i64 @test_orn_instruction_scalar_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: test_orn_instruction_scalar_i64:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mvn r2, r2
+; CHECK-NEXT: orr r0, r0, r2
+; CHECK-NEXT: mvn r2, r3
+; CHECK-NEXT: orr r1, r1, r2
+; CHECK-NEXT: bx lr
+ %not_y = xor i64 %y, -1
+ %result = or i64 %x, %not_y
+ ret i64 %result
+}
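The RUN line above targets armv7 in ARM mode, where a scalar orn does not exist, so the last two functions lower to mvn plus orr. Under a Thumb2 triple the same IR should fold to a single orn, matching the scalar case in the new hasOrNot; a hypothetical local check, not part of the patch:

    llc -mtriple=thumbv7-unknown-unknown < llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll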
✅ With the latest revision this PR passed the C/C++ code formatter.