diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 73f2c55a71125..1feb749b7b627 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -827,6 +827,9 @@ class LLVM_ABI TargetLoweringBase {
     return hasAndNotCompare(X);
   }
 
+  /// Return true if the target has a bitwise or-not operation.
+  virtual bool hasOrNot(SDValue X) const { return false; }
+
   /// Return true if the target has a bit-test instruction:
   ///   (X & (1 << Y)) ==/!= 0
   /// This knowledge can be used to prevent breaking the pattern,
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c97300d64d455..7f141b66926b8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12257,9 +12257,16 @@ static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) {
     return DAG.getNode(ISD::AND, DL, VT, Not, DAG.getFreeze(N2));
   }
 
-  // TODO: There's another pattern in this family, but it may require
-  //       implementing hasOrNot() to check for profitability:
-  //       (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
+  // If we have to invert the sign bit mask and OR with -1, only do that
+  // transform if the target has a bitwise 'or not' instruction (the invert is
+  // free). (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
+  if (isAllOnesOrAllOnesSplat(N2) && TLI.hasOrNot(N2)) {
+    SDLoc DL(N);
+    SDValue ShiftAmt = DAG.getShiftAmountConstant(EltSizeInBits - 1, VT, DL);
+    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
+    SDValue Not = DAG.getNOT(DL, Sra, VT);
+    return DAG.getNode(ISD::OR, DL, VT, Not, DAG.getFreeze(N1));
+  }
 
   return SDValue();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 00956fdc8e48e..3488f361f37e5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -414,6 +414,18 @@ class AArch64TargetLowering : public TargetLowering {
     return VT.getFixedSizeInBits() >= 64; // vector 'bic'
   }
 
+  bool hasOrNot(SDValue X) const override {
+    EVT VT = X.getValueType();
+
+    if (!VT.isVector())
+      return VT == MVT::i32 || VT == MVT::i64; // scalar 'orn'
+
+    if (VT.isScalableVector())
+      return true;
+
+    return VT.getFixedSizeInBits() >= 64; // vector 'orn'
+  }
+
   bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
       SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
       unsigned OldShiftOpcode, unsigned NewShiftOpcode,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 67ea2dd3df792..1092d0117a58a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -17788,6 +17788,16 @@ static SDValue PerformShiftCombine(SDNode *N,
   return SDValue();
 }
 
+bool ARMTargetLowering::hasOrNot(SDValue Y) const {
+  // Scalar 'orn' is only available in Thumb2.
+  EVT VT = Y.getValueType();
+  if (!VT.isVector())
+    return Subtarget->isThumb2() && VT == MVT::i32; // scalar 'orn'
+  if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps())
+    return VT.getFixedSizeInBits() >= 64; // vector 'orn'
+  return false;
+}
+
 // Look for a sign/zero/fpextend extend of a larger than legal load. This can be
 // split into multiple extending loads, which are simpler to deal with than an
 // arbitrary extend. For fp extends we use an integer extending load and a VCVTL
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 70aa001a41885..993b63f21c07a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -614,6 +614,8 @@ class VectorType;
       return V.getValueType().isScalarInteger();
     }
 
+    bool hasOrNot(SDValue Y) const override;
+
     bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
 
     bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
diff --git a/llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll b/llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll
new file mode 100644
index 0000000000000..c60d140cd6049
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+; Test for the optimization: (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
+; On AArch64 the vector cases are folded further and end up as cmge + orr rather than orn.
+
+define <4 x i32> @vselect_signbit_orn_scalar(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %y
+  ret <4 x i32> %sel
+}
+
+define <2 x i64> @vselect_signbit_orn_scalar64(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.2d, v0.2d, #0
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <2 x i64> %x, <i64 -1, i64 -1>
+  %sel = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %y
+  ret <2 x i64> %sel
+}
+
+define <4 x i32> @vselect_signbit_orn_vector(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_orn_vector:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %y
+  ret <4 x i32> %sel
+}
+
+define <2 x i64> @vselect_signbit_orn_vector64(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: vselect_signbit_orn_vector64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.2d, v0.2d, #0
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <2 x i64> %x, <i64 -1, i64 -1>
+  %sel = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %y
+  ret <2 x i64> %sel
+}
+
+; Test with different constant values for N2
+define <4 x i32> @vselect_signbit_orn_scalar_const(<4 x i32> %x) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_const:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
+; CHECK-NEXT:    orr v0.4s, #42
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
+  ret <4 x i32> %sel
+}
+
+; Test the inverse pattern to ensure it doesn't get optimized (should use different instruction)
+define <4 x i32> @vselect_signbit_not_orn(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_not_orn:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %sel = select <4 x i1> %cmp, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> %y
+  ret <4 x i32> %sel
+}
+
+define <4 x i32> @test_orn_instruction_direct(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_orn_instruction_direct:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orn v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %not_y = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %result = or <4 x i32> %x, %not_y
+  ret <4 x i32> %result
+}
+
+; Scalar versions of the same tests
+define i32 @vselect_signbit_orn_scalar_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmn w0, #1
+; CHECK-NEXT:    csinv w0, w1, wzr, le
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt i32 %x, -1
+  %sel = select i1 %cmp, i32 -1, i32 %y
+  ret i32 %sel
+}
+
+define i64 @vselect_signbit_orn_scalar_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmn x0, #1
+; CHECK-NEXT:    csinv x0, x1, xzr, le
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt i64 %x, -1
+  %sel = select i1 %cmp, i64 -1, i64 %y
+  ret i64 %sel
+}
+
+define i32 @test_orn_instruction_scalar_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: test_orn_instruction_scalar_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orn w0, w0, w1
+; CHECK-NEXT:    ret
+  %not_y = xor i32 %y, -1
+  %result = or i32 %x, %not_y
+  ret i32 %result
+}
+
+define i64 @test_orn_instruction_scalar_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: test_orn_instruction_scalar_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orn x0, x0, x1
+; CHECK-NEXT:    ret
+  %not_y = xor i64 %y, -1
+  %result = or i64 %x, %not_y
+  ret i64 %result
+}
+
diff --git a/llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll b/llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll
new file mode 100644
index 0000000000000..1a0d63745dfa9
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=armv7-unknown-unknown | FileCheck %s
+
+; Test for the optimization: (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
+; The DAGCombiner optimization transforms the select into the expected pattern,
+; but further optimizations may convert it to a more efficient sequence.
+
+define <4 x i32> @vselect_signbit_orn_vector(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_orn_vector:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vshr.s32 q8, q8, #31
+; CHECK-NEXT:    vorn q8, q9, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
+  %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %y
+  ret <4 x i32> %sel
+}
+
+define <2 x i64> @vselect_signbit_orn_vector64(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: vselect_signbit_orn_vector64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vshr.s64 q8, q8, #63
+; CHECK-NEXT:    vorn q8, q9, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
+  %cmp = icmp sgt <2 x i64> %x, <i64 -1, i64 -1>
+  %sel = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %y
+  ret <2 x i64> %sel
+}
+
+; Test with different constant values for N2
+define <4 x i32> @vselect_signbit_orn_const(<4 x i32> %x) {
+; CHECK-LABEL: vselect_signbit_orn_const:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vshr.s32 q8, q8, #31
+; CHECK-NEXT:    vmvn q8, q8
+; CHECK-NEXT:    vorr.i32 q8, #0x2a
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
+  %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
+  ret <4 x i32> %sel
+}
+
+; Test the inverse pattern to ensure it doesn't get optimized (should use different instruction)
+define <4 x i32> @vselect_signbit_not_orn(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_not_orn:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vshr.s32 q8, q8, #31
+; CHECK-NEXT:    vand q8, q8, q9
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
+  %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %sel = select <4 x i1> %cmp, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> %y
+  ret <4 x i32> %sel
+}
+
+; Test to demonstrate that orn instruction is available when the pattern matches directly
+define <4 x i32> @test_orn_instruction_direct(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_orn_instruction_direct:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vorn q8, q8, q9
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
+  %not_y = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %result = or <4 x i32> %x, %not_y
+  ret <4 x i32> %result
+}
+
+; Scalar versions of the same tests
+define i32 @vselect_signbit_orn_scalar_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmn r0, #1
+; CHECK-NEXT:    mvngt r1, #0
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    bx lr
+  %cmp = icmp sgt i32 %x, -1
+  %sel = select i1 %cmp, i32 -1, i32 %y
+  ret i32 %sel
+}
+
+define i64 @vselect_signbit_orn_scalar_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_i64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmn r1, #1
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    mvngt r3, #0
+; CHECK-NEXT:    mvngt r0, #0
+; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    bx lr
+  %cmp = icmp sgt i64 %x, -1
+  %sel = select i1 %cmp, i64 -1, i64 %y
+  ret i64 %sel
+}
+
+define i32 @test_orn_instruction_scalar_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: test_orn_instruction_scalar_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    mvn r1, r1
+; CHECK-NEXT:    orr r0, r0, r1
+; CHECK-NEXT:    bx lr
+  %not_y = xor i32 %y, -1
+  %result = or i32 %x, %not_y
+  ret i32 %result
+}
+
+define i64 @test_orn_instruction_scalar_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: test_orn_instruction_scalar_i64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    mvn r2, r2
+; CHECK-NEXT:    orr r0, r0, r2
+; CHECK-NEXT:    mvn r2, r3
+; CHECK-NEXT:    orr r1, r1, r2
+; CHECK-NEXT:    bx lr
+  %not_y = xor i64 %y, -1
+  %result = or i64 %x, %not_y
+  ret i64 %result
+}
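
Illustrative note (not part of the patch): the minimal C++ sketch below shows how an out-of-tree target could opt into the new DAGCombiner fold by overriding the hasOrNot() hook added in TargetLowering.h above. The class name and the type thresholds are hypothetical placeholders that loosely mirror the AArch64 override; this is a sketch under the assumption that hasOrNot() lands as declared, not a definitive implementation.

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"

namespace {
// Hypothetical target lowering class (placeholder name); only the hasOrNot()
// override is relevant to this patch.
class MyTargetLowering : public llvm::TargetLowering {
public:
  explicit MyTargetLowering(const llvm::TargetMachine &TM)
      : llvm::TargetLowering(TM) {}

  // Report a free or-not for 32/64-bit scalars and 128-bit fixed-width
  // vectors, so foldVSelectToSignBitSplatMask() is allowed to form
  // ~(Cond0 s>> BW-1) | freeze(...) for those types.
  bool hasOrNot(llvm::SDValue Y) const override {
    llvm::EVT VT = Y.getValueType();
    if (!VT.isVector())
      return VT == llvm::MVT::i32 || VT == llvm::MVT::i64;
    return VT.isFixedLengthVector() && VT.getFixedSizeInBits() == 128;
  }
};
} // namespace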