3 changes: 3 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -827,6 +827,9 @@ class LLVM_ABI TargetLoweringBase {
return hasAndNotCompare(X);
}

/// Return true if the target has a bitwise or-not operation:
/// X = A | (~B)
virtual bool hasOrNot(SDValue X) const { return false; }

/// Return true if the target has a bit-test instruction:
/// (X & (1 << Y)) ==/!= 0
/// This knowledge can be used to prevent breaking the pattern,
13 changes: 10 additions & 3 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12257,9 +12257,16 @@ static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::AND, DL, VT, Not, DAG.getFreeze(N2));
}

// TODO: There's another pattern in this family, but it may require
// implementing hasOrNot() to check for profitability:
// (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
// If we have to invert the sign bit mask, only do that transform if the
// target has a bitwise 'or not' instruction (the invert is free):
// (Cond0 s< 0) ? N1 : -1 --> ~(Cond0 s>> BW-1) | freeze(N1)
if (isAllOnesOrAllOnesSplat(N2) && TLI.hasOrNot(N2)) {
SDLoc DL(N);
SDValue ShiftAmt = DAG.getShiftAmountConstant(EltSizeInBits - 1, VT, DL);
SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
SDValue Not = DAG.getNOT(DL, Sra, VT);
return DAG.getNode(ISD::OR, DL, VT, Not, DAG.getFreeze(N1));
}

return SDValue();
}
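
As a quick sanity check on the identity the new fold relies on (not part of the patch; selectForm and maskForm are illustrative names), here is a minimal standalone C++ sketch, assuming arithmetic right shift of negative values (guaranteed since C++20):

#include <cassert>
#include <cstdint>

// select(x s> -1, -1, y): -1 when the sign bit of x is clear, else y.
int32_t selectForm(int32_t x, int32_t y) { return x > -1 ? -1 : y; }

// ~(x s>> 31) is -1 when x is non-negative and 0 when x is negative,
// so OR-ing with y yields -1 or y respectively.
int32_t maskForm(int32_t x, int32_t y) { return ~(x >> 31) | y; }

int main() {
  for (int32_t x : {INT32_MIN, -2, -1, 0, 1, INT32_MAX})
    for (int32_t y : {INT32_MIN, -1, 0, 1, 42})
      assert(selectForm(x, y) == maskForm(x, y));
  return 0;
}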
12 changes: 12 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -414,6 +414,18 @@ class AArch64TargetLowering : public TargetLowering {
return VT.getFixedSizeInBits() >= 64; // vector 'bic'
}

bool hasOrNot(SDValue X) const override {
EVT VT = X.getValueType();

if (!VT.isVector())
return VT == MVT::i32 || VT == MVT::i64; // scalar 'orn'

if (VT.isScalableVector())
return true;

return VT.getFixedSizeInBits() >= 64; // vector 'orn'
}

bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
unsigned OldShiftOpcode, unsigned NewShiftOpcode,
10 changes: 10 additions & 0 deletions llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -17788,6 +17788,16 @@ static SDValue PerformShiftCombine(SDNode *N,
return SDValue();
}

bool ARMTargetLowering::hasOrNot(SDValue Y) const {
// Thumb2 provides a scalar 'orn'; ARM mode does not.
EVT VT = Y.getValueType();
if (!VT.isVector())
return Subtarget->isThumb2() && VT == MVT::i32; // scalar 'orn'
if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps())
return VT.getFixedSizeInBits() >= 64; // vector 'orn'
return false;
}

// Look for a sign/zero/fpextend extend of a larger than legal load. This can be
// split into multiple extending loads, which are simpler to deal with than an
// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
2 changes: 2 additions & 0 deletions llvm/lib/Target/ARM/ARMISelLowering.h
@@ -614,6 +614,8 @@ class VectorType;
return V.getValueType().isScalarInteger();
}

bool hasOrNot(SDValue Y) const override;

bool
isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
127 changes: 127 additions & 0 deletions llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll
@@ -0,0 +1,127 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s

; Test for the combine: (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
; On AArch64 the inverted sign-bit mask folds into the compare, so the vector
; cases lower to 'cmge' + 'orr' rather than a separate 'orn'.

define <4 x i32> @vselect_signbit_orn_vector(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: vselect_signbit_orn_vector:
; CHECK: // %bb.0:
; CHECK-NEXT: cmge v0.4s, v0.4s, #0
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
%sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %y
ret <4 x i32> %sel
}

define <2 x i64> @vselect_signbit_orn_vector64(<2 x i64> %x, <2 x i64> %y) {
; CHECK-LABEL: vselect_signbit_orn_vector64:
; CHECK: // %bb.0:
; CHECK-NEXT: cmge v0.2d, v0.2d, #0
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%cmp = icmp sgt <2 x i64> %x, <i64 -1, i64 -1>
%sel = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %y
ret <2 x i64> %sel
}

; Test with different constant values for N2
define <4 x i32> @vselect_signbit_orn_scalar_const(<4 x i32> %x) {
; CHECK-LABEL: vselect_signbit_orn_scalar_const:
; CHECK: // %bb.0:
; CHECK-NEXT: cmge v0.4s, v0.4s, #0
; CHECK-NEXT: orr v0.4s, #42
; CHECK-NEXT: ret
%cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
%sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
ret <4 x i32> %sel
}

; The inverse pattern selects 0 instead of -1, so it takes the existing 'and'
; form ('cmlt' + 'and') rather than 'orn'.
define <4 x i32> @vselect_signbit_not_orn(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: vselect_signbit_not_orn:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
%sel = select <4 x i1> %cmp, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> %y
ret <4 x i32> %sel
}

define <4 x i32> @test_orn_instruction_direct(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: test_orn_instruction_direct:
; CHECK: // %bb.0:
; CHECK-NEXT: orn v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%not_y = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
%result = or <4 x i32> %x, %not_y
ret <4 x i32> %result
}

; Scalar versions of the same tests. Plain (non-vector) selects do not go
; through the vselect combine; AArch64 prefers csinv here.
define i32 @vselect_signbit_orn_scalar_i32(i32 %x, i32 %y) {
; CHECK-LABEL: vselect_signbit_orn_scalar_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: cmn w0, #1
; CHECK-NEXT: csinv w0, w1, wzr, le
; CHECK-NEXT: ret
%cmp = icmp sgt i32 %x, -1
%sel = select i1 %cmp, i32 -1, i32 %y
ret i32 %sel
}

define i64 @vselect_signbit_orn_scalar_i64(i64 %x, i64 %y) {
; CHECK-LABEL: vselect_signbit_orn_scalar_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: cmn x0, #1
; CHECK-NEXT: csinv x0, x1, xzr, le
; CHECK-NEXT: ret
%cmp = icmp sgt i64 %x, -1
%sel = select i1 %cmp, i64 -1, i64 %y
ret i64 %sel
}

define i32 @test_orn_instruction_scalar_i32(i32 %x, i32 %y) {
; CHECK-LABEL: test_orn_instruction_scalar_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: orn w0, w0, w1
; CHECK-NEXT: ret
%not_y = xor i32 %y, -1
%result = or i32 %x, %not_y
ret i32 %result
}

define i64 @test_orn_instruction_scalar_i64(i64 %x, i64 %y) {
; CHECK-LABEL: test_orn_instruction_scalar_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: orn x0, x0, x1
; CHECK-NEXT: ret
%not_y = xor i64 %y, -1
%result = or i64 %x, %not_y
ret i64 %result
}

143 changes: 143 additions & 0 deletions llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll
@@ -0,0 +1,143 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=armv7-unknown-unknown | FileCheck %s

; Test for the optimization: (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
; With NEON the combine is visible directly as 'vshr' + 'vorn'. Scalar selects
; in ARM mode keep the predicated-move lowering (scalar 'orn' requires Thumb2).

define <4 x i32> @vselect_signbit_orn_vector(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: vselect_signbit_orn_vector:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d17, r2, r3
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vshr.s32 q8, q8, #31
; CHECK-NEXT: vorn q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: bx lr
%cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
%sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %y
ret <4 x i32> %sel
}

define <2 x i64> @vselect_signbit_orn_vector64(<2 x i64> %x, <2 x i64> %y) {
; CHECK-LABEL: vselect_signbit_orn_vector64:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d17, r2, r3
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vshr.s64 q8, q8, #63
; CHECK-NEXT: vorn q8, q9, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: bx lr
%cmp = icmp sgt <2 x i64> %x, <i64 -1, i64 -1>
%sel = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %y
ret <2 x i64> %sel
}

; Test with different constant values for N2
define <4 x i32> @vselect_signbit_orn_const(<4 x i32> %x) {
; CHECK-LABEL: vselect_signbit_orn_const:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d17, r2, r3
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vshr.s32 q8, q8, #31
; CHECK-NEXT: vmvn q8, q8
; CHECK-NEXT: vorr.i32 q8, #0x2a
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: bx lr
%cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
%sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
ret <4 x i32> %sel
}

; The inverse pattern selects 0 instead of -1, so it takes the existing 'and'
; form ('vshr' + 'vand') rather than 'vorn'.
define <4 x i32> @vselect_signbit_not_orn(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: vselect_signbit_not_orn:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d17, r2, r3
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vshr.s32 q8, q8, #31
; CHECK-NEXT: vand q8, q8, q9
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: bx lr
%cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
%sel = select <4 x i1> %cmp, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> %y
ret <4 x i32> %sel
}

; Demonstrate that 'vorn' is selected when the or-not pattern appears directly.
define <4 x i32> @test_orn_instruction_direct(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: test_orn_instruction_direct:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d17, r2, r3
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
; CHECK-NEXT: vorn q8, q8, q9
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: bx lr
%not_y = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
%result = or <4 x i32> %x, %not_y
ret <4 x i32> %result
}

; Scalar versions of the same tests
define i32 @vselect_signbit_orn_scalar_i32(i32 %x, i32 %y) {
; CHECK-LABEL: vselect_signbit_orn_scalar_i32:
; CHECK: @ %bb.0:
; CHECK-NEXT: cmn r0, #1
; CHECK-NEXT: mvngt r1, #0
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: bx lr
%cmp = icmp sgt i32 %x, -1
%sel = select i1 %cmp, i32 -1, i32 %y
ret i32 %sel
}

define i64 @vselect_signbit_orn_scalar_i64(i64 %x, i64 %y) {
; CHECK-LABEL: vselect_signbit_orn_scalar_i64:
; CHECK: @ %bb.0:
; CHECK-NEXT: cmn r1, #1
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mvngt r3, #0
; CHECK-NEXT: mvngt r0, #0
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: bx lr
%cmp = icmp sgt i64 %x, -1
%sel = select i1 %cmp, i64 -1, i64 %y
ret i64 %sel
}

define i32 @test_orn_instruction_scalar_i32(i32 %x, i32 %y) {
; CHECK-LABEL: test_orn_instruction_scalar_i32:
; CHECK: @ %bb.0:
; CHECK-NEXT: mvn r1, r1
; CHECK-NEXT: orr r0, r0, r1
; CHECK-NEXT: bx lr
%not_y = xor i32 %y, -1
%result = or i32 %x, %not_y
ret i32 %result
}

define i64 @test_orn_instruction_scalar_i64(i64 %x, i64 %y) {
; CHECK-LABEL: test_orn_instruction_scalar_i64:
; CHECK: @ %bb.0:
; CHECK-NEXT: mvn r2, r2
; CHECK-NEXT: orr r0, r0, r2
; CHECK-NEXT: mvn r2, r3
; CHECK-NEXT: orr r1, r1, r2
; CHECK-NEXT: bx lr
%not_y = xor i64 %y, -1
%result = or i64 %x, %not_y
ret i64 %result
}