Conversation

AZero13
Contributor

AZero13 commented Oct 17, 2025

No description provided.

@llvmbot
Member

llvmbot commented Oct 17, 2025

@llvm/pr-subscribers-llvm-selectiondag

@llvm/pr-subscribers-backend-arm

Author: AZero13 (AZero13)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/163995.diff

7 Files Affected:

  • (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+5)
  • (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+10-3)
  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.h (+12)
  • (modified) llvm/lib/Target/ARM/ARMISelLowering.cpp (+10)
  • (modified) llvm/lib/Target/ARM/ARMISelLowering.h (+2)
  • (added) llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll (+127)
  • (added) llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll (+143)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 73f2c55a71125..322c7b9068255 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -827,6 +827,11 @@ class LLVM_ABI TargetLoweringBase {
     return hasAndNotCompare(X);
   }
 
+  /// Return true if the target has a bitwise or-not operation: X | ~Y
+  virtual bool hasOrNot(SDValue X) const {
+    return false;
+  }
+
   /// Return true if the target has a bit-test instruction:
   ///   (X & (1 << Y)) ==/!= 0
   /// This knowledge can be used to prevent breaking the pattern,
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c97300d64d455..737471bc9d9e1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12257,9 +12257,16 @@ static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) {
     return DAG.getNode(ISD::AND, DL, VT, Not, DAG.getFreeze(N2));
   }
 
-  // TODO: There's another pattern in this family, but it may require
-  //       implementing hasOrNot() to check for profitability:
-  //       (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
+  // If we have to invert the sign bit mask, only do the transform when the
+  // target has a bitwise 'or not' instruction (the invert is free):
+  // (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
+  if (isAllOnesOrAllOnesSplat(N2) && TLI.hasOrNot(N2)) {
+    SDLoc DL(N);
+    SDValue ShiftAmt = DAG.getShiftAmountConstant(EltSizeInBits - 1, VT, DL);
+    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
+    SDValue Not = DAG.getNOT(DL, Sra, VT);
+    return DAG.getNode(ISD::OR, DL, VT, Not, DAG.getFreeze(N1));
+  }
 
   return SDValue();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 00956fdc8e48e..3488f361f37e5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -414,6 +414,18 @@ class AArch64TargetLowering : public TargetLowering {
     return VT.getFixedSizeInBits() >= 64; // vector 'bic'
   }
 
+  bool hasOrNot(SDValue X) const override {
+    EVT VT = X.getValueType();
+
+    if (!VT.isVector())
+      return VT == MVT::i32 || VT == MVT::i64; // scalar 'orn'
+
+    if (VT.isScalableVector())
+      return true;
+
+    return VT.getFixedSizeInBits() >= 64; // vector 'orn'
+  }
+
   bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
       SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
       unsigned OldShiftOpcode, unsigned NewShiftOpcode,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 67ea2dd3df792..1092d0117a58a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -17788,6 +17788,16 @@ static SDValue PerformShiftCombine(SDNode *N,
   return SDValue();
 }
 
+bool ARMTargetLowering::hasOrNot(SDValue Y) const {
+  // Thumb2 has a scalar 'orn'; ARM mode does not.
+  EVT VT = Y.getValueType();
+  if (!VT.isVector())
+    return Subtarget->isThumb2() && VT == MVT::i32; // scalar 'orn'
+  if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps())
+    return VT.getFixedSizeInBits() >= 64; // vector 'orn'
+  return false;
+}
+
 // Look for a sign/zero/fpextend extend of a larger than legal load. This can be
 // split into multiple extending loads, which are simpler to deal with than an
 // arbitrary extend. For fp extends we use an integer extending load and a VCVTL
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 70aa001a41885..993b63f21c07a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -614,6 +614,8 @@ class VectorType;
       return V.getValueType().isScalarInteger();
     }
 
+    bool hasOrNot(SDValue Y) const override;
+
     bool
     isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
     bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
diff --git a/llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll b/llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll
new file mode 100644
index 0000000000000..c60d140cd6049
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dagcombine-vselect-signbit-orn.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+; Test for the optimization: (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
+; This pattern should be optimized to use the 'orn' instruction on AArch64
+
+define <4 x i32> @vselect_signbit_orn_scalar(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %y
+  ret <4 x i32> %sel
+}
+
+define <2 x i64> @vselect_signbit_orn_scalar64(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.2d, v0.2d, #0
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <2 x i64> %x, <i64 -1, i64 -1>
+  %sel = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %y
+  ret <2 x i64> %sel
+}
+
+define <4 x i32> @vselect_signbit_orn_vector(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_orn_vector:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %y
+  ret <4 x i32> %sel
+}
+
+define <2 x i64> @vselect_signbit_orn_vector64(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: vselect_signbit_orn_vector64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.2d, v0.2d, #0
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <2 x i64> %x, <i64 -1, i64 -1>
+  %sel = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %y
+  ret <2 x i64> %sel
+}
+
+; Test with different constant values for N2
+define <4 x i32> @vselect_signbit_orn_scalar_const(<4 x i32> %x) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_const:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
+; CHECK-NEXT:    orr v0.4s, #42
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
+  ret <4 x i32> %sel
+}
+
+; Test the inverse pattern to ensure it doesn't get optimized (should use a different instruction)
+define <4 x i32> @vselect_signbit_not_orn(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_not_orn:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %sel = select <4 x i1> %cmp, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> %y
+  ret <4 x i32> %sel
+}
+
+define <4 x i32> @test_orn_instruction_direct(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_orn_instruction_direct:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orn v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %not_y = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %result = or <4 x i32> %x, %not_y
+  ret <4 x i32> %result
+}
+
+; Scalar versions of the same tests
+define i32 @vselect_signbit_orn_scalar_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmn w0, #1
+; CHECK-NEXT:    csinv w0, w1, wzr, le
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt i32 %x, -1
+  %sel = select i1 %cmp, i32 -1, i32 %y
+  ret i32 %sel
+}
+
+define i64 @vselect_signbit_orn_scalar_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmn x0, #1
+; CHECK-NEXT:    csinv x0, x1, xzr, le
+; CHECK-NEXT:    ret
+  %cmp = icmp sgt i64 %x, -1
+  %sel = select i1 %cmp, i64 -1, i64 %y
+  ret i64 %sel
+}
+
+define i32 @test_orn_instruction_scalar_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: test_orn_instruction_scalar_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orn w0, w0, w1
+; CHECK-NEXT:    ret
+  %not_y = xor i32 %y, -1
+  %result = or i32 %x, %not_y
+  ret i32 %result
+}
+
+define i64 @test_orn_instruction_scalar_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: test_orn_instruction_scalar_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orn x0, x0, x1
+; CHECK-NEXT:    ret
+  %not_y = xor i64 %y, -1
+  %result = or i64 %x, %not_y
+  ret i64 %result
+}
+
diff --git a/llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll b/llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll
new file mode 100644
index 0000000000000..1a0d63745dfa9
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/dagcombine-vselect-signbit-orn.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=armv7-unknown-unknown | FileCheck %s
+
+; Test for the optimization: (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | freeze(N2)
+; The DAGCombiner optimization transforms the select into the expected pattern,
+; but further optimizations convert it to a more efficient sequence
+
+define <4 x i32> @vselect_signbit_orn_vector(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_orn_vector:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vshr.s32 q8, q8, #31
+; CHECK-NEXT:    vorn q8, q9, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
+  %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %y
+  ret <4 x i32> %sel
+}
+
+define <2 x i64> @vselect_signbit_orn_vector64(<2 x i64> %x, <2 x i64> %y) {
+; CHECK-LABEL: vselect_signbit_orn_vector64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vshr.s64 q8, q8, #63
+; CHECK-NEXT:    vorn q8, q9, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
+  %cmp = icmp sgt <2 x i64> %x, <i64 -1, i64 -1>
+  %sel = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %y
+  ret <2 x i64> %sel
+}
+
+; Test with different constant values for N2
+define <4 x i32> @vselect_signbit_orn_const(<4 x i32> %x) {
+; CHECK-LABEL: vselect_signbit_orn_const:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vshr.s32 q8, q8, #31
+; CHECK-NEXT:    vmvn q8, q8
+; CHECK-NEXT:    vorr.i32 q8, #0x2a
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
+  %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %sel = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 42, i32 42, i32 42, i32 42>
+  ret <4 x i32> %sel
+}
+
+; Test the inverse pattern to ensure it doesn't get optimized (should use a different instruction)
+define <4 x i32> @vselect_signbit_not_orn(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vselect_signbit_not_orn:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vshr.s32 q8, q8, #31
+; CHECK-NEXT:    vand q8, q8, q9
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
+  %cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %sel = select <4 x i1> %cmp, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> %y
+  ret <4 x i32> %sel
+}
+
+; Test to demonstrate that orn instruction is available when the pattern matches directly
+define <4 x i32> @test_orn_instruction_direct(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_orn_instruction_direct:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vorn q8, q8, q9
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    bx lr
+  %not_y = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %result = or <4 x i32> %x, %not_y
+  ret <4 x i32> %result
+}
+
+; Scalar versions of the same tests
+define i32 @vselect_signbit_orn_scalar_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmn r0, #1
+; CHECK-NEXT:    mvngt r1, #0
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    bx lr
+  %cmp = icmp sgt i32 %x, -1
+  %sel = select i1 %cmp, i32 -1, i32 %y
+  ret i32 %sel
+}
+
+define i64 @vselect_signbit_orn_scalar_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: vselect_signbit_orn_scalar_i64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    cmn r1, #1
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    mvngt r3, #0
+; CHECK-NEXT:    mvngt r0, #0
+; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    bx lr
+  %cmp = icmp sgt i64 %x, -1
+  %sel = select i1 %cmp, i64 -1, i64 %y
+  ret i64 %sel
+}
+
+define i32 @test_orn_instruction_scalar_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: test_orn_instruction_scalar_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    mvn r1, r1
+; CHECK-NEXT:    orr r0, r0, r1
+; CHECK-NEXT:    bx lr
+  %not_y = xor i32 %y, -1
+  %result = or i32 %x, %not_y
+  ret i32 %result
+}
+
+define i64 @test_orn_instruction_scalar_i64(i64 %x, i64 %y) {
+; CHECK-LABEL: test_orn_instruction_scalar_i64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    mvn r2, r2
+; CHECK-NEXT:    orr r0, r0, r2
+; CHECK-NEXT:    mvn r2, r3
+; CHECK-NEXT:    orr r1, r1, r2
+; CHECK-NEXT:    bx lr
+  %not_y = xor i64 %y, -1
+  %result = or i64 %x, %not_y
+  ret i64 %result
+}
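
For reference, below is a minimal IR sketch of the rewrite the new fold in foldVSelectToSignBitSplatMask performs. It is written as scalar i32 IR purely for illustration; the combine itself rewrites SelectionDAG nodes, and the i32 width and function names here are placeholders, not taken from the patch's tests.

; Sketch only, not part of this patch.
; Before the combine: select -1 when the sign bit of %x is clear, else %y.
define i32 @signbit_select(i32 %x, i32 %y) {
  %cmp = icmp sgt i32 %x, -1
  %sel = select i1 %cmp, i32 -1, i32 %y
  ret i32 %sel
}

; Equivalent form after the combine: ~(%x s>> 31) | freeze(%y). The not+or
; pair is what hasOrNot() promises can be matched as a single 'orn'.
define i32 @signbit_select_folded(i32 %x, i32 %y) {
  %mask = ashr i32 %x, 31    ; all-ones if %x is negative, zero otherwise
  %notm = xor i32 %mask, -1  ; all-ones exactly when the sign bit is clear
  %fy = freeze i32 %y
  %res = or i32 %notm, %fy   ; -1 when the sign bit is clear, else %y
  ret i32 %res
}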

@llvmbot
Member

llvmbot commented Oct 17, 2025

@llvm/pr-subscribers-backend-aarch64


github-actions bot commented Oct 17, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.
