[AArch64] Improve select dagcombine (#169925)

huntergr-arm · web-flow · commit 0e6d6127d43d · 2025-12-02T12:29:49.000Z
An AnyOf reduction (aka vector.reduce.or) with a fixed-width vector is
canonicalized to a bitcast of the mask vector to an integer of the same
overall size, which is then compared against zero.

If the scalar result of the bitcast is smaller than the element size of
vectors being selected, we often end up with suboptimal codegen. This
fixes the main cases, removing scalarized code.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26983,22 +26983,25 @@ static SDValue performSelectCombine(SDNode *N,
   assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
          "Scalar-SETCC feeding SELECT has unexpected result type!");
 
-  // If NumMaskElts == 0, the comparison is larger than select result. The
-  // largest real NEON comparison is 64-bits per lane, which means the result is
-  // at most 32-bits and an illegal vector. Just bail out for now.
-  EVT SrcVT = N0.getOperand(0).getValueType();
-
   // Don't try to do this optimization when the setcc itself has i1 operands.
   // There are no legal vectors of i1, so this would be pointless. v1f16 is
   // ruled out to prevent the creation of setcc that need to be scalarized.
+  EVT SrcVT = N0.getOperand(0).getValueType();
   if (SrcVT == MVT::i1 ||
       (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
     return SDValue();
 
-  int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
+  // If NumMaskElts == 0, the comparison is larger than select result. The
+  // largest real NEON comparison is 64-bits per lane, which means the result is
+  // at most 32-bits and an illegal vector. Just bail out for now.
+  unsigned NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
   if (!ResVT.isVector() || NumMaskElts == 0)
     return SDValue();
 
+  // Avoid creating vectors with excessive VFs before legalization.
+  if (DCI.isBeforeLegalize() && NumMaskElts != ResVT.getVectorNumElements())
+    return SDValue();
+
   SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
   EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
 
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -4,20 +4,15 @@
 define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
 ; CHECK-LABEL: foo:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi d0, #0000000000000000
-; CHECK-NEXT:    and w8, w0, #0x1
-; CHECK-NEXT:    ldr x11, [sp]
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    ldp x8, x10, [sp, #8]
-; CHECK-NEXT:    cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    tst w9, #0x1
-; CHECK-NEXT:    csel x8, x5, x8, ne
-; CHECK-NEXT:    csel x9, x4, x11, ne
-; CHECK-NEXT:    stp x9, x8, [x10, #16]
-; CHECK-NEXT:    csel x8, x3, x7, ne
-; CHECK-NEXT:    csel x9, x2, x6, ne
-; CHECK-NEXT:    stp x9, x8, [x10]
+; CHECK-NEXT:    ldp x8, x9, [sp, #8]
+; CHECK-NEXT:    tst w0, #0x1
+; CHECK-NEXT:    ldr x10, [sp]
+; CHECK-NEXT:    csel x8, x5, x8, eq
+; CHECK-NEXT:    csel x10, x4, x10, eq
+; CHECK-NEXT:    stp x10, x8, [x9, #16]
+; CHECK-NEXT:    csel x8, x3, x7, eq
+; CHECK-NEXT:    csel x10, x2, x6, eq
+; CHECK-NEXT:    stp x10, x8, [x9]
 ; CHECK-NEXT:    ret
   %cond = and i32 %In1, 1
   %cbool = icmp eq i32 %cond, 0
@@ -31,22 +26,17 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
 define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) {
 ; CHECK-LABEL: bar:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi d0, #0000000000000000
-; CHECK-NEXT:    and w8, w0, #0x1
-; CHECK-NEXT:    ldr x10, [sp, #16]
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    tst w9, #0x1
-; CHECK-NEXT:    ldp x8, x9, [sp]
-; CHECK-NEXT:    csel x11, x2, x6, ne
-; CHECK-NEXT:    str x11, [x10]
-; CHECK-NEXT:    csel x8, x4, x8, ne
-; CHECK-NEXT:    stur x8, [x10, #12]
-; CHECK-NEXT:    csel x8, x5, x9, ne
-; CHECK-NEXT:    csel x9, x3, x7, ne
-; CHECK-NEXT:    str w8, [x10, #20]
-; CHECK-NEXT:    str w9, [x10, #8]
+; CHECK-NEXT:    ldp x8, x10, [sp]
+; CHECK-NEXT:    tst w0, #0x1
+; CHECK-NEXT:    ldr x9, [sp, #16]
+; CHECK-NEXT:    csel x11, x2, x6, eq
+; CHECK-NEXT:    csel x8, x4, x8, eq
+; CHECK-NEXT:    str x11, [x9]
+; CHECK-NEXT:    stur x8, [x9, #12]
+; CHECK-NEXT:    csel x8, x5, x10, eq
+; CHECK-NEXT:    csel x10, x3, x7, eq
+; CHECK-NEXT:    str w8, [x9, #20]
+; CHECK-NEXT:    str w10, [x9, #8]
 ; CHECK-NEXT:    ret
   %cond = and i32 %In1, 1
   %cbool = icmp eq i32 %cond, 0
diff --git a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc %s -o - | FileCheck %s
+target triple = "aarch64-linux-gnu"
+
+;; An 'AnyOf' reduction (vector.reduce.or) is instcombined to a bitcast to an
+;; integer of a bitwidth equal to the number of lanes being reduced, then
+;; compared against zero. To select between vectors for NEON, we then need to
+;; broadcast the result, but we must be careful when the bitwidth of the scalar
+;; result is smaller than the element size of the vectors being selected. We
+;; don't want to end up with scalarization.
+
+define <4 x i32> @any_of_select_vf4(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: any_of_select_vf4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    dup v0.4s, w8
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <4 x i32> %mask, zeroinitializer
+  %cmp.bc = bitcast <4 x i1> %cmp to i4
+  %cmp.bc.not = icmp eq i4 %cmp.bc, 0
+  %res = select i1 %cmp.bc.not, <4 x i32> %a, <4 x i32> %b
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: any_of_select_vf2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csetm x8, ne
+; CHECK-NEXT:    dup v0.2d, x8
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <2 x i64> %mask, zeroinitializer
+  %cmp.bc = bitcast <2 x i1> %cmp to i2
+  %cmp.bc.not = icmp eq i2 %cmp.bc, 0
+  %res = select i1 %cmp.bc.not, <2 x i64> %a, <2 x i64> %b
+  ret <2 x i64> %res
+}
+
+define <32 x i8> @any_of_select_vf32(<32 x i8> %mask, <32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: any_of_select_vf32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT:    umaxv b0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    dup v1.16b, w8
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    bsl v1.16b, v5.16b, v3.16b
+; CHECK-NEXT:    bsl v0.16b, v4.16b, v2.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <32 x i8> %mask, zeroinitializer
+  %cmp.bc = bitcast <32 x i1> %cmp to i32
+  %cmp.bc.not = icmp eq i32 %cmp.bc, 0
+  %res = select i1 %cmp.bc.not, <32 x i8> %a, <32 x i8> %b
+  ret <32 x i8> %res
+}