llvm · huntergr-arm · Nov 28, 2025 · Nov 28, 2025 · Nov 28, 2025 · Nov 28, 2025
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26965,6 +26965,11 @@ static SDValue performSelectCombine(SDNode *N,
   if (!ResVT.isVector() || NumMaskElts == 0)
     return SDValue();
 
+  // Avoid creating vectors with excessive VFs for small types.
+  if (DCI.isBeforeLegalize() &&
+      SrcVT.getSizeInBits() < ResVT.getScalarSizeInBits())
+    NumMaskElts = ResVT.getVectorNumElements();
+
   SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
   EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
 

diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -4,20 +4,15 @@
 define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
 ; CHECK-LABEL: foo:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi d0, #0000000000000000
-; CHECK-NEXT:    and w8, w0, #0x1
-; CHECK-NEXT:    ldr x11, [sp]
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    ldp x8, x10, [sp, #8]
-; CHECK-NEXT:    cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    tst w9, #0x1
-; CHECK-NEXT:    csel x8, x5, x8, ne
-; CHECK-NEXT:    csel x9, x4, x11, ne
-; CHECK-NEXT:    stp x9, x8, [x10, #16]
-; CHECK-NEXT:    csel x8, x3, x7, ne
-; CHECK-NEXT:    csel x9, x2, x6, ne
-; CHECK-NEXT:    stp x9, x8, [x10]
+; CHECK-NEXT:    ldp x8, x9, [sp, #8]
+; CHECK-NEXT:    tst w0, #0x1
+; CHECK-NEXT:    ldr x10, [sp]
+; CHECK-NEXT:    csel x8, x5, x8, eq
+; CHECK-NEXT:    csel x10, x4, x10, eq
+; CHECK-NEXT:    stp x10, x8, [x9, #16]
+; CHECK-NEXT:    csel x8, x3, x7, eq
+; CHECK-NEXT:    csel x10, x2, x6, eq
+; CHECK-NEXT:    stp x10, x8, [x9]
 ; CHECK-NEXT:    ret
   %cond = and i32 %In1, 1
   %cbool = icmp eq i32 %cond, 0
@@ -31,22 +26,17 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
 define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) {
 ; CHECK-LABEL: bar:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi d0, #0000000000000000
-; CHECK-NEXT:    and w8, w0, #0x1
-; CHECK-NEXT:    ldr x10, [sp, #16]
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    tst w9, #0x1
-; CHECK-NEXT:    ldp x8, x9, [sp]
-; CHECK-NEXT:    csel x11, x2, x6, ne
-; CHECK-NEXT:    str x11, [x10]
-; CHECK-NEXT:    csel x8, x4, x8, ne
-; CHECK-NEXT:    stur x8, [x10, #12]
-; CHECK-NEXT:    csel x8, x5, x9, ne
-; CHECK-NEXT:    csel x9, x3, x7, ne
-; CHECK-NEXT:    str w8, [x10, #20]
-; CHECK-NEXT:    str w9, [x10, #8]
+; CHECK-NEXT:    ldp x8, x10, [sp]
+; CHECK-NEXT:    tst w0, #0x1
+; CHECK-NEXT:    ldr x9, [sp, #16]
+; CHECK-NEXT:    csel x11, x2, x6, eq
+; CHECK-NEXT:    csel x8, x4, x8, eq
+; CHECK-NEXT:    str x11, [x9]
+; CHECK-NEXT:    stur x8, [x9, #12]
+; CHECK-NEXT:    csel x8, x5, x10, eq
+; CHECK-NEXT:    csel x10, x3, x7, eq
+; CHECK-NEXT:    str w8, [x9, #20]
+; CHECK-NEXT:    str w10, [x9, #8]
 ; CHECK-NEXT:    ret
   %cond = and i32 %In1, 1
   %cbool = icmp eq i32 %cond, 0

diff --git a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc %s -o - | FileCheck %s
+target triple = "aarch64-linux-gnu"
+
+;; An 'AnyOf' reduction (vector.reduce.or) is instcombined to a bitcast to an
+;; integer of a bitwidth equal to the number of lanes being reduced, then
+;; compared against zero. To select between vectors for NEON, we then need to
+;; broadcast the result, but we must be careful when the bitwidth of the scalar
+;; result is smaller than the element size of the vectors being selected. We
+;; don't want to end up with scalarization.
+
+define <4 x i32> @any_of_select_vf4(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: any_of_select_vf4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csetm w8, ne
+; CHECK-NEXT:    dup v0.4s, w8
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <4 x i32> %mask, zeroinitializer
+  %cmp.bc = bitcast <4 x i1> %cmp to i4
+  %cmp.bc.not = icmp eq i4 %cmp.bc, 0
+  %res = select i1 %cmp.bc.not, <4 x i32> %a, <4 x i32> %b
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: any_of_select_vf2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csetm x8, ne
+; CHECK-NEXT:    dup v0.2d, x8
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    ret
+  %cmp = icmp slt <2 x i64> %mask, zeroinitializer
+  %cmp.bc = bitcast <2 x i1> %cmp to i2
+  %cmp.bc.not = icmp eq i2 %cmp.bc, 0
+  %res = select i1 %cmp.bc.not, <2 x i64> %a, <2 x i64> %b
+  ret <2 x i64> %res
+}