From 1262bd31f1fe83ba633d5e9cada988d7d17193a8 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Fri, 28 Nov 2025 14:27:48 +0000 Subject: [PATCH 1/5] [AArch64] Codegen test for select from canonical fixed-width AnyOf --- llvm/test/CodeGen/AArch64/neon-anyof-splat.ll | 145 ++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/neon-anyof-splat.ll diff --git a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll new file mode 100644 index 0000000000000..5a5b92feb2415 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll @@ -0,0 +1,145 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc %s -o - | FileCheck %s +target triple = "aarch64-linux-gnu" + +;; An 'AnyOf' reduction (vector.reduce.or) is instcombined to a bitcast to an +;; integer of a bitwidth equal to the number of lanes being reduced, then +;; compared against zero. To select between vectors for NEON, we then need to +;; broadcast the result, but we must be careful when the bitwidth of the scalar +;; result is smaller than the element size of the vectors being selected. We +;; don't want to end up with scalarization. + +define <4 x i32> @any_of_select_vf4(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: any_of_select_vf4: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 +; CHECK-NEXT: movi d3, #0000000000000000 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-NEXT: movi v4.16b, #15 +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: and v3.16b, v3.16b, v4.16b +; CHECK-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-NEXT: cmeq v3.16b, v0.16b, v3.16b +; CHECK-NEXT: dup v0.16b, v3.b[0] +; CHECK-NEXT: umov w8, v3.b[0] +; CHECK-NEXT: umov w9, v0.b[1] +; CHECK-NEXT: umov w10, v0.b[2] +; CHECK-NEXT: umov w11, v0.b[7] +; CHECK-NEXT: and x8, x8, #0xf +; CHECK-NEXT: bfi x8, x9, #4, #4 +; CHECK-NEXT: umov w9, v0.b[3] +; CHECK-NEXT: bfi x8, x10, #8, #4 +; CHECK-NEXT: umov w10, v0.b[4] +; CHECK-NEXT: bfi x8, x9, #12, #4 +; CHECK-NEXT: umov w9, v0.b[5] +; CHECK-NEXT: bfi x8, x10, #16, #4 +; CHECK-NEXT: umov w10, v0.b[6] +; CHECK-NEXT: bfi x8, x9, #20, #4 +; CHECK-NEXT: umov w9, v0.b[8] +; CHECK-NEXT: bfi x8, x10, #24, #4 +; CHECK-NEXT: lsl w10, w11, #28 +; CHECK-NEXT: umov w11, v0.b[9] +; CHECK-NEXT: orr x8, x8, x10 +; CHECK-NEXT: and w9, w9, #0xf +; CHECK-NEXT: umov w10, v0.b[10] +; CHECK-NEXT: orr x8, x8, x9, lsl #32 +; CHECK-NEXT: and w9, w11, #0xf +; CHECK-NEXT: umov w11, v0.b[11] +; CHECK-NEXT: orr x8, x8, x9, lsl #36 +; CHECK-NEXT: and w9, w10, #0xf +; CHECK-NEXT: umov w10, v0.b[12] +; CHECK-NEXT: orr x8, x8, x9, lsl #40 +; CHECK-NEXT: and w9, w11, #0xf +; CHECK-NEXT: umov w11, v0.b[13] +; CHECK-NEXT: orr x8, x8, x9, lsl #44 +; CHECK-NEXT: and w9, w10, #0xf +; CHECK-NEXT: umov w10, v0.b[14] +; CHECK-NEXT: orr x8, x8, x9, lsl #48 +; CHECK-NEXT: and w9, w11, #0xf +; CHECK-NEXT: orr x8, x8, x9, lsl #52 +; CHECK-NEXT: umov w9, v0.b[15] +; CHECK-NEXT: and w10, w10, #0xf +; CHECK-NEXT: orr x8, x8, x10, lsl #56 +; CHECK-NEXT: orr x8, x8, x9, lsl #60 +; CHECK-NEXT: dup v0.2d, x8 +; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %cmp = icmp slt <4 x i32> %mask, zeroinitializer + %cmp.bc = bitcast <4 x i1> %cmp to i4 + %cmp.bc.not = icmp eq i4 %cmp.bc, 0 + %res = select i1 %cmp.bc.not, <4 x i32> 
%a, <4 x i32> %b + ret <4 x i32> %res +} + +define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: any_of_select_vf2: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: cmlt v0.2d, v0.2d, #0 +; CHECK-NEXT: movi d3, #0000000000000000 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-NEXT: movi v4.16b, #3 +; CHECK-NEXT: addp d0, v0.2d +; CHECK-NEXT: and v3.16b, v3.16b, v4.16b +; CHECK-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-NEXT: cmeq v3.16b, v0.16b, v3.16b +; CHECK-NEXT: dup v0.16b, v3.b[0] +; CHECK-NEXT: umov w8, v3.b[0] +; CHECK-NEXT: umov w9, v0.b[1] +; CHECK-NEXT: umov w10, v0.b[2] +; CHECK-NEXT: umov w11, v0.b[7] +; CHECK-NEXT: umov w12, v0.b[8] +; CHECK-NEXT: and w8, w8, #0x3 +; CHECK-NEXT: umov w13, v0.b[3] +; CHECK-NEXT: umov w14, v0.b[4] +; CHECK-NEXT: umov w15, v0.b[10] +; CHECK-NEXT: umov w16, v0.b[5] +; CHECK-NEXT: bfi w8, w9, #2, #2 +; CHECK-NEXT: umov w9, v0.b[9] +; CHECK-NEXT: ubfiz w11, w11, #14, #2 +; CHECK-NEXT: ubfiz w12, w12, #16, #2 +; CHECK-NEXT: bfi w8, w10, #4, #2 +; CHECK-NEXT: umov w10, v0.b[11] +; CHECK-NEXT: ubfiz w15, w15, #20, #2 +; CHECK-NEXT: orr w11, w11, w12 +; CHECK-NEXT: umov w12, v0.b[13] +; CHECK-NEXT: bfi w8, w13, #6, #2 +; CHECK-NEXT: umov w13, v0.b[12] +; CHECK-NEXT: ubfiz w9, w9, #18, #2 +; CHECK-NEXT: bfi w8, w14, #8, #2 +; CHECK-NEXT: umov w14, v0.b[14] +; CHECK-NEXT: orr w9, w11, w9 +; CHECK-NEXT: umov w11, v0.b[6] +; CHECK-NEXT: ubfiz w10, w10, #22, #2 +; CHECK-NEXT: orr w9, w9, w15 +; CHECK-NEXT: ubfiz w13, w13, #24, #2 +; CHECK-NEXT: bfi w8, w16, #10, #2 +; CHECK-NEXT: orr w9, w9, w10 +; CHECK-NEXT: ubfiz w10, w12, #26, #2 +; CHECK-NEXT: orr w9, w9, w13 +; CHECK-NEXT: ubfiz w12, w14, #28, #2 +; CHECK-NEXT: umov w13, v0.b[15] +; CHECK-NEXT: bfi w8, w11, #12, #2 +; CHECK-NEXT: orr w9, w9, w10 +; CHECK-NEXT: orr w9, w9, w12 +; CHECK-NEXT: orr w8, w8, w9 +; CHECK-NEXT: orr w8, w8, w13, lsl #30 +; CHECK-NEXT: orr x8, x8, x8, lsl #32 +; CHECK-NEXT: dup v0.2d, x8 +; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %cmp = icmp slt <2 x i64> %mask, zeroinitializer + %cmp.bc = bitcast <2 x i1> %cmp to i2 + %cmp.bc.not = icmp eq i2 %cmp.bc, 0 + %res = select i1 %cmp.bc.not, <2 x i64> %a, <2 x i64> %b + ret <2 x i64> %res +} From 7c8b20f18b3d692d57b60506fa967ffce221f9aa Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Fri, 28 Nov 2025 14:42:38 +0000 Subject: [PATCH 2/5] Avoid choosing a bad ElementCount for splatting the condition --- .../Target/AArch64/AArch64ISelLowering.cpp | 5 + llvm/test/CodeGen/AArch64/expand-select.ll | 50 +++----- llvm/test/CodeGen/AArch64/neon-anyof-splat.ll | 121 ++---------------- 3 files changed, 36 insertions(+), 140 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6072fd9d8f242..e6872dfe995d8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -26965,6 +26965,11 @@ static SDValue performSelectCombine(SDNode *N, if (!ResVT.isVector() || NumMaskElts == 0) return SDValue(); + // Avoid creating vectors with excessive VFs for small types. 
+ if (DCI.isBeforeLegalize() && + SrcVT.getSizeInBits() < ResVT.getScalarSizeInBits()) + NumMaskElts = ResVT.getVectorNumElements(); + SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll index 1ca4719d9b6bf..8ad9ea3b7a8d5 100644 --- a/llvm/test/CodeGen/AArch64/expand-select.ll +++ b/llvm/test/CodeGen/AArch64/expand-select.ll @@ -4,20 +4,15 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) { ; CHECK-LABEL: foo: ; CHECK: // %bb.0: -; CHECK-NEXT: movi d0, #0000000000000000 -; CHECK-NEXT: and w8, w0, #0x1 -; CHECK-NEXT: ldr x11, [sp] -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: ldp x8, x10, [sp, #8] -; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: tst w9, #0x1 -; CHECK-NEXT: csel x8, x5, x8, ne -; CHECK-NEXT: csel x9, x4, x11, ne -; CHECK-NEXT: stp x9, x8, [x10, #16] -; CHECK-NEXT: csel x8, x3, x7, ne -; CHECK-NEXT: csel x9, x2, x6, ne -; CHECK-NEXT: stp x9, x8, [x10] +; CHECK-NEXT: ldp x8, x9, [sp, #8] +; CHECK-NEXT: tst w0, #0x1 +; CHECK-NEXT: ldr x10, [sp] +; CHECK-NEXT: csel x8, x5, x8, eq +; CHECK-NEXT: csel x10, x4, x10, eq +; CHECK-NEXT: stp x10, x8, [x9, #16] +; CHECK-NEXT: csel x8, x3, x7, eq +; CHECK-NEXT: csel x10, x2, x6, eq +; CHECK-NEXT: stp x10, x8, [x9] ; CHECK-NEXT: ret %cond = and i32 %In1, 1 %cbool = icmp eq i32 %cond, 0 @@ -31,22 +26,17 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) { define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) { ; CHECK-LABEL: bar: ; CHECK: // %bb.0: -; CHECK-NEXT: movi d0, #0000000000000000 -; CHECK-NEXT: and w8, w0, #0x1 -; CHECK-NEXT: ldr x10, [sp, #16] -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: tst w9, #0x1 -; CHECK-NEXT: ldp x8, x9, [sp] -; CHECK-NEXT: csel x11, x2, x6, ne -; CHECK-NEXT: str x11, [x10] -; CHECK-NEXT: csel x8, x4, x8, ne -; CHECK-NEXT: stur x8, [x10, #12] -; CHECK-NEXT: csel x8, x5, x9, ne -; CHECK-NEXT: csel x9, x3, x7, ne -; CHECK-NEXT: str w8, [x10, #20] -; CHECK-NEXT: str w9, [x10, #8] +; CHECK-NEXT: ldp x8, x10, [sp] +; CHECK-NEXT: tst w0, #0x1 +; CHECK-NEXT: ldr x9, [sp, #16] +; CHECK-NEXT: csel x11, x2, x6, eq +; CHECK-NEXT: csel x8, x4, x8, eq +; CHECK-NEXT: str x11, [x9] +; CHECK-NEXT: stur x8, [x9, #12] +; CHECK-NEXT: csel x8, x5, x10, eq +; CHECK-NEXT: csel x10, x3, x7, eq +; CHECK-NEXT: str w8, [x9, #20] +; CHECK-NEXT: str w10, [x9, #8] ; CHECK-NEXT: ret %cond = and i32 %In1, 1 %cbool = icmp eq i32 %cond, 0 diff --git a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll index 5a5b92feb2415..43abb6ac9b944 100644 --- a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll +++ b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll @@ -12,62 +12,13 @@ target triple = "aarch64-linux-gnu" define <4 x i32> @any_of_select_vf4(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: any_of_select_vf4: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: adrp x8, .LCPI0_0 ; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 -; CHECK-NEXT: movi d3, #0000000000000000 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI0_0] -; CHECK-NEXT: and v0.16b, v0.16b, v4.16b -; CHECK-NEXT: movi v4.16b, #15 -; CHECK-NEXT: addv s0, v0.4s -; CHECK-NEXT: and v3.16b, v3.16b, v4.16b -; CHECK-NEXT: and v0.16b, v0.16b, v4.16b -; CHECK-NEXT: cmeq v3.16b, v0.16b, v3.16b 
-; CHECK-NEXT: dup v0.16b, v3.b[0] -; CHECK-NEXT: umov w8, v3.b[0] -; CHECK-NEXT: umov w9, v0.b[1] -; CHECK-NEXT: umov w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[7] -; CHECK-NEXT: and x8, x8, #0xf -; CHECK-NEXT: bfi x8, x9, #4, #4 -; CHECK-NEXT: umov w9, v0.b[3] -; CHECK-NEXT: bfi x8, x10, #8, #4 -; CHECK-NEXT: umov w10, v0.b[4] -; CHECK-NEXT: bfi x8, x9, #12, #4 -; CHECK-NEXT: umov w9, v0.b[5] -; CHECK-NEXT: bfi x8, x10, #16, #4 -; CHECK-NEXT: umov w10, v0.b[6] -; CHECK-NEXT: bfi x8, x9, #20, #4 -; CHECK-NEXT: umov w9, v0.b[8] -; CHECK-NEXT: bfi x8, x10, #24, #4 -; CHECK-NEXT: lsl w10, w11, #28 -; CHECK-NEXT: umov w11, v0.b[9] -; CHECK-NEXT: orr x8, x8, x10 -; CHECK-NEXT: and w9, w9, #0xf -; CHECK-NEXT: umov w10, v0.b[10] -; CHECK-NEXT: orr x8, x8, x9, lsl #32 -; CHECK-NEXT: and w9, w11, #0xf -; CHECK-NEXT: umov w11, v0.b[11] -; CHECK-NEXT: orr x8, x8, x9, lsl #36 -; CHECK-NEXT: and w9, w10, #0xf -; CHECK-NEXT: umov w10, v0.b[12] -; CHECK-NEXT: orr x8, x8, x9, lsl #40 -; CHECK-NEXT: and w9, w11, #0xf -; CHECK-NEXT: umov w11, v0.b[13] -; CHECK-NEXT: orr x8, x8, x9, lsl #44 -; CHECK-NEXT: and w9, w10, #0xf -; CHECK-NEXT: umov w10, v0.b[14] -; CHECK-NEXT: orr x8, x8, x9, lsl #48 -; CHECK-NEXT: and w9, w11, #0xf -; CHECK-NEXT: orr x8, x8, x9, lsl #52 -; CHECK-NEXT: umov w9, v0.b[15] -; CHECK-NEXT: and w10, w10, #0xf -; CHECK-NEXT: orr x8, x8, x10, lsl #56 -; CHECK-NEXT: orr x8, x8, x9, lsl #60 -; CHECK-NEXT: dup v0.2d, x8 -; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: umaxv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: dup v0.4s, w8 +; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b ; CHECK-NEXT: ret %cmp = icmp slt <4 x i32> %mask, zeroinitializer %cmp.bc = bitcast <4 x i1> %cmp to i4 @@ -79,63 +30,13 @@ define <4 x i32> @any_of_select_vf4(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: any_of_select_vf2: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: adrp x8, .LCPI1_0 ; CHECK-NEXT: cmlt v0.2d, v0.2d, #0 -; CHECK-NEXT: movi d3, #0000000000000000 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: and v0.16b, v0.16b, v4.16b -; CHECK-NEXT: movi v4.16b, #3 -; CHECK-NEXT: addp d0, v0.2d -; CHECK-NEXT: and v3.16b, v3.16b, v4.16b -; CHECK-NEXT: and v0.16b, v0.16b, v4.16b -; CHECK-NEXT: cmeq v3.16b, v0.16b, v3.16b -; CHECK-NEXT: dup v0.16b, v3.b[0] -; CHECK-NEXT: umov w8, v3.b[0] -; CHECK-NEXT: umov w9, v0.b[1] -; CHECK-NEXT: umov w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[7] -; CHECK-NEXT: umov w12, v0.b[8] -; CHECK-NEXT: and w8, w8, #0x3 -; CHECK-NEXT: umov w13, v0.b[3] -; CHECK-NEXT: umov w14, v0.b[4] -; CHECK-NEXT: umov w15, v0.b[10] -; CHECK-NEXT: umov w16, v0.b[5] -; CHECK-NEXT: bfi w8, w9, #2, #2 -; CHECK-NEXT: umov w9, v0.b[9] -; CHECK-NEXT: ubfiz w11, w11, #14, #2 -; CHECK-NEXT: ubfiz w12, w12, #16, #2 -; CHECK-NEXT: bfi w8, w10, #4, #2 -; CHECK-NEXT: umov w10, v0.b[11] -; CHECK-NEXT: ubfiz w15, w15, #20, #2 -; CHECK-NEXT: orr w11, w11, w12 -; CHECK-NEXT: umov w12, v0.b[13] -; CHECK-NEXT: bfi w8, w13, #6, #2 -; CHECK-NEXT: umov w13, v0.b[12] -; CHECK-NEXT: ubfiz w9, w9, #18, #2 -; CHECK-NEXT: bfi w8, w14, #8, #2 -; CHECK-NEXT: umov w14, v0.b[14] -; CHECK-NEXT: orr w9, w11, w9 -; CHECK-NEXT: umov w11, v0.b[6] -; CHECK-NEXT: ubfiz w10, w10, #22, #2 -; CHECK-NEXT: orr w9, w9, w15 -; CHECK-NEXT: ubfiz w13, w13, #24, #2 -; CHECK-NEXT: bfi w8, w16, 
#10, #2 -; CHECK-NEXT: orr w9, w9, w10 -; CHECK-NEXT: ubfiz w10, w12, #26, #2 -; CHECK-NEXT: orr w9, w9, w13 -; CHECK-NEXT: ubfiz w12, w14, #28, #2 -; CHECK-NEXT: umov w13, v0.b[15] -; CHECK-NEXT: bfi w8, w11, #12, #2 -; CHECK-NEXT: orr w9, w9, w10 -; CHECK-NEXT: orr w9, w9, w12 -; CHECK-NEXT: orr w8, w8, w9 -; CHECK-NEXT: orr w8, w8, w13, lsl #30 -; CHECK-NEXT: orr x8, x8, x8, lsl #32 +; CHECK-NEXT: umaxv s0, v0.4s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csetm x8, ne ; CHECK-NEXT: dup v0.2d, x8 -; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b ; CHECK-NEXT: ret %cmp = icmp slt <2 x i64> %mask, zeroinitializer %cmp.bc = bitcast <2 x i1> %cmp to i2 From dce36a4908bf03b2684425ce78afa12ecfb715c9 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Fri, 28 Nov 2025 16:56:43 +0000 Subject: [PATCH 3/5] Add suggested wider-than-legal test --- llvm/test/CodeGen/AArch64/neon-anyof-splat.ll | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll index 43abb6ac9b944..b74a2c66108d0 100644 --- a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll +++ b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll @@ -44,3 +44,36 @@ define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b) %res = select i1 %cmp.bc.not, <2 x i64> %a, <2 x i64> %b ret <2 x i64> %res } + +define <32 x i8> @any_of_select_vf32(<32 x i8> %mask, <32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: any_of_select_vf32: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: ldr q7, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: movi d6, #0000000000000000 +; CHECK-NEXT: and v1.16b, v1.16b, v7.16b +; CHECK-NEXT: and v0.16b, v0.16b, v7.16b +; CHECK-NEXT: ext v7.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ext v16.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v1.16b, v1.16b, v7.16b +; CHECK-NEXT: zip1 v0.16b, v0.16b, v16.16b +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: bfi w9, w8, #16, #16 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: cmeq v0.4s, v0.4s, v6.4s +; CHECK-NEXT: dup v1.4s, v0.s[0] +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: bsl v1.16b, v3.16b, v5.16b +; CHECK-NEXT: bsl v0.16b, v2.16b, v4.16b +; CHECK-NEXT: ret + %cmp = icmp slt <32 x i8> %mask, zeroinitializer + %cmp.bc = bitcast <32 x i1> %cmp to i32 + %cmp.bc.not = icmp eq i32 %cmp.bc, 0 + %res = select i1 %cmp.bc.not, <32 x i8> %a, <32 x i8> %b + ret <32 x i8> %res +} From 3be56c05cfdf13edbc47a462495ca12157cf17f9 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Fri, 28 Nov 2025 17:01:12 +0000 Subject: [PATCH 4/5] Always use ResVT's element count before legalization --- .../Target/AArch64/AArch64ISelLowering.cpp | 5 ++-- llvm/test/CodeGen/AArch64/neon-anyof-splat.ll | 28 ++++++------------- 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e6872dfe995d8..d2c377b24ca2e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -26965,9 +26965,8 @@ static SDValue performSelectCombine(SDNode *N, if (!ResVT.isVector() || NumMaskElts == 0) return SDValue(); - // Avoid creating vectors with excessive VFs for small types. 
- if (DCI.isBeforeLegalize() && - SrcVT.getSizeInBits() < ResVT.getScalarSizeInBits()) + // Avoid creating vectors with excessive VFs before legalization. + if (DCI.isBeforeLegalize()) NumMaskElts = ResVT.getVectorNumElements(); SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); diff --git a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll index b74a2c66108d0..dedd4323f1519 100644 --- a/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll +++ b/llvm/test/CodeGen/AArch64/neon-anyof-splat.ll @@ -48,28 +48,16 @@ define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b) define <32 x i8> @any_of_select_vf32(<32 x i8> %mask, <32 x i8> %a, <32 x i8> %b) { ; CHECK-LABEL: any_of_select_vf32: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 -; CHECK-NEXT: ldr q7, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: movi d6, #0000000000000000 -; CHECK-NEXT: and v1.16b, v1.16b, v7.16b -; CHECK-NEXT: and v0.16b, v0.16b, v7.16b -; CHECK-NEXT: ext v7.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v16.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: zip1 v1.16b, v1.16b, v7.16b -; CHECK-NEXT: zip1 v0.16b, v0.16b, v16.16b -; CHECK-NEXT: addv h1, v1.8h -; CHECK-NEXT: addv h0, v0.8h -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: bfi w9, w8, #16, #16 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: cmeq v0.4s, v0.4s, v6.4s -; CHECK-NEXT: dup v1.4s, v0.s[0] +; CHECK-NEXT: umaxv b0, v0.16b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: tst w8, #0x1 +; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: dup v1.16b, w8 ; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: bsl v1.16b, v3.16b, v5.16b -; CHECK-NEXT: bsl v0.16b, v2.16b, v4.16b +; CHECK-NEXT: bsl v1.16b, v5.16b, v3.16b +; CHECK-NEXT: bsl v0.16b, v4.16b, v2.16b ; CHECK-NEXT: ret %cmp = icmp slt <32 x i8> %mask, zeroinitializer %cmp.bc = bitcast <32 x i1> %cmp to i32 From 85bf24f8d497337454b4fd724cffdc648dd0c66f Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 2 Dec 2025 10:42:46 +0000 Subject: [PATCH 5/5] Bail out if element count not equal, move comment --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d2c377b24ca2e..f0ad1289428e8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -26949,25 +26949,24 @@ static SDValue performSelectCombine(SDNode *N, assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) && "Scalar-SETCC feeding SELECT has unexpected result type!"); - // If NumMaskElts == 0, the comparison is larger than select result. The - // largest real NEON comparison is 64-bits per lane, which means the result is - // at most 32-bits and an illegal vector. Just bail out for now. - EVT SrcVT = N0.getOperand(0).getValueType(); - // Don't try to do this optimization when the setcc itself has i1 operands. // There are no legal vectors of i1, so this would be pointless. v1f16 is // ruled out to prevent the creation of setcc that need to be scalarized. 
+ EVT SrcVT = N0.getOperand(0).getValueType(); if (SrcVT == MVT::i1 || (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16)) return SDValue(); - int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); + // If NumMaskElts == 0, the comparison is larger than select result. The + // largest real NEON comparison is 64-bits per lane, which means the result is + // at most 32-bits and an illegal vector. Just bail out for now. + unsigned NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); if (!ResVT.isVector() || NumMaskElts == 0) return SDValue(); // Avoid creating vectors with excessive VFs before legalization. - if (DCI.isBeforeLegalize()) - NumMaskElts = ResVT.getVectorNumElements(); + if (DCI.isBeforeLegalize() && NumMaskElts != ResVT.getVectorNumElements()) + return SDValue(); SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
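
For reference, a sketch of how the touched region of performSelectCombine reads once all five patches are applied, reconstructed from the hunks above (the enclosing function and surrounding code are omitted, so this is an excerpt rather than a compilable unit):

  // Don't try to do this optimization when the setcc itself has i1 operands.
  // There are no legal vectors of i1, so this would be pointless. v1f16 is
  // ruled out to prevent the creation of setcc that need to be scalarized.
  EVT SrcVT = N0.getOperand(0).getValueType();
  if (SrcVT == MVT::i1 ||
      (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
    return SDValue();

  // If NumMaskElts == 0, the comparison is larger than select result. The
  // largest real NEON comparison is 64-bits per lane, which means the result is
  // at most 32-bits and an illegal vector. Just bail out for now.
  unsigned NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
  if (!ResVT.isVector() || NumMaskElts == 0)
    return SDValue();

  // Avoid creating vectors with excessive VFs before legalization.
  if (DCI.isBeforeLegalize() && NumMaskElts != ResVT.getVectorNumElements())
    return SDValue();

  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
  EVT CCVT = SrcVT.changeVectorElementTypeToInteger();

The net effect for the tests above is that, before legalization, the combine now refuses to manufacture a mask vector whose element count differs from the select result's (for example a <32 x i4> splat for the 4-bit AnyOf condition in any_of_select_vf4), which is what the updated CHECK lines reflect.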