Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26965,6 +26965,11 @@ static SDValue performSelectCombine(SDNode *N,
if (!ResVT.isVector() || NumMaskElts == 0)
return SDValue();

// Avoid creating vectors with excessive VFs for small types.
if (DCI.isBeforeLegalize() &&
SrcVT.getSizeInBits() < ResVT.getScalarSizeInBits())
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would think we always want to use NumMaskElts before type legalisation, that would also improve e.g.

define <32 x i8> @any_of_select_vf8(<32 x i8> %mask, <32 x i8> %a, <32 x i8> %b) {
  %cmp = icmp slt <32 x i8> %mask, zeroinitializer                                
  %cmp.bc = bitcast <32 x i1> %cmp to i32                                         
  %cmp.bc.not = icmp eq i32 %cmp.bc, 0                                            
  %res = select i1 %cmp.bc.not, <32 x i8> %a, <32 x i8> %b                        
  ret <32 x i8> %res                                                              
}                                                                                 

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Always using the elementcount of ResVT before legalization does seem to be better, yes. Thanks.

NumMaskElts = ResVT.getVectorNumElements();

SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
EVT CCVT = SrcVT.changeVectorElementTypeToInteger();

Expand Down
50 changes: 20 additions & 30 deletions llvm/test/CodeGen/AArch64/expand-select.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,15 @@
define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
; CHECK-LABEL: foo:
; CHECK: // %bb.0:
; CHECK-NEXT: movi d0, #0000000000000000
; CHECK-NEXT: and w8, w0, #0x1
; CHECK-NEXT: ldr x11, [sp]
; CHECK-NEXT: fmov s1, w8
; CHECK-NEXT: ldp x8, x10, [sp, #8]
; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: tst w9, #0x1
; CHECK-NEXT: csel x8, x5, x8, ne
; CHECK-NEXT: csel x9, x4, x11, ne
; CHECK-NEXT: stp x9, x8, [x10, #16]
; CHECK-NEXT: csel x8, x3, x7, ne
; CHECK-NEXT: csel x9, x2, x6, ne
; CHECK-NEXT: stp x9, x8, [x10]
; CHECK-NEXT: ldp x8, x9, [sp, #8]
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: ldr x10, [sp]
; CHECK-NEXT: csel x8, x5, x8, eq
; CHECK-NEXT: csel x10, x4, x10, eq
; CHECK-NEXT: stp x10, x8, [x9, #16]
; CHECK-NEXT: csel x8, x3, x7, eq
; CHECK-NEXT: csel x10, x2, x6, eq
; CHECK-NEXT: stp x10, x8, [x9]
; CHECK-NEXT: ret
%cond = and i32 %In1, 1
%cbool = icmp eq i32 %cond, 0
Expand All @@ -31,22 +26,17 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) {
; CHECK-LABEL: bar:
; CHECK: // %bb.0:
; CHECK-NEXT: movi d0, #0000000000000000
; CHECK-NEXT: and w8, w0, #0x1
; CHECK-NEXT: ldr x10, [sp, #16]
; CHECK-NEXT: fmov s1, w8
; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: tst w9, #0x1
; CHECK-NEXT: ldp x8, x9, [sp]
; CHECK-NEXT: csel x11, x2, x6, ne
; CHECK-NEXT: str x11, [x10]
; CHECK-NEXT: csel x8, x4, x8, ne
; CHECK-NEXT: stur x8, [x10, #12]
; CHECK-NEXT: csel x8, x5, x9, ne
; CHECK-NEXT: csel x9, x3, x7, ne
; CHECK-NEXT: str w8, [x10, #20]
; CHECK-NEXT: str w9, [x10, #8]
; CHECK-NEXT: ldp x8, x10, [sp]
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: ldr x9, [sp, #16]
; CHECK-NEXT: csel x11, x2, x6, eq
; CHECK-NEXT: csel x8, x4, x8, eq
; CHECK-NEXT: str x11, [x9]
; CHECK-NEXT: stur x8, [x9, #12]
; CHECK-NEXT: csel x8, x5, x10, eq
; CHECK-NEXT: csel x10, x3, x7, eq
; CHECK-NEXT: str w8, [x9, #20]
; CHECK-NEXT: str w10, [x9, #8]
; CHECK-NEXT: ret
%cond = and i32 %In1, 1
%cbool = icmp eq i32 %cond, 0
Expand Down
46 changes: 46 additions & 0 deletions llvm/test/CodeGen/AArch64/neon-anyof-splat.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc %s -o - | FileCheck %s
target triple = "aarch64-linux-gnu"

;; An 'AnyOf' reduction (vector.reduce.or) is instcombined to a bitcast to an
;; integer of a bitwidth equal to the number of lanes being reduced, then
;; compared against zero. To select between vectors for NEON, we then need to
;; broadcast the result, but we must be careful when the bitwidth of the scalar
;; result is smaller than the element size of the vectors being selected. We
;; don't want to end up with scalarization.

;; Four-lane case: picks one of the whole vectors %a / %b from a single scalar
;; condition computed over every lane of %mask. %a is returned when no lane of
;; %mask is negative, %b otherwise. The scalar condition is derived from an i4
;; (one bit per lane), which is narrower than the i32 elements being selected.
;; The expected assembly keeps the select in vector form: csetm + dup build the
;; lane mask that feeds bsl.
define <4 x i32> @any_of_select_vf4(<4 x i32> %mask, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: any_of_select_vf4:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
; CHECK-NEXT: umaxv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: tst w8, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v0.4s, w8
; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
; CHECK-NEXT: ret
; Lane-wise signed "less than zero" test, yielding one i1 per lane.
%cmp = icmp slt <4 x i32> %mask, zeroinitializer
; Reinterpret the four i1 lanes as a single 4-bit integer.
%cmp.bc = bitcast <4 x i1> %cmp to i4
; True only when every lane bit is clear, i.e. no lane compared negative.
%cmp.bc.not = icmp eq i4 %cmp.bc, 0
; A scalar i1 condition selects between the two whole vectors.
%res = select i1 %cmp.bc.not, <4 x i32> %a, <4 x i32> %b
ret <4 x i32> %res
}

;; Two-lane case: picks one of the whole vectors %a / %b from a single scalar
;; condition computed over both lanes of %mask. %a is returned when neither
;; lane of %mask is negative, %b otherwise. The scalar condition is derived
;; from an i2 (one bit per lane), which is narrower than the i64 elements being
;; selected. The expected assembly keeps the select in vector form: csetm + dup
;; build the lane mask that feeds bsl.
define <2 x i64> @any_of_select_vf2(<2 x i64> %mask, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: any_of_select_vf2:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
; CHECK-NEXT: umaxv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: tst w8, #0x1
; CHECK-NEXT: csetm x8, ne
; CHECK-NEXT: dup v0.2d, x8
; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
; CHECK-NEXT: ret
; Lane-wise signed "less than zero" test, yielding one i1 per lane.
%cmp = icmp slt <2 x i64> %mask, zeroinitializer
; Reinterpret the two i1 lanes as a single 2-bit integer.
%cmp.bc = bitcast <2 x i1> %cmp to i2
; True only when both lane bits are clear, i.e. no lane compared negative.
%cmp.bc.not = icmp eq i2 %cmp.bc, 0
; A scalar i1 condition selects between the two whole vectors.
%res = select i1 %cmp.bc.not, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %res
}