
Conversation

@wzssyqa wzssyqa (Contributor) commented Sep 22, 2024

If SETCC or VSELECT is not legal for the vector type, we should not expand the operation; instead we can split the vectors.

That way, simple scalar instructions can be emitted instead of compare+select pairs.
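
The effect in brief: returning an empty SDValue from createSelectForFMINNUM_FMAXNUM lets generic legalization split or scalarize the node, so a target with a legal scalar vminnm/vmaxnm emits those directly instead of a compare+select pair per element. Below is a minimal standalone model of that decision, using hypothetical types rather than LLVM's real TargetLowering API; the actual change is the four-line hunk in TargetLowering.cpp shown in the diff further down.

```cpp
#include <cstdio>

// Standalone model of the decision this patch adds. TargetModel and
// Legality are hypothetical stand-ins, not LLVM's real API.
enum class Legality { Legal, Expand };

struct TargetModel {
  Legality scalarMinNum;   // e.g. vminnm.f64 on ARMv8: Legal
  Legality vectorSetCC;    // e.g. SETCC on v2f64: Expand
  Legality vectorVSelect;  // e.g. VSELECT on v2f64: Expand
};

// Returns true when the compare+select expansion should be skipped so
// that generic legalization splits/scalarizes the vector op instead.
bool skipCompareSelectExpansion(const TargetModel &T, bool IsVector) {
  if (!IsVector)
    return false;
  // First revision of the patch: bail out when the scalar form is legal,
  // so scalarization emits one vminnm/vmaxnm per element.
  return T.scalarMinNum == Legality::Legal;
}

int main() {
  TargetModel ARMv8{Legality::Legal, Legality::Expand, Legality::Expand};
  std::printf("skip expansion for v2f64 fminnum: %s\n",
              skipCompareSelectExpansion(ARMv8, /*IsVector=*/true) ? "yes"
                                                                   : "no");
}
```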

@llvmbot llvmbot added the backend:ARM and llvm:SelectionDAG labels Sep 22, 2024
@llvmbot llvmbot (Member) commented Sep 22, 2024

@llvm/pr-subscribers-backend-aarch64
@llvm/pr-subscribers-llvm-selectiondag

@llvm/pr-subscribers-backend-arm

Author: YunQiang Su (wzssyqa)

Changes

If we are working on a vector and the operation is legal for its elements, skipping is better than expanding. That way, simple scalar instructions are emitted instead of compares and selects.


Patch is 31.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109570.diff

3 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+4)
  • (modified) llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll (+90-162)
  • (modified) llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll (+76-188)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a2a232ed93b72f..617946e4bc2a7e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8422,6 +8422,10 @@ TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node,
           Opcode == ISD::STRICT_FMINNUM || Opcode == ISD::STRICT_FMAXNUM) &&
          "Wrong opcode");
 
+  EVT VT = Node->getValueType(0);
+  if (VT.isVector() && isOperationLegal(Opcode, VT.getScalarType()))
+    return SDValue();
+
   if (Node->getFlags().hasNoNaNs()) {
     ISD::CondCode Pred = Opcode == ISD::FMINNUM ? ISD::SETLT : ISD::SETGT;
     SDValue Op1 = Node->getOperand(0);
diff --git a/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll b/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
index 528bfe0411730a..975f3860fb1ef6 100644
--- a/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
+++ b/llvm/test/CodeGen/ARM/minnum-maxnum-intrinsics.ll
@@ -918,32 +918,24 @@ define <2 x double> @fminnumv264_intrinsic(<2 x double> %x, <2 x double> %y) {
 ; ARMV8:       @ %bb.0:
 ; ARMV8-NEXT:    mov r12, sp
 ; ARMV8-NEXT:    vld1.64 {d16, d17}, [r12]
-; ARMV8-NEXT:    vmov d18, r0, r1
-; ARMV8-NEXT:    vmov d19, r2, r3
-; ARMV8-NEXT:    vcmp.f64 d16, d18
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vcmp.f64 d17, d19
-; ARMV8-NEXT:    vselgt.f64 d18, d18, d16
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vmov r0, r1, d18
-; ARMV8-NEXT:    vselgt.f64 d16, d19, d17
+; ARMV8-NEXT:    vmov d19, r0, r1
+; ARMV8-NEXT:    vmov d18, r2, r3
+; ARMV8-NEXT:    vminnm.f64 d19, d19, d16
+; ARMV8-NEXT:    vminnm.f64 d16, d18, d17
+; ARMV8-NEXT:    vmov r0, r1, d19
 ; ARMV8-NEXT:    vmov r2, r3, d16
 ; ARMV8-NEXT:    bx lr
 ;
 ; ARMV8M-LABEL: fminnumv264_intrinsic:
 ; ARMV8M:       @ %bb.0:
 ; ARMV8M-NEXT:    mov r12, sp
-; ARMV8M-NEXT:    vmov d0, r0, r1
+; ARMV8M-NEXT:    vmov d0, r2, r3
 ; ARMV8M-NEXT:    vldrw.u32 q1, [r12]
-; ARMV8M-NEXT:    vmov d1, r2, r3
-; ARMV8M-NEXT:    vcmp.f64 d2, d0
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vcmp.f64 d3, d1
-; ARMV8M-NEXT:    vselgt.f64 d0, d0, d2
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vmov r0, r1, d0
-; ARMV8M-NEXT:    vselgt.f64 d1, d1, d3
-; ARMV8M-NEXT:    vmov r2, r3, d1
+; ARMV8M-NEXT:    vmov d1, r0, r1
+; ARMV8M-NEXT:    vminnm.f64 d1, d1, d2
+; ARMV8M-NEXT:    vminnm.f64 d0, d0, d3
+; ARMV8M-NEXT:    vmov r0, r1, d1
+; ARMV8M-NEXT:    vmov r2, r3, d0
 ; ARMV8M-NEXT:    bx lr
   %a = call nnan <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> %y)
   ret <2 x double> %a
@@ -970,32 +962,24 @@ define <2 x double> @fminnumv264_nsz_intrinsic(<2 x double> %x, <2 x double> %y)
 ; ARMV8:       @ %bb.0:
 ; ARMV8-NEXT:    mov r12, sp
 ; ARMV8-NEXT:    vld1.64 {d16, d17}, [r12]
-; ARMV8-NEXT:    vmov d18, r0, r1
-; ARMV8-NEXT:    vmov d19, r2, r3
-; ARMV8-NEXT:    vcmp.f64 d16, d18
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vcmp.f64 d17, d19
-; ARMV8-NEXT:    vselgt.f64 d18, d18, d16
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vmov r0, r1, d18
-; ARMV8-NEXT:    vselgt.f64 d16, d19, d17
+; ARMV8-NEXT:    vmov d19, r0, r1
+; ARMV8-NEXT:    vmov d18, r2, r3
+; ARMV8-NEXT:    vminnm.f64 d19, d19, d16
+; ARMV8-NEXT:    vminnm.f64 d16, d18, d17
+; ARMV8-NEXT:    vmov r0, r1, d19
 ; ARMV8-NEXT:    vmov r2, r3, d16
 ; ARMV8-NEXT:    bx lr
 ;
 ; ARMV8M-LABEL: fminnumv264_nsz_intrinsic:
 ; ARMV8M:       @ %bb.0:
 ; ARMV8M-NEXT:    mov r12, sp
-; ARMV8M-NEXT:    vmov d0, r0, r1
+; ARMV8M-NEXT:    vmov d0, r2, r3
 ; ARMV8M-NEXT:    vldrw.u32 q1, [r12]
-; ARMV8M-NEXT:    vmov d1, r2, r3
-; ARMV8M-NEXT:    vcmp.f64 d2, d0
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vcmp.f64 d3, d1
-; ARMV8M-NEXT:    vselgt.f64 d0, d0, d2
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vmov r0, r1, d0
-; ARMV8M-NEXT:    vselgt.f64 d1, d1, d3
-; ARMV8M-NEXT:    vmov r2, r3, d1
+; ARMV8M-NEXT:    vmov d1, r0, r1
+; ARMV8M-NEXT:    vminnm.f64 d1, d1, d2
+; ARMV8M-NEXT:    vminnm.f64 d0, d0, d3
+; ARMV8M-NEXT:    vmov r0, r1, d1
+; ARMV8M-NEXT:    vmov r2, r3, d0
 ; ARMV8M-NEXT:    bx lr
   %a = call nnan nsz <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> %y)
   ret <2 x double> %a
@@ -1020,31 +1004,23 @@ define <2 x double> @fminnumv264_non_zero_intrinsic(<2 x double> %x) {
 ;
 ; ARMV8-LABEL: fminnumv264_non_zero_intrinsic:
 ; ARMV8:       @ %bb.0:
-; ARMV8-NEXT:    vmov d17, r0, r1
 ; ARMV8-NEXT:    vmov.f64 d16, #1.000000e+00
-; ARMV8-NEXT:    vcmp.f64 d16, d17
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vmov d18, r2, r3
-; ARMV8-NEXT:    vcmp.f64 d16, d18
-; ARMV8-NEXT:    vselgt.f64 d17, d17, d16
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vmov r0, r1, d17
-; ARMV8-NEXT:    vselgt.f64 d16, d18, d16
+; ARMV8-NEXT:    vmov d18, r0, r1
+; ARMV8-NEXT:    vmov d17, r2, r3
+; ARMV8-NEXT:    vminnm.f64 d18, d18, d16
+; ARMV8-NEXT:    vminnm.f64 d16, d17, d16
+; ARMV8-NEXT:    vmov r0, r1, d18
 ; ARMV8-NEXT:    vmov r2, r3, d16
 ; ARMV8-NEXT:    bx lr
 ;
 ; ARMV8M-LABEL: fminnumv264_non_zero_intrinsic:
 ; ARMV8M:       @ %bb.0:
-; ARMV8M-NEXT:    vmov d1, r0, r1
 ; ARMV8M-NEXT:    vmov.f64 d0, #1.000000e+00
-; ARMV8M-NEXT:    vcmp.f64 d0, d1
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vmov d2, r2, r3
-; ARMV8M-NEXT:    vcmp.f64 d0, d2
-; ARMV8M-NEXT:    vselgt.f64 d1, d1, d0
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vmov r0, r1, d1
-; ARMV8M-NEXT:    vselgt.f64 d0, d2, d0
+; ARMV8M-NEXT:    vmov d2, r0, r1
+; ARMV8M-NEXT:    vmov d1, r2, r3
+; ARMV8M-NEXT:    vminnm.f64 d2, d2, d0
+; ARMV8M-NEXT:    vminnm.f64 d0, d1, d0
+; ARMV8M-NEXT:    vmov r0, r1, d2
 ; ARMV8M-NEXT:    vmov r2, r3, d0
 ; ARMV8M-NEXT:    bx lr
   %a = call nnan <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double><double 1.0, double 1.0>)
@@ -1070,34 +1046,26 @@ define <2 x double> @fminnumv264_one_zero_intrinsic(<2 x double> %x) {
 ;
 ; ARMV8-LABEL: fminnumv264_one_zero_intrinsic:
 ; ARMV8:       @ %bb.0:
-; ARMV8-NEXT:    vmov d19, r2, r3
-; ARMV8-NEXT:    vcmp.f64 d19, #0
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vmov d18, r0, r1
 ; ARMV8-NEXT:    vmov.f64 d16, #-1.000000e+00
-; ARMV8-NEXT:    vcmp.f64 d16, d18
+; ARMV8-NEXT:    vmov d18, r0, r1
 ; ARMV8-NEXT:    vmov.i32 d17, #0x0
-; ARMV8-NEXT:    vmovlt.f64 d17, d19
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vmov r2, r3, d17
-; ARMV8-NEXT:    vselgt.f64 d16, d18, d16
+; ARMV8-NEXT:    vminnm.f64 d16, d18, d16
+; ARMV8-NEXT:    vmov d19, r2, r3
+; ARMV8-NEXT:    vminnm.f64 d17, d19, d17
 ; ARMV8-NEXT:    vmov r0, r1, d16
+; ARMV8-NEXT:    vmov r2, r3, d17
 ; ARMV8-NEXT:    bx lr
 ;
 ; ARMV8M-LABEL: fminnumv264_one_zero_intrinsic:
 ; ARMV8M:       @ %bb.0:
-; ARMV8M-NEXT:    vmov d3, r2, r3
+; ARMV8M-NEXT:    vmov.f64 d0, #-1.000000e+00
 ; ARMV8M-NEXT:    vldr d1, .LCPI27_0
-; ARMV8M-NEXT:    vcmp.f64 d3, #0
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
 ; ARMV8M-NEXT:    vmov d2, r0, r1
-; ARMV8M-NEXT:    vmov.f64 d0, #-1.000000e+00
-; ARMV8M-NEXT:    vcmp.f64 d0, d2
-; ARMV8M-NEXT:    vmovlt.f64 d1, d3
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vmov r2, r3, d1
-; ARMV8M-NEXT:    vselgt.f64 d0, d2, d0
+; ARMV8M-NEXT:    vmov d3, r2, r3
+; ARMV8M-NEXT:    vminnm.f64 d0, d2, d0
+; ARMV8M-NEXT:    vminnm.f64 d1, d3, d1
 ; ARMV8M-NEXT:    vmov r0, r1, d0
+; ARMV8M-NEXT:    vmov r2, r3, d1
 ; ARMV8M-NEXT:    bx lr
 ; ARMV8M-NEXT:    .p2align 3
 ; ARMV8M-NEXT:  @ %bb.1:
@@ -1129,31 +1097,23 @@ define <2 x double> @fmaxnumv264_intrinsic(<2 x double> %x, <2 x double> %y) {
 ; ARMV8:       @ %bb.0:
 ; ARMV8-NEXT:    mov r12, sp
 ; ARMV8-NEXT:    vld1.64 {d16, d17}, [r12]
-; ARMV8-NEXT:    vmov d18, r0, r1
-; ARMV8-NEXT:    vcmp.f64 d18, d16
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vmov d19, r2, r3
-; ARMV8-NEXT:    vcmp.f64 d19, d17
-; ARMV8-NEXT:    vselgt.f64 d18, d18, d16
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vmov r0, r1, d18
-; ARMV8-NEXT:    vselgt.f64 d16, d19, d17
+; ARMV8-NEXT:    vmov d19, r0, r1
+; ARMV8-NEXT:    vmov d18, r2, r3
+; ARMV8-NEXT:    vmaxnm.f64 d19, d19, d16
+; ARMV8-NEXT:    vmaxnm.f64 d16, d18, d17
+; ARMV8-NEXT:    vmov r0, r1, d19
 ; ARMV8-NEXT:    vmov r2, r3, d16
 ; ARMV8-NEXT:    bx lr
 ;
 ; ARMV8M-LABEL: fmaxnumv264_intrinsic:
 ; ARMV8M:       @ %bb.0:
 ; ARMV8M-NEXT:    mov r12, sp
-; ARMV8M-NEXT:    vmov d1, r0, r1
-; ARMV8M-NEXT:    vldrw.u32 q1, [r12]
 ; ARMV8M-NEXT:    vmov d0, r2, r3
-; ARMV8M-NEXT:    vcmp.f64 d1, d2
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vcmp.f64 d0, d3
-; ARMV8M-NEXT:    vselgt.f64 d1, d1, d2
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
+; ARMV8M-NEXT:    vldrw.u32 q1, [r12]
+; ARMV8M-NEXT:    vmov d1, r0, r1
+; ARMV8M-NEXT:    vmaxnm.f64 d1, d1, d2
+; ARMV8M-NEXT:    vmaxnm.f64 d0, d0, d3
 ; ARMV8M-NEXT:    vmov r0, r1, d1
-; ARMV8M-NEXT:    vselgt.f64 d0, d0, d3
 ; ARMV8M-NEXT:    vmov r2, r3, d0
 ; ARMV8M-NEXT:    bx lr
   %a = call nnan <2 x double> @llvm.maxnum.v2f64(<2 x double> %x, <2 x double> %y)
@@ -1181,31 +1141,23 @@ define <2 x double> @fmaxnumv264_nsz_intrinsic(<2 x double> %x, <2 x double> %y)
 ; ARMV8:       @ %bb.0:
 ; ARMV8-NEXT:    mov r12, sp
 ; ARMV8-NEXT:    vld1.64 {d16, d17}, [r12]
-; ARMV8-NEXT:    vmov d18, r0, r1
-; ARMV8-NEXT:    vcmp.f64 d18, d16
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vmov d19, r2, r3
-; ARMV8-NEXT:    vcmp.f64 d19, d17
-; ARMV8-NEXT:    vselgt.f64 d18, d18, d16
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vmov r0, r1, d18
-; ARMV8-NEXT:    vselgt.f64 d16, d19, d17
+; ARMV8-NEXT:    vmov d19, r0, r1
+; ARMV8-NEXT:    vmov d18, r2, r3
+; ARMV8-NEXT:    vmaxnm.f64 d19, d19, d16
+; ARMV8-NEXT:    vmaxnm.f64 d16, d18, d17
+; ARMV8-NEXT:    vmov r0, r1, d19
 ; ARMV8-NEXT:    vmov r2, r3, d16
 ; ARMV8-NEXT:    bx lr
 ;
 ; ARMV8M-LABEL: fmaxnumv264_nsz_intrinsic:
 ; ARMV8M:       @ %bb.0:
 ; ARMV8M-NEXT:    mov r12, sp
-; ARMV8M-NEXT:    vmov d1, r0, r1
-; ARMV8M-NEXT:    vldrw.u32 q1, [r12]
 ; ARMV8M-NEXT:    vmov d0, r2, r3
-; ARMV8M-NEXT:    vcmp.f64 d1, d2
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vcmp.f64 d0, d3
-; ARMV8M-NEXT:    vselgt.f64 d1, d1, d2
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
+; ARMV8M-NEXT:    vldrw.u32 q1, [r12]
+; ARMV8M-NEXT:    vmov d1, r0, r1
+; ARMV8M-NEXT:    vmaxnm.f64 d1, d1, d2
+; ARMV8M-NEXT:    vmaxnm.f64 d0, d0, d3
 ; ARMV8M-NEXT:    vmov r0, r1, d1
-; ARMV8M-NEXT:    vselgt.f64 d0, d0, d3
 ; ARMV8M-NEXT:    vmov r2, r3, d0
 ; ARMV8M-NEXT:    bx lr
   %a = call nnan nsz <2 x double> @llvm.maxnum.v2f64(<2 x double> %x, <2 x double> %y)
@@ -1236,18 +1188,14 @@ define <2 x double> @fmaxnumv264_zero_intrinsic(<2 x double> %x) {
 ;
 ; ARMV8-LABEL: fmaxnumv264_zero_intrinsic:
 ; ARMV8:       @ %bb.0:
-; ARMV8-NEXT:    vmov d18, r0, r1
 ; ARMV8-NEXT:    vldr d16, .LCPI30_0
-; ARMV8-NEXT:    vcmp.f64 d18, #0
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vmov d19, r2, r3
-; ARMV8-NEXT:    vcmp.f64 d19, d16
+; ARMV8-NEXT:    vmov d18, r2, r3
 ; ARMV8-NEXT:    vmov.i32 d17, #0x0
-; ARMV8-NEXT:    vselgt.f64 d17, d18, d17
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vmov r0, r1, d17
-; ARMV8-NEXT:    vselgt.f64 d16, d19, d16
+; ARMV8-NEXT:    vmov d19, r0, r1
+; ARMV8-NEXT:    vmaxnm.f64 d16, d18, d16
+; ARMV8-NEXT:    vmaxnm.f64 d17, d19, d17
 ; ARMV8-NEXT:    vmov r2, r3, d16
+; ARMV8-NEXT:    vmov r0, r1, d17
 ; ARMV8-NEXT:    bx lr
 ; ARMV8-NEXT:    .p2align 3
 ; ARMV8-NEXT:  @ %bb.1:
@@ -1257,18 +1205,14 @@ define <2 x double> @fmaxnumv264_zero_intrinsic(<2 x double> %x) {
 ;
 ; ARMV8M-LABEL: fmaxnumv264_zero_intrinsic:
 ; ARMV8M:       @ %bb.0:
-; ARMV8M-NEXT:    vmov d2, r0, r1
 ; ARMV8M-NEXT:    vldr d0, .LCPI30_0
-; ARMV8M-NEXT:    vcmp.f64 d2, #0
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vmov d3, r2, r3
-; ARMV8M-NEXT:    vcmp.f64 d3, d0
+; ARMV8M-NEXT:    vmov d2, r2, r3
 ; ARMV8M-NEXT:    vldr d1, .LCPI30_1
-; ARMV8M-NEXT:    vselgt.f64 d1, d2, d1
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vmov r0, r1, d1
-; ARMV8M-NEXT:    vselgt.f64 d0, d3, d0
+; ARMV8M-NEXT:    vmov d3, r0, r1
+; ARMV8M-NEXT:    vmaxnm.f64 d0, d2, d0
+; ARMV8M-NEXT:    vmaxnm.f64 d1, d3, d1
 ; ARMV8M-NEXT:    vmov r2, r3, d0
+; ARMV8M-NEXT:    vmov r0, r1, d1
 ; ARMV8M-NEXT:    bx lr
 ; ARMV8M-NEXT:    .p2align 3
 ; ARMV8M-NEXT:  @ %bb.1:
@@ -1307,15 +1251,11 @@ define <2 x double> @fmaxnumv264_minus_zero_intrinsic(<2 x double> %x) {
 ; ARMV8-LABEL: fmaxnumv264_minus_zero_intrinsic:
 ; ARMV8:       @ %bb.0:
 ; ARMV8-NEXT:    vldr d16, .LCPI31_0
-; ARMV8-NEXT:    vmov d17, r0, r1
-; ARMV8-NEXT:    vmov d18, r2, r3
-; ARMV8-NEXT:    vcmp.f64 d17, d16
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vcmp.f64 d18, d16
-; ARMV8-NEXT:    vselgt.f64 d17, d17, d16
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vmov r0, r1, d17
-; ARMV8-NEXT:    vselgt.f64 d16, d18, d16
+; ARMV8-NEXT:    vmov d18, r0, r1
+; ARMV8-NEXT:    vmov d17, r2, r3
+; ARMV8-NEXT:    vmaxnm.f64 d18, d18, d16
+; ARMV8-NEXT:    vmaxnm.f64 d16, d17, d16
+; ARMV8-NEXT:    vmov r0, r1, d18
 ; ARMV8-NEXT:    vmov r2, r3, d16
 ; ARMV8-NEXT:    bx lr
 ; ARMV8-NEXT:    .p2align 3
@@ -1327,15 +1267,11 @@ define <2 x double> @fmaxnumv264_minus_zero_intrinsic(<2 x double> %x) {
 ; ARMV8M-LABEL: fmaxnumv264_minus_zero_intrinsic:
 ; ARMV8M:       @ %bb.0:
 ; ARMV8M-NEXT:    vldr d0, .LCPI31_0
-; ARMV8M-NEXT:    vmov d1, r0, r1
-; ARMV8M-NEXT:    vmov d2, r2, r3
-; ARMV8M-NEXT:    vcmp.f64 d1, d0
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vcmp.f64 d2, d0
-; ARMV8M-NEXT:    vselgt.f64 d1, d1, d0
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vmov r0, r1, d1
-; ARMV8M-NEXT:    vselgt.f64 d0, d2, d0
+; ARMV8M-NEXT:    vmov d2, r0, r1
+; ARMV8M-NEXT:    vmov d1, r2, r3
+; ARMV8M-NEXT:    vmaxnm.f64 d2, d2, d0
+; ARMV8M-NEXT:    vmaxnm.f64 d0, d1, d0
+; ARMV8M-NEXT:    vmov r0, r1, d2
 ; ARMV8M-NEXT:    vmov r2, r3, d0
 ; ARMV8M-NEXT:    bx lr
 ; ARMV8M-NEXT:    .p2align 3
@@ -1367,30 +1303,22 @@ define <2 x double> @fmaxnumv264_non_zero_intrinsic(<2 x double> %x) {
 ; ARMV8-LABEL: fmaxnumv264_non_zero_intrinsic:
 ; ARMV8:       @ %bb.0:
 ; ARMV8-NEXT:    vmov.f64 d16, #1.000000e+00
-; ARMV8-NEXT:    vmov d17, r0, r1
-; ARMV8-NEXT:    vcmp.f64 d17, d16
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vmov d18, r2, r3
-; ARMV8-NEXT:    vcmp.f64 d18, d16
-; ARMV8-NEXT:    vselgt.f64 d17, d17, d16
-; ARMV8-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8-NEXT:    vmov r0, r1, d17
-; ARMV8-NEXT:    vselgt.f64 d16, d18, d16
+; ARMV8-NEXT:    vmov d18, r0, r1
+; ARMV8-NEXT:    vmov d17, r2, r3
+; ARMV8-NEXT:    vmaxnm.f64 d18, d18, d16
+; ARMV8-NEXT:    vmaxnm.f64 d16, d17, d16
+; ARMV8-NEXT:    vmov r0, r1, d18
 ; ARMV8-NEXT:    vmov r2, r3, d16
 ; ARMV8-NEXT:    bx lr
 ;
 ; ARMV8M-LABEL: fmaxnumv264_non_zero_intrinsic:
 ; ARMV8M:       @ %bb.0:
 ; ARMV8M-NEXT:    vmov.f64 d0, #1.000000e+00
-; ARMV8M-NEXT:    vmov d1, r0, r1
-; ARMV8M-NEXT:    vcmp.f64 d1, d0
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vmov d2, r2, r3
-; ARMV8M-NEXT:    vcmp.f64 d2, d0
-; ARMV8M-NEXT:    vselgt.f64 d1, d1, d0
-; ARMV8M-NEXT:    vmrs APSR_nzcv, fpscr
-; ARMV8M-NEXT:    vmov r0, r1, d1
-; ARMV8M-NEXT:    vselgt.f64 d0, d2, d0
+; ARMV8M-NEXT:    vmov d2, r0, r1
+; ARMV8M-NEXT:    vmov d1, r2, r3
+; ARMV8M-NEXT:    vmaxnm.f64 d2, d2, d0
+; ARMV8M-NEXT:    vmaxnm.f64 d0, d1, d0
+; ARMV8M-NEXT:    vmov r0, r1, d2
 ; ARMV8M-NEXT:    vmov r2, r3, d0
 ; ARMV8M-NEXT:    bx lr
   %a = call nnan <2 x double> @llvm.maxnum.v2f64(<2 x double> %x, <2 x double><double 1.0, double 1.0>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
index 7cafb7262f460d..20a90033659f64 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
@@ -43,21 +43,13 @@ define arm_aapcs_vfpcc float @fmin_v8f32(<8 x float> %x) {
 ;
 ; CHECK-NOFP-LABEL: fmin_v8f32:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vcmp.f32 s5, s1
-; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vcmp.f32 s4, s0
-; CHECK-NOFP-NEXT:    vselgt.f32 s8, s1, s5
-; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vcmp.f32 s6, s2
-; CHECK-NOFP-NEXT:    vselgt.f32 s0, s0, s4
-; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vcmp.f32 s7, s3
+; CHECK-NOFP-NEXT:    vminnm.f32 s8, s1, s5
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s8
-; CHECK-NOFP-NEXT:    vselgt.f32 s2, s2, s6
-; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NOFP-NEXT:    vminnm.f32 s2, s2, s6
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f32 s2, s3, s7
 ; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s2
-; CHECK-NOFP-NEXT:    vselgt.f32 s4, s3, s7
-; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s4
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x)
@@ -129,44 +121,28 @@ define arm_aapcs_vfpcc half @fmin_v16f16(<16 x half> %x) {
 ;
 ; CHECK-NOFP-LABEL: fmin_v16f16:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vmovx.f16 s8, s4
 ; CHECK-NOFP-NEXT:    vmovx.f16 s10, s0
-; CHECK-NOFP-NEXT:    vcmp.f16 s8, s10
-; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vcmp.f16 s4, s0
-; CHECK-NOFP-NEXT:    vselgt.f16 s8, s10, s8
-; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vcmp.f16 s5, s1
-; CHECK-NOFP-NEXT:    vselgt.f16 s0, s0, s4
-; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NOFP-NEXT:    vmovx.f16 s8, s4
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vminnm.f16 s8, s10, s8
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s8
-; CHECK-NOFP-NEXT:    vmovx.f16 s8, s1
-; CHECK-NOFP-NEXT:    vselgt.f16 s4, s1, s5
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s1, s5
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmovx.f16 s4, s5
-; CHECK-NOFP-NEXT:    vcmp.f16 s4, s8
-; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vcmp.f16 s6, s2
-; CHECK-NOFP-NEXT:    vselgt.f16 s4, s8, s4
-; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-NOFP-NEXT:    vmovx.f16 s8, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s8, s4
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
-; CHECK-NOFP-NEXT:    vselgt.f16 s4, s2, s6
-; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s2, s6
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
 ; CHECK-NOFP-NEXT:    vmovx.f16 s4, s6
-; CHECK-NOFP-NEXT:    vcmp.f16 s4, s2
-; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vcmp.f16 s7, s3
-; CHECK-NOFP-NEXT:    vselgt.f16 s2, s2, s4
-; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmovx.f16 s2, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s2, s2, s4
 ; CHECK-NOFP-NEXT:    vmovx.f16 s4, s3
-; CHECK-NOFP-NEXT:    vselgt.f16 s2, s3, s7
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s2, s3, s7
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    vmovx.f16 s2, s7
-; CHECK-NOFP-NEXT:    vcmp.f16 s2, s4
-; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vselgt.f16 s2, s4, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s2, s4, s2
 ; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
 ; CHECK-NOFP-NEXT:    bx lr
 entry:
@@ -196,12 +172,8 @@ entry:
 define arm_aapcs_vfpcc double @fmin_v4f64(<4 x double> %x) {
 ; CHECK-LABEL: fmin_v4f64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vcmp.f64 d3, d1
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vcmp.f64 d2, d0
-; CHECK-NEXT:    vselgt.f64 d1, d1, d3
-; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NEXT:    vselgt.f64 d0, d0, d2
+; CHECK-NEXT:    vminnm.f64 d1, d1, d3
+; CHECK-NEXT:    vminnm.f64 d0, d0, d2
 ; CHECK-NEXT:    vminnm.f64 d0, d0, d1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -435,21 +407,13 @@ define arm_aapcs_vfpcc float @fmin_v8f32_acc(<8 x float> %x, float %y) {
 ;
 ; CHECK-NOFP-LABEL: fmin_v8f32_acc:
 ; CHECK-NOFP:       @ %bb.0: @ %entry
-; CHECK-NOFP-NEXT:    vcmp.f32 s5, s1
-; CHECK-NOFP-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-NOFP-NEXT:    vcmp.f32 s4, s0
-; CHECK-NOFP-NEXT:    ...
[truncated]

@wzssyqa wzssyqa force-pushed the createSelectForFMINNUM_FMAXNUM branch from 875e7f9 to 9c70f3d on September 24, 2024 00:44
@wzssyqa wzssyqa changed the title from "SelectionDAG: createSelectForFMINNUM_FMAXNUM skips vector if Op is legal for elements" to "SelectionDAG/expandFMINNUM_FMAXNUM: skips vector if Op is legal for elements" on Sep 24, 2024
@wzssyqa wzssyqa requested a review from arsenm September 24, 2024 00:46
@arsenm arsenm requested a review from RKSimon September 24, 2024 10:22
@arsenm
Copy link
Contributor

arsenm commented Sep 24, 2024

We have really bad vector splitting logic if the vector type is legal. The correct vector splitting only happens by default for illegal vector types to narrower legal vector types in vector legalization. Most legal expansions go straight to scalarization, which this also does. This is fine, but most of the other instances I see check which vector operations are legal, not whether the scalar operation is legal. For consistency it's probably best to check the legality of the vector expansions.
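
A hedged sketch of that suggestion, reusing the hypothetical TargetModel/Legality types from the sketch above (still a model, not LLVM's API; the merged revision's exact predicate calls may differ):

```cpp
// Gate on the legality of the expansion's ingredients for the vector type,
// not on the legality of the scalar op: skip the compare+select expansion
// when the vector SETCC or VSELECT it would create is not legal.
bool skipCompareSelectExpansionV2(const TargetModel &T, bool IsVector) {
  if (!IsVector)
    return false;
  return T.vectorSetCC != Legality::Legal ||
         T.vectorVSelect != Legality::Legal;
}
```

This is the shape the PR converged on, per its final title: skip the vector case when SETCC/VSELECT is not legal.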

@wzssyqa wzssyqa force-pushed the createSelectForFMINNUM_FMAXNUM branch from 9c70f3d to e2e180b on October 6, 2024 15:23
@wzssyqa wzssyqa requested a review from arsenm October 6, 2024 15:24
@wzssyqa wzssyqa changed the title from "SelectionDAG/expandFMINNUM_FMAXNUM: skips vector if Op is legal for elements" to "SelectionDAG/createSelectForFMINNUM_FMAXNUM: skips vector if SETCC/VSELECT is not legal while Op is legal for elements" on Oct 6, 2024
@wzssyqa wzssyqa force-pushed the createSelectForFMINNUM_FMAXNUM branch from e2e180b to cc2f403 on October 9, 2024 08:15
@wzssyqa wzssyqa changed the title from "SelectionDAG/createSelectForFMINNUM_FMAXNUM: skips vector if SETCC/VSELECT is not legal while Op is legal for elements" to "SelectionDAG/expandFMINNUM_FMAXNUM: skips vector if SETCC/VSELECT is not legal" on Oct 9, 2024
…not legal

If SETCC or VSELECT is not legal for the vector type, we should not expand the operation; instead we can split the vectors.

That way, simple scalar instructions can be emitted instead of compare+select pairs.
@wzssyqa wzssyqa force-pushed the createSelectForFMINNUM_FMAXNUM branch from cc2f403 to d50cafd on October 9, 2024 11:01
@wzssyqa wzssyqa merged commit d52c840 into llvm:main Oct 10, 2024
10 checks passed
DanielCChen pushed a commit to DanielCChen/llvm-project that referenced this pull request Oct 16, 2024
…not legal (llvm#109570)

If SETCC or VSELECT is not legal for the vector type, we should not expand the operation; instead we can split the vectors.

That way, simple scalar instructions can be emitted instead of compare+select pairs.