[X86][FP16] Do not generate X86 FMIN/FMAX for FP16 when VLX not enabled #143100

phoebewang · 2025-06-06T09:44:33Z

Fixes: https://godbolt.org/z/7jYa3bWK9

llvmbot · 2025-06-06T09:45:14Z

@llvm/pr-subscribers-backend-x86

Author: Phoebe Wang (phoebewang)

Changes

Fixes: https://godbolt.org/z/7jYa3bWK9

Patch is 76.58 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/143100.diff

3 Files Affected:

(modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+8-1)
(modified) llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll (+435-40)
(modified) llvm/test/CodeGen/X86/avx512fp16-fminnum.ll (+435-40)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e929dab429de5..1555b8a669ae8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55357,10 +55357,17 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
+  auto IsMinMaxLegal = [&](EVT VT) {
+    if (!TLI.isTypeLegal(VT))
+      return false;
+    return VT.getScalarType() != MVT::f16 ||
+           (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
+  };
+
   if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
         (Subtarget.hasSSE2() && VT == MVT::f64) ||
         (Subtarget.hasFP16() && VT == MVT::f16) ||
-        (VT.isVector() && TLI.isTypeLegal(VT))))
+        (VT.isVector() && IsMinMaxLegal(VT))))
     return SDValue();
 
   SDValue Op0 = N->getOperand(0);
diff --git a/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll b/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll
index 1d535f93bc867..eac803f83e863 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,avx512vl    | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,avx512vl    | FileCheck %s --check-prefixes=CHECK,HasVL
+; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16    | FileCheck %s --check-prefixes=CHECK,NOVL
 
 declare half @llvm.maxnum.f16(half, half)
 declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)
@@ -9,61 +10,397 @@ declare <16 x half> @llvm.maxnum.v16f16(<16 x half>, <16 x half>)
 declare <32 x half> @llvm.maxnum.v32f16(<32 x half>, <32 x half>)
 
 define half @test_intrinsic_fmaxh(half %x, half %y) {
-; CHECK-LABEL: test_intrinsic_fmaxh:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmaxsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xd0]
-; CHECK-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT:    vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1]
-; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmaxh:
+; HasVL:       # %bb.0:
+; HasVL-NEXT:    vmaxsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xd0]
+; HasVL-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT:    vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1]
+; HasVL-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; HasVL-NEXT:    retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmaxh:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vmaxsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xd0]
+; NOVL-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT:    vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1]
+; NOVL-NEXT:    vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
+; NOVL-NEXT:    retq # encoding: [0xc3]
   %z = call half @llvm.maxnum.f16(half %x, half %y) readnone
   ret half %z
 }
 
 define <2 x half> @test_intrinsic_fmax_v2f16(<2 x half> %x, <2 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmax_v2f16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
-; CHECK-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmax_v2f16:
+; HasVL:       # %bb.0:
+; HasVL-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
+; HasVL-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
+; HasVL-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; HasVL-NEXT:    retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmax_v2f16:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vpsrldq $14, %xmm0, %xmm2 # encoding: [0xc5,0xe9,0x73,0xd8,0x0e]
+; NOVL-NEXT:    # xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vpsrldq $14, %xmm1, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd9,0x0e]
+; NOVL-NEXT:    # xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vmaxsh %xmm2, %xmm3, %xmm4 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xe2]
+; NOVL-NEXT:    vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT:    vmovsh %xmm3, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe3]
+; NOVL-NEXT:    vshufps $255, %xmm0, %xmm0, %xmm2 # encoding: [0xc5,0xf8,0xc6,0xd0,0xff]
+; NOVL-NEXT:    # xmm2 = xmm0[3,3,3,3]
+; NOVL-NEXT:    vpshufd $255, %xmm1, %xmm3 # encoding: [0xc5,0xf9,0x70,0xd9,0xff]
+; NOVL-NEXT:    # xmm3 = xmm1[3,3,3,3]
+; NOVL-NEXT:    vmaxsh %xmm2, %xmm3, %xmm5 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xea]
+; NOVL-NEXT:    vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT:    vmovsh %xmm3, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xeb]
+; NOVL-NEXT:    vpunpcklwd %xmm4, %xmm5, %xmm2 # encoding: [0xc5,0xd1,0x61,0xd4]
+; NOVL-NEXT:    # xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; NOVL-NEXT:    vpsrldq $10, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd8,0x0a]
+; NOVL-NEXT:    # xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vpsrldq $10, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd9,0x0a]
+; NOVL-NEXT:    # xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xeb]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT:    vshufpd $1, %xmm0, %xmm0, %xmm3 # encoding: [0xc5,0xf9,0xc6,0xd8,0x01]
+; NOVL-NEXT:    # xmm3 = xmm0[1,0]
+; NOVL-NEXT:    vshufpd $1, %xmm1, %xmm1, %xmm4 # encoding: [0xc5,0xf1,0xc6,0xe1,0x01]
+; NOVL-NEXT:    # xmm4 = xmm1[1,0]
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xf3]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT:    vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT:    # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT:    vpunpckldq %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe1,0x62,0xd2]
+; NOVL-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; NOVL-NEXT:    vpsrlq $48, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd0,0x30]
+; NOVL-NEXT:    vpsrlq $48, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd1,0x30]
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xeb]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT:    vmovshdup %xmm0, %xmm3 # encoding: [0xc5,0xfa,0x16,0xd8]
+; NOVL-NEXT:    # xmm3 = xmm0[1,1,3,3]
+; NOVL-NEXT:    vmovshdup %xmm1, %xmm4 # encoding: [0xc5,0xfa,0x16,0xe1]
+; NOVL-NEXT:    # xmm4 = xmm1[1,1,3,3]
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xf3]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT:    vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT:    # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT:    vmaxsh %xmm0, %xmm1, %xmm4 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xe0]
+; NOVL-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT:    vmovsh %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe1]
+; NOVL-NEXT:    vpsrld $16, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xd0,0x10]
+; NOVL-NEXT:    vpsrld $16, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x72,0xd1,0x10]
+; NOVL-NEXT:    vmaxsh %xmm0, %xmm1, %xmm5 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xe8]
+; NOVL-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT:    vmovsh %xmm1, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe9]
+; NOVL-NEXT:    vpunpcklwd %xmm5, %xmm4, %xmm0 # encoding: [0xc5,0xd9,0x61,0xc5]
+; NOVL-NEXT:    # xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; NOVL-NEXT:    vpunpckldq %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x62,0xc3]
+; NOVL-NEXT:    # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; NOVL-NEXT:    vpunpcklqdq %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x6c,0xc2]
+; NOVL-NEXT:    # xmm0 = xmm0[0],xmm2[0]
+; NOVL-NEXT:    retq # encoding: [0xc3]
   %z = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x, <2 x half> %y) readnone
   ret <2 x half> %z
 }
 
 define <4 x half> @test_intrinsic_fmax_v4f16(<4 x half> %x, <4 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmax_v4f16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
-; CHECK-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmax_v4f16:
+; HasVL:       # %bb.0:
+; HasVL-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
+; HasVL-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
+; HasVL-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; HasVL-NEXT:    retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmax_v4f16:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vpsrldq $14, %xmm0, %xmm2 # encoding: [0xc5,0xe9,0x73,0xd8,0x0e]
+; NOVL-NEXT:    # xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vpsrldq $14, %xmm1, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd9,0x0e]
+; NOVL-NEXT:    # xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vmaxsh %xmm2, %xmm3, %xmm4 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xe2]
+; NOVL-NEXT:    vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT:    vmovsh %xmm3, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe3]
+; NOVL-NEXT:    vshufps $255, %xmm0, %xmm0, %xmm2 # encoding: [0xc5,0xf8,0xc6,0xd0,0xff]
+; NOVL-NEXT:    # xmm2 = xmm0[3,3,3,3]
+; NOVL-NEXT:    vpshufd $255, %xmm1, %xmm3 # encoding: [0xc5,0xf9,0x70,0xd9,0xff]
+; NOVL-NEXT:    # xmm3 = xmm1[3,3,3,3]
+; NOVL-NEXT:    vmaxsh %xmm2, %xmm3, %xmm5 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xea]
+; NOVL-NEXT:    vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT:    vmovsh %xmm3, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xeb]
+; NOVL-NEXT:    vpunpcklwd %xmm4, %xmm5, %xmm2 # encoding: [0xc5,0xd1,0x61,0xd4]
+; NOVL-NEXT:    # xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; NOVL-NEXT:    vpsrldq $10, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd8,0x0a]
+; NOVL-NEXT:    # xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vpsrldq $10, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd9,0x0a]
+; NOVL-NEXT:    # xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xeb]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT:    vshufpd $1, %xmm0, %xmm0, %xmm3 # encoding: [0xc5,0xf9,0xc6,0xd8,0x01]
+; NOVL-NEXT:    # xmm3 = xmm0[1,0]
+; NOVL-NEXT:    vshufpd $1, %xmm1, %xmm1, %xmm4 # encoding: [0xc5,0xf1,0xc6,0xe1,0x01]
+; NOVL-NEXT:    # xmm4 = xmm1[1,0]
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xf3]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT:    vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT:    # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT:    vpunpckldq %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe1,0x62,0xd2]
+; NOVL-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; NOVL-NEXT:    vpsrlq $48, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd0,0x30]
+; NOVL-NEXT:    vpsrlq $48, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd1,0x30]
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xeb]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT:    vmovshdup %xmm0, %xmm3 # encoding: [0xc5,0xfa,0x16,0xd8]
+; NOVL-NEXT:    # xmm3 = xmm0[1,1,3,3]
+; NOVL-NEXT:    vmovshdup %xmm1, %xmm4 # encoding: [0xc5,0xfa,0x16,0xe1]
+; NOVL-NEXT:    # xmm4 = xmm1[1,1,3,3]
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xf3]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT:    vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT:    # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT:    vmaxsh %xmm0, %xmm1, %xmm4 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xe0]
+; NOVL-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT:    vmovsh %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe1]
+; NOVL-NEXT:    vpsrld $16, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xd0,0x10]
+; NOVL-NEXT:    vpsrld $16, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x72,0xd1,0x10]
+; NOVL-NEXT:    vmaxsh %xmm0, %xmm1, %xmm5 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xe8]
+; NOVL-NEXT:    vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03]
+; NOVL-NEXT:    vmovsh %xmm1, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe9]
+; NOVL-NEXT:    vpunpcklwd %xmm5, %xmm4, %xmm0 # encoding: [0xc5,0xd9,0x61,0xc5]
+; NOVL-NEXT:    # xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; NOVL-NEXT:    vpunpckldq %xmm3, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x62,0xc3]
+; NOVL-NEXT:    # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; NOVL-NEXT:    vpunpcklqdq %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x6c,0xc2]
+; NOVL-NEXT:    # xmm0 = xmm0[0],xmm2[0]
+; NOVL-NEXT:    retq # encoding: [0xc3]
   %z = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %x, <4 x half> %y) readnone
   ret <4 x half> %z
 }
 
 define <8 x half> @test_intrinsic_fmax_v8f16(<8 x half> %x, <8 x half> %y) {
-; CHECK-LABEL: test_intrinsic_fmax_v8f16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
-; CHECK-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
-; CHECK-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
-; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
-; CHECK-NEXT:    retq # encoding: [0xc3]
+; HasVL-LABEL: test_intrinsic_fmax_v8f16:
+; HasVL:       # %bb.0:
+; HasVL-NEXT:    vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0]
+; HasVL-NEXT:    vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03]
+; HasVL-NEXT:    vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1]
+; HasVL-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; HasVL-NEXT:    retq # encoding: [0xc3]
+;
+; NOVL-LABEL: test_intrinsic_fmax_v8f16:
+; NOVL:       # %bb.0:
+; NOVL-NEXT:    vpsrldq $14, %xmm0, %xmm2 # encoding: [0xc5,0xe9,0x73,0xd8,0x0e]
+; NOVL-NEXT:    # xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vpsrldq $14, %xmm1, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd9,0x0e]
+; NOVL-NEXT:    # xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vmaxsh %xmm2, %xmm3, %xmm4 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xe2]
+; NOVL-NEXT:    vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT:    vmovsh %xmm3, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xe3]
+; NOVL-NEXT:    vshufps $255, %xmm0, %xmm0, %xmm2 # encoding: [0xc5,0xf8,0xc6,0xd0,0xff]
+; NOVL-NEXT:    # xmm2 = xmm0[3,3,3,3]
+; NOVL-NEXT:    vpshufd $255, %xmm1, %xmm3 # encoding: [0xc5,0xf9,0x70,0xd9,0xff]
+; NOVL-NEXT:    # xmm3 = xmm1[3,3,3,3]
+; NOVL-NEXT:    vmaxsh %xmm2, %xmm3, %xmm5 # encoding: [0x62,0xf5,0x66,0x08,0x5f,0xea]
+; NOVL-NEXT:    vcmpunordsh %xmm2, %xmm2, %k1 # encoding: [0x62,0xf3,0x6e,0x08,0xc2,0xca,0x03]
+; NOVL-NEXT:    vmovsh %xmm3, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xeb]
+; NOVL-NEXT:    vpunpcklwd %xmm4, %xmm5, %xmm2 # encoding: [0xc5,0xd1,0x61,0xd4]
+; NOVL-NEXT:    # xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; NOVL-NEXT:    vpsrldq $10, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd8,0x0a]
+; NOVL-NEXT:    # xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vpsrldq $10, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd9,0x0a]
+; NOVL-NEXT:    # xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm5 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xeb]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm5 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xec]
+; NOVL-NEXT:    vshufpd $1, %xmm0, %xmm0, %xmm3 # encoding: [0xc5,0xf9,0xc6,0xd8,0x01]
+; NOVL-NEXT:    # xmm3 = xmm0[1,0]
+; NOVL-NEXT:    vshufpd $1, %xmm1, %xmm1, %xmm4 # encoding: [0xc5,0xf1,0xc6,0xe1,0x01]
+; NOVL-NEXT:    # xmm4 = xmm1[1,0]
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm6 # encoding: [0x62,0xf5,0x5e,0x08,0x5f,0xf3]
+; NOVL-NEXT:    vcmpunordsh %xmm3, %xmm3, %k1 # encoding: [0x62,0xf3,0x66,0x08,0xc2,0xcb,0x03]
+; NOVL-NEXT:    vmovsh %xmm4, %xmm0, %xmm6 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xf4]
+; NOVL-NEXT:    vpunpcklwd %xmm5, %xmm6, %xmm3 # encoding: [0xc5,0xc9,0x61,0xdd]
+; NOVL-NEXT:    # xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; NOVL-NEXT:    vpunpckldq %xmm2, %xmm3, %xmm2 # encoding: [0xc5,0xe1,0x62,0xd2]
+; NOVL-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; NOVL-NEXT:    vpsrlq $48, %xmm0, %xmm3 # encoding: [0xc5,0xe1,0x73,0xd0,0x30]
+; NOVL-NEXT:    vpsrlq $48, %xmm1, %xmm4 # encoding: [0xc5,0xd9,0x73,0xd1,0x30]
+; NOVL-NEXT:    vmaxsh %xmm3, %xmm4, %xmm...
[truncated]

RKSimon · 2025-06-06T10:23:15Z

llvm/lib/Target/X86/X86ISelLowering.cpp

+      return false;
+    return VT.getScalarType() != MVT::f16 ||
+           (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
+  };


Why not just widen (with zeros) to v32f16?

There's a blocking issue: the general combiner always combines extract_subvector(insert_subvector(BinOP X, Y)) to BinOP X, Y, which would undo the widening. I created #143298 to show the problem.

On the other hand, the AVX512FP16 without AVX512VL case doesn't occur on any real hardware. We just need to make sure the compiler doesn't crash here; performance is not a concern.

RKSimon

lgtm

…ed (llvm#143100) Fixes: https://godbolt.org/z/7jYa3bWK9

[X86][FP16] Do not generate X86 FMIN/FMAX for FP16 when VLX not enabled

03068fa

Fixes: https://godbolt.org/z/7jYa3bWK9

phoebewang requested review from KanRobert, RKSimon and e-kud June 6, 2025 09:44

llvmbot added the backend:X86 label Jun 6, 2025

RKSimon reviewed Jun 6, 2025

View reviewed changes

RKSimon approved these changes Jun 8, 2025

View reviewed changes

phoebewang merged commit 4fbf67f into llvm:main Jun 9, 2025
9 checks passed

phoebewang deleted the minmax branch June 9, 2025 00:35

tomtor pushed a commit to tomtor/llvm-project that referenced this pull request Jun 14, 2025

[X86][FP16] Do not generate X86 FMIN/FMAX for FP16 when VLX not enabl…

6888e08

…ed (llvm#143100) Fixes: https://godbolt.org/z/7jYa3bWK9

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[X86][FP16] Do not generate X86 FMIN/FMAX for FP16 when VLX not enabled #143100

[X86][FP16] Do not generate X86 FMIN/FMAX for FP16 when VLX not enabled #143100

Uh oh!

phoebewang commented Jun 6, 2025

Uh oh!

llvmbot commented Jun 6, 2025

Uh oh!

RKSimon Jun 6, 2025

Uh oh!

phoebewang Jun 8, 2025

Uh oh!

RKSimon left a comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

[X86][FP16] Do not generate X86 FMIN/FMAX for FP16 when VLX not enabled #143100

[X86][FP16] Do not generate X86 FMIN/FMAX for FP16 when VLX not enabled #143100

Uh oh!

Conversation

phoebewang commented Jun 6, 2025

Uh oh!

llvmbot commented Jun 6, 2025

Uh oh!

RKSimon Jun 6, 2025

Choose a reason for hiding this comment

Uh oh!

phoebewang Jun 8, 2025

Choose a reason for hiding this comment

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants