From ce883769a11c476ac90807e602a98e6f2a4329b6 Mon Sep 17 00:00:00 2001
From: "Wang, Phoebe"
Date: Fri, 4 Jul 2025 11:44:16 +0800
Subject: [PATCH] [X86][FP16] Do not customize WidenLowerNode for half if VLX
 not enabled

#142763 tried to reuse the ISD node to work around the non-VLX lowering
problem, but it caused a new problem: https://godbolt.org/z/eWv1hToa3

---
 llvm/lib/Target/X86/X86ISelLowering.cpp      |   4 +-
 llvm/test/CodeGen/X86/avx512fp16-cvt.ll      |  18 ++
 .../X86/vec-strict-fptoint-128-fp16.ll       | 158 ++++++++++--------
 3 files changed, 110 insertions(+), 70 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ced29f8fb3d0c..274ad3c3cd53a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34150,7 +34150,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     }
 
-    if (VT.isVector() && Subtarget.hasFP16() &&
+    if (VT.isVector() && Subtarget.hasFP16() && Subtarget.hasVLX() &&
         SrcVT.getVectorElementType() == MVT::f16) {
       EVT EleVT = VT.getVectorElementType();
       EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
@@ -34164,10 +34164,12 @@
       }
 
       if (IsStrict) {
+        Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
         Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other},
                           {N->getOperand(0), Src});
         Chain = Res.getValue(1);
       } else {
+        Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
         Res = DAG.getNode(Opc, dl, ResVT, Src);
       }
 
diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
index 3f6ddc6ecfd70..f66f0c0ceabc4 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
@@ -1054,4 +1054,22 @@ define <8 x half> @u64tof16(<8 x i64> %a) #0 {
   ret <8 x half> %1
 }
 
+define void @f16tou32(ptr %ptr) {
+; X64-LABEL: f16tou32:
+; X64:       # %bb.0:
+; X64-NEXT:    vcvttph2udq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0
+; X64-NEXT:    vmovlps %xmm0, (%rdi)
+; X64-NEXT:    retq
+;
+; X86-LABEL: f16tou32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vcvttph2udq {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0
+; X86-NEXT:    vmovlps %xmm0, (%eax)
+; X86-NEXT:    retl
+  %1 = fptoui <2 x half> splat (half 0xH7E00) to <2 x i32>
+  store <2 x i32> %1, ptr %ptr, align 8
+  ret void
+}
+
 attributes #0 = { "min-legal-vector-width"="256" "prefer-vector-width"="256" }
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
index bde14e75dfc04..6aad9a6d82c73 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
@@ -82,13 +82,11 @@ define <2 x i32> @strict_vector_fptosi_v2f16_to_v2i32(<2 x half> %a) #0 {
 ;
 ; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i32:
 ; NOVL:       # %bb.0:
-; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; NOVL-NEXT:    vcvttph2dq %ymm0, %zmm0
-; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT:    vcvttsh2si %xmm1, %eax
+; NOVL-NEXT:    vcvttsh2si %xmm0, %ecx
+; NOVL-NEXT:    vmovd %ecx, %xmm0
+; NOVL-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
 ; NOVL-NEXT:    retq
   %ret = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f16(<2 x half> %a,
                                                                           metadata !"fpexcept.strict") #0
@@ -105,13 +103,11 @@ define <2 x i32> @strict_vector_fptoui_v2f16_to_v2i32(<2 x half> %a) #0 {
 ;
 ; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i32:
 ; NOVL:       # %bb.0:
-; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; NOVL-NEXT:    vcvttph2udq %ymm0, %zmm0
-; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT:    vcvttsh2usi %xmm1, %eax
+; NOVL-NEXT:    vcvttsh2usi %xmm0, %ecx
+; NOVL-NEXT:    vmovd %ecx, %xmm0
+; NOVL-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
 ; NOVL-NEXT:    retq
   %ret = call <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f16(<2 x half> %a,
                                                                           metadata !"fpexcept.strict") #0
@@ -128,13 +124,12 @@ define <2 x i16> @strict_vector_fptosi_v2f16_to_v2i16(<2 x half> %a) #0 {
 ;
 ; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i16:
 ; NOVL:       # %bb.0:
-; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
-; NOVL-NEXT:    vcvttph2w %zmm0, %zmm0
-; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT:    vcvttsh2si %xmm1, %eax
+; NOVL-NEXT:    vcvttsh2si %xmm0, %ecx
+; NOVL-NEXT:    vmovd %ecx, %xmm0
+; NOVL-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
+; NOVL-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
 ; NOVL-NEXT:    retq
   %ret = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f16(<2 x half> %a,
                                                                           metadata !"fpexcept.strict") #0
@@ -151,13 +146,12 @@ define <2 x i16> @strict_vector_fptoui_v2f16_to_v2i16(<2 x half> %a) #0 {
 ;
 ; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i16:
 ; NOVL:       # %bb.0:
-; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
-; NOVL-NEXT:    vcvttph2uw %zmm0, %zmm0
-; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT:    vcvttsh2si %xmm1, %eax
+; NOVL-NEXT:    vcvttsh2si %xmm0, %ecx
+; NOVL-NEXT:    vmovd %ecx, %xmm0
+; NOVL-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
+; NOVL-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; NOVL-NEXT:    retq
   %ret = call <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f16(<2 x half> %a,
                                                                           metadata !"fpexcept.strict") #0
@@ -175,13 +169,13 @@ define <2 x i8> @strict_vector_fptosi_v2f16_to_v2i8(<2 x half> %a) #0 {
 ;
 ; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i8:
 ; NOVL:       # %bb.0:
-; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
-; NOVL-NEXT:    vcvttph2w %zmm0, %zmm0
-; NOVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT:    vcvttsh2si %xmm1, %eax
+; NOVL-NEXT:    vcvttsh2si %xmm0, %ecx
+; NOVL-NEXT:    vmovd %ecx, %xmm0
+; NOVL-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
+; NOVL-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; NOVL-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; NOVL-NEXT:    retq
   %ret = call <2 x i8> @llvm.experimental.constrained.fptosi.v2i8.v2f16(<2 x half> %a,
                                                                         metadata !"fpexcept.strict") #0
@@ -199,13 +193,13 @@ define <2 x i8> @strict_vector_fptoui_v2f16_to_v2i8(<2 x half> %a) #0 {
 ;
 ; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i8:
 ; NOVL:       # %bb.0:
-; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
-; NOVL-NEXT:    vcvttph2uw %zmm0, %zmm0
-; NOVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT:    vcvttsh2si %xmm1, %eax
+; NOVL-NEXT:    vcvttsh2si %xmm0, %ecx
+; NOVL-NEXT:    vmovd %ecx, %xmm0
+; NOVL-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
+; NOVL-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
+; NOVL-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; NOVL-NEXT:    retq
   %ret = call <2 x i8> @llvm.experimental.constrained.fptoui.v2i8.v2f16(<2 x half> %a,
                                                                         metadata !"fpexcept.strict") #0
@@ -335,12 +329,18 @@ define <4 x i16> @strict_vector_fptosi_v4f16_to_v4i16(<4 x half> %a) #0 {
 ;
 ; NOVL-LABEL: strict_vector_fptosi_v4f16_to_v4i16:
 ; NOVL:       # %bb.0:
-; NOVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; NOVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vinserti32x4 $0, %xmm0, %zmm1, %zmm0
-; NOVL-NEXT:    vcvttph2w %zmm0, %zmm0
-; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT:    vcvttsh2si %xmm1, %eax
+; NOVL-NEXT:    vcvttsh2si %xmm0, %ecx
+; NOVL-NEXT:    vmovd %ecx, %xmm1
+; NOVL-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
+; NOVL-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; NOVL-NEXT:    vcvttsh2si %xmm2, %eax
+; NOVL-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
+; NOVL-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; NOVL-NEXT:    vcvttsh2si %xmm0, %eax
+; NOVL-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; NOVL-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
 ; NOVL-NEXT:    retq
   %ret = call <4 x i16> @llvm.experimental.constrained.fptosi.v4i16.v4f16(<4 x half> %a,
                                                                           metadata !"fpexcept.strict") #0
@@ -356,12 +356,18 @@ define <4 x i16> @strict_vector_fptoui_v4f16_to_v4i16(<4 x half> %a) #0 {
 ;
 ; NOVL-LABEL: strict_vector_fptoui_v4f16_to_v4i16:
 ; NOVL:       # %bb.0:
-; NOVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; NOVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vinserti32x4 $0, %xmm0, %zmm1, %zmm0
-; NOVL-NEXT:    vcvttph2uw %zmm0, %zmm0
-; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT:    vcvttsh2si %xmm1, %eax
+; NOVL-NEXT:    vcvttsh2si %xmm0, %ecx
+; NOVL-NEXT:    vmovd %ecx, %xmm1
+; NOVL-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
+; NOVL-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; NOVL-NEXT:    vcvttsh2si %xmm2, %eax
+; NOVL-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
+; NOVL-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; NOVL-NEXT:    vcvttsh2si %xmm0, %eax
+; NOVL-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; NOVL-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; NOVL-NEXT:    retq
   %ret = call <4 x i16> @llvm.experimental.constrained.fptoui.v4i16.v4f16(<4 x half> %a,
                                                                           metadata !"fpexcept.strict") #0
@@ -378,12 +384,19 @@ define <4 x i8> @strict_vector_fptosi_v4f16_to_v4i8(<4 x half> %a) #0 {
 ;
 ; NOVL-LABEL: strict_vector_fptosi_v4f16_to_v4i8:
 ; NOVL:       # %bb.0:
-; NOVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; NOVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vinserti32x4 $0, %xmm0, %zmm1, %zmm0
-; NOVL-NEXT:    vcvttph2w %zmm0, %zmm0
-; NOVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT:    vcvttsh2si %xmm1, %eax
+; NOVL-NEXT:    vcvttsh2si %xmm0, %ecx
+; NOVL-NEXT:    vmovd %ecx, %xmm1
+; NOVL-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
+; NOVL-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; NOVL-NEXT:    vcvttsh2si %xmm2, %eax
+; NOVL-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
+; NOVL-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; NOVL-NEXT:    vcvttsh2si %xmm0, %eax
+; NOVL-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; NOVL-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; NOVL-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; NOVL-NEXT:    retq
   %ret = call <4 x i8> @llvm.experimental.constrained.fptosi.v4i8.v4f16(<4 x half> %a,
                                                                         metadata !"fpexcept.strict") #0
@@ -400,12 +413,19 @@ define <4 x i8> @strict_vector_fptoui_v4f16_to_v4i8(<4 x half> %a) #0 {
 ;
 ; NOVL-LABEL: strict_vector_fptoui_v4f16_to_v4i8:
 ; NOVL:       # %bb.0:
-; NOVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; NOVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vinserti32x4 $0, %xmm0, %zmm1, %zmm0
-; NOVL-NEXT:    vcvttph2uw %zmm0, %zmm0
-; NOVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    vpsrld $16, %xmm0, %xmm1
+; NOVL-NEXT:    vcvttsh2si %xmm1, %eax
+; NOVL-NEXT:    vcvttsh2si %xmm0, %ecx
+; NOVL-NEXT:    vmovd %ecx, %xmm1
+; NOVL-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
+; NOVL-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; NOVL-NEXT:    vcvttsh2si %xmm2, %eax
+; NOVL-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
+; NOVL-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; NOVL-NEXT:    vcvttsh2si %xmm0, %eax
+; NOVL-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; NOVL-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
+; NOVL-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; NOVL-NEXT:    retq
   %ret = call <4 x i8> @llvm.experimental.constrained.fptoui.v4i8.v4f16(<4 x half> %a,
                                                                         metadata !"fpexcept.strict") #0
@@ -554,7 +574,7 @@ define <8 x i8> @strict_vector_fptosi_v8f16_to_v8i8(<8 x half> %a) #0 {
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; NOVL-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; NOVL-NEXT:    vcvttph2w %zmm0, %zmm0
-; NOVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; NOVL-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; NOVL-NEXT:    vzeroupper
 ; NOVL-NEXT:    retq
   %ret = call <8 x i8> @llvm.experimental.constrained.fptosi.v8i8.v8f16(<8 x half> %a,
@@ -573,8 +593,8 @@ define <8 x i8> @strict_vector_fptoui_v8f16_to_v8i8(<8 x half> %a) #0 {
 ; NOVL:       # %bb.0:
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; NOVL-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
-; NOVL-NEXT:    vcvttph2uw %zmm0, %zmm0
-; NOVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; NOVL-NEXT:    vcvttph2w %zmm0, %zmm0
+; NOVL-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; NOVL-NEXT:    vzeroupper
 ; NOVL-NEXT:    retq
   %ret = call <8 x i8> @llvm.experimental.constrained.fptoui.v8i8.v8f16(<8 x half> %a,
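
Note for reviewers: below is a standalone sketch for checking the fix locally.
It mirrors the f16tou32 test added to avx512fp16-cvt.ll above; the RUN lines of
that test file are not part of this diff, so the llc invocation and feature
string here are assumptions rather than the checked-in configuration.

  ; repro.ll -- hypothetical standalone reproducer, modeled on the f16tou32
  ; test above. Assumed invocation (not taken from this patch):
  ;   llc -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl repro.ll -o -
  ; With VLX available, the <2 x half> source is converted in one shot by the
  ; 128-bit vcvttph2udq; after this patch the lowering selects X86ISD::CVTTP2UI
  ; explicitly instead of reusing the target-independent ISD node.
  define void @f16tou32_repro(ptr %ptr) {
    %c = fptoui <2 x half> splat (half 0xH7E00) to <2 x i32>
    store <2 x i32> %c, ptr %ptr, align 8
    ret void
  }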