From 9f62f4105035091b57c6912a22f3e0d7f72bdf2b Mon Sep 17 00:00:00 2001 From: abhishek-kaushik22 Date: Tue, 7 Jan 2025 03:19:13 +0530 Subject: [PATCH 01/11] Update LegalizeVectorOps.cpp --- .../SelectionDAG/LegalizeVectorOps.cpp | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 154c8aea6bcd1..2be9239202b02 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1776,6 +1776,27 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node, assert((BW == 64 || BW == 32) && "Elements in vector-UINT_TO_FP must be 32 or 64 bits wide"); + // If STRICT_/FMUL is not supported by the target (in case of f16) replace the + // UINT_TO_FP with a larger float and round to the smaller type + if ((!IsStrict && TLI.getOperationAction(ISD::FMUL, Node->getValueType(0)) == + TargetLowering::Expand) || + (IsStrict && + TLI.getOperationAction(ISD::STRICT_FMUL, Node->getValueType(0)) == + TargetLowering::Expand)) { + EVT FPVT = BW == 32 ? MVT::f32 : MVT::f64; + SDLoc DL(Node); + unsigned Round = IsStrict ? ISD::STRICT_FP_ROUND : ISD::FP_ROUND; + unsigned UIToFP = IsStrict ? ISD::STRICT_UINT_TO_FP : ISD::UINT_TO_FP; + SDValue Result = DAG.getNode( + Round, DL, Node->getValueType(0), + DAG.getNode(UIToFP, DL, VT.changeVectorElementType(FPVT), Src), + DAG.getTargetConstant( + 0, DL, + DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()))); + Results.push_back(Result); + return; + } + SDValue HalfWord = DAG.getConstant(BW / 2, DL, VT); // Constants to clear the upper part of the word. From 74f53279c202d08e0442ae3681904d583209264a Mon Sep 17 00:00:00 2001 From: abhishek-kaushik22 Date: Tue, 7 Jan 2025 03:38:22 +0530 Subject: [PATCH 02/11] Update LegalizeVectorOps.cpp --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 966c9fff5750d..17605eb503468 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1779,18 +1779,17 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node, // If STRICT_/FMUL is not supported by the target (in case of f16) replace the // UINT_TO_FP with a larger float and round to the smaller type - if ((!IsStrict && TLI.getOperationAction(ISD::FMUL, Node->getValueType(0)) == - TargetLowering::Expand) || - (IsStrict && - TLI.getOperationAction(ISD::STRICT_FMUL, Node->getValueType(0)) == - TargetLowering::Expand)) { + if ((!IsStrict && + TLI.getOperationAction(ISD::FMUL, DstVT) == TargetLowering::Expand) || + (IsStrict && TLI.getOperationAction(ISD::STRICT_FMUL, DstVT) == + TargetLowering::Expand)) { EVT FPVT = BW == 32 ? MVT::f32 : MVT::f64; SDLoc DL(Node); unsigned Round = IsStrict ? ISD::STRICT_FP_ROUND : ISD::FP_ROUND; unsigned UIToFP = IsStrict ? ISD::STRICT_UINT_TO_FP : ISD::UINT_TO_FP; SDValue Result = DAG.getNode( - Round, DL, Node->getValueType(0), - DAG.getNode(UIToFP, DL, VT.changeVectorElementType(FPVT), Src), + Round, DL, DstVT, + DAG.getNode(UIToFP, DL, SrcVT.changeVectorElementType(FPVT), Src), DAG.getTargetConstant( 0, DL, DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()))); From 40ca7cf0f200e977906d198c176d2f16e76cb8e3 Mon Sep 17 00:00:00 2001 From: abhishek-kaushik22 Date: Tue, 7 Jan 2025 13:38:23 +0530 Subject: [PATCH 03/11] Update LegalizeVectorOps.cpp --- .../SelectionDAG/LegalizeVectorOps.cpp | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 17605eb503468..33621201fe1ef 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1784,16 +1784,24 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node, (IsStrict && TLI.getOperationAction(ISD::STRICT_FMUL, DstVT) == TargetLowering::Expand)) { EVT FPVT = BW == 32 ? MVT::f32 : MVT::f64; - SDLoc DL(Node); - unsigned Round = IsStrict ? ISD::STRICT_FP_ROUND : ISD::FP_ROUND; - unsigned UIToFP = IsStrict ? ISD::STRICT_UINT_TO_FP : ISD::UINT_TO_FP; - SDValue Result = DAG.getNode( - Round, DL, DstVT, - DAG.getNode(UIToFP, DL, SrcVT.changeVectorElementType(FPVT), Src), - DAG.getTargetConstant( - 0, DL, - DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()))); - Results.push_back(Result); + SDValue UIToFP; + SDValue Result; + SDValue TargetZero = DAG.getIntPtrConstant(0, DL, /*isTarget=*/true); + if (IsStrict) { + UIToFP = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, + {SrcVT.changeVectorElementType(FPVT), MVT::Other}, + {Node->getOperand(0), Src}); + Result = DAG.getNode(ISD::STRICT_FP_ROUND, DL, {DstVT, MVT::Other}, + {Node->getOperand(0), UIToFP, TargetZero}); + Results.push_back(Result); + Results.push_back(Result.getValue(1)); + } else { + UIToFP = DAG.getNode(ISD::UINT_TO_FP, DL, + SrcVT.changeVectorElementType(FPVT), Src); + Result = DAG.getNode(ISD::FP_ROUND, DL, DstVT, UIToFP, TargetZero); + Results.push_back(Result); + } + return; } From 9a519bc9ac6a09c37485f30d5009e5b98bfb0636 Mon Sep 17 00:00:00 2001 From: abhishek-kaushik22 Date: Tue, 7 Jan 2025 13:58:57 +0530 Subject: [PATCH 04/11] Update LegalizeVectorOps.cpp --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 33621201fe1ef..1cbee72cfc7ce 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1787,17 +1787,16 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node, SDValue UIToFP; SDValue Result; SDValue TargetZero = DAG.getIntPtrConstant(0, DL, /*isTarget=*/true); + EVT FloatVecVT = SrcVT.changeVectorElementType(FPVT); if (IsStrict) { - UIToFP = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, - {SrcVT.changeVectorElementType(FPVT), MVT::Other}, + UIToFP = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {FloatVecVT, MVT::Other}, {Node->getOperand(0), Src}); Result = DAG.getNode(ISD::STRICT_FP_ROUND, DL, {DstVT, MVT::Other}, {Node->getOperand(0), UIToFP, TargetZero}); Results.push_back(Result); Results.push_back(Result.getValue(1)); } else { - UIToFP = DAG.getNode(ISD::UINT_TO_FP, DL, - SrcVT.changeVectorElementType(FPVT), Src); + UIToFP = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVecVT, Src); Result = DAG.getNode(ISD::FP_ROUND, DL, DstVT, UIToFP, TargetZero); Results.push_back(Result); } From eb53636a01b4534c244c07bbf6a27ed890fdf5b2 Mon Sep 17 00:00:00 2001 From: abhishek-kaushik22 Date: Tue, 7 Jan 2025 14:18:16 +0530 Subject: [PATCH 05/11] Add test with auto assertions --- .../X86/test_UINT_TO_FP_no_inf_corei7_avx.ll | 404 ++++++++++++++++++ 1 file changed, 404 insertions(+) create mode 100644 llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll diff --git a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll new file mode 100644 index 0000000000000..07296068d5182 --- /dev/null +++ b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll @@ -0,0 +1,404 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7-avx | FileCheck %s + +define <8 x half> @test_UINT_TO_FP_no_inf8(<8 x i32> %a) { +; CHECK-LABEL: test_UINT_TO_FP_no_inf8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $88, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 +; CHECK-NEXT: vpsrld $16, %xmm2, %xmm2 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 +; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: addq $88, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %vec = uitofp <8 x i32> %a to <8 x half> + ret <8 x half> %vec +} + +define <8 x half> @test_STRICT_UINT_TO_FP_no_inf8(<8 x i32> %a) { +; CHECK-LABEL: test_STRICT_UINT_TO_FP_no_inf8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $88, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 +; CHECK-NEXT: vpsrld $16, %xmm2, %xmm2 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 +; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: addq $88, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f34.i34(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <8 x half> %vec +} + +define <16 x half> @test_UINT_TO_FP_no_inf16(<16 x i32> %a) { +; CHECK-LABEL: test_UINT_TO_FP_no_inf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $120, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 128 +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vpsrld $16, %xmm1, %xmm0 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vpsrld $16, %xmm2, %xmm2 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; CHECK-NEXT: vpsrld $16, %xmm2, %xmm0 +; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm1 +; CHECK-NEXT: vpsrld $16, %xmm1, %xmm1 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: addq $120, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %vec = uitofp <16 x i32> %a to <16 x half> + ret <16 x half> %vec +} + +define <16 x half> @test_STRICT_UINT_TO_FP_no_inf16(<16 x i32> %a) { +; CHECK-LABEL: test_STRICT_UINT_TO_FP_no_inf16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $120, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 128 +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vpsrld $16, %xmm1, %xmm0 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vpsrld $16, %xmm2, %xmm2 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; CHECK-NEXT: vpsrld $16, %xmm2, %xmm0 +; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm1 +; CHECK-NEXT: vpsrld $16, %xmm1, %xmm1 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: addq $120, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f34.i34(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <16 x half> %vec +} From cab2613d61515405314ad0b5544a544caf721309 Mon Sep 17 00:00:00 2001 From: abhishek-kaushik22 Date: Tue, 7 Jan 2025 14:38:32 +0530 Subject: [PATCH 06/11] Update test --- .../X86/test_UINT_TO_FP_no_inf_corei7_avx.ll | 789 +++++++++--------- 1 file changed, 408 insertions(+), 381 deletions(-) diff --git a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll index 07296068d5182..90ec157359536 100644 --- a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll +++ b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll @@ -1,404 +1,431 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7-avx | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s -check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s -check-prefix=AVX512 define <8 x half> @test_UINT_TO_FP_no_inf8(<8 x i32> %a) { -; CHECK-LABEL: test_UINT_TO_FP_no_inf8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $88, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1 -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vpsrld $16, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 -; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: addq $88, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq +; AVX-LABEL: test_UINT_TO_FP_no_inf8: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $88, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 96 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: addq $88, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +; +; AVX512-LABEL: test_UINT_TO_FP_no_inf8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtudq2ps %ymm0, %ymm0 +; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %vec = uitofp <8 x i32> %a to <8 x half> ret <8 x half> %vec } define <8 x half> @test_STRICT_UINT_TO_FP_no_inf8(<8 x i32> %a) { -; CHECK-LABEL: test_STRICT_UINT_TO_FP_no_inf8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $88, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1 -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vpsrld $16, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 -; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: addq $88, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq +; AVX-LABEL: test_STRICT_UINT_TO_FP_no_inf8: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $88, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 96 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: addq $88, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +; +; AVX512-LABEL: test_STRICT_UINT_TO_FP_no_inf8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtudq2ps %ymm0, %ymm0 +; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: - %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f34.i34(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") + %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f16.i32(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <8 x half> %vec } define <16 x half> @test_UINT_TO_FP_no_inf16(<16 x i32> %a) { -; CHECK-LABEL: test_UINT_TO_FP_no_inf16: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $120, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vpsrld $16, %xmm1, %xmm0 -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 -; CHECK-NEXT: vpsrld $16, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; CHECK-NEXT: vpsrld $16, %xmm2, %xmm0 -; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm1 -; CHECK-NEXT: vpsrld $16, %xmm1, %xmm1 -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 -; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; CHECK-NEXT: addq $120, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq +; AVX-LABEL: test_UINT_TO_FP_no_inf16: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $120, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 128 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpsrld $16, %xmm1, %xmm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vpsrld $16, %xmm2, %xmm0 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX-NEXT: addq $120, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +; +; AVX512-LABEL: test_UINT_TO_FP_no_inf16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; AVX512-NEXT: retq entry: %vec = uitofp <16 x i32> %a to <16 x half> ret <16 x half> %vec } define <16 x half> @test_STRICT_UINT_TO_FP_no_inf16(<16 x i32> %a) { -; CHECK-LABEL: test_STRICT_UINT_TO_FP_no_inf16: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: subq $120, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vpsrld $16, %xmm1, %xmm0 -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 -; CHECK-NEXT: vpsrld $16, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; CHECK-NEXT: vpsrld $16, %xmm2, %xmm0 -; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm1 -; CHECK-NEXT: vpsrld $16, %xmm1, %xmm1 -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 -; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1 -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] -; CHECK-NEXT: callq __truncsfhf2@PLT -; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; CHECK-NEXT: addq $120, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq +; AVX-LABEL: test_STRICT_UINT_TO_FP_no_inf16: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $120, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 128 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpsrld $16, %xmm1, %xmm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vpsrld $16, %xmm2, %xmm0 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq __truncsfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX-NEXT: addq $120, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +; +; AVX512-LABEL: test_STRICT_UINT_TO_FP_no_inf16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; AVX512-NEXT: retq entry: - %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f34.i34(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") + %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f16.i32(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <16 x half> %vec } From 96707b1886ca2a675087293f09af18f498f2cf84 Mon Sep 17 00:00:00 2001 From: abhishek-kaushik22 Date: Tue, 7 Jan 2025 14:40:56 +0530 Subject: [PATCH 07/11] Fix indentation --- .../X86/test_UINT_TO_FP_no_inf_corei7_avx.ll | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll index 90ec157359536..43517b3871cf8 100644 --- a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll +++ b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll @@ -75,8 +75,8 @@ define <8 x half> @test_UINT_TO_FP_no_inf8(<8 x i32> %a) { ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: - %vec = uitofp <8 x i32> %a to <8 x half> - ret <8 x half> %vec + %vec = uitofp <8 x i32> %a to <8 x half> + ret <8 x half> %vec } define <8 x half> @test_STRICT_UINT_TO_FP_no_inf8(<8 x i32> %a) { @@ -152,8 +152,8 @@ define <8 x half> @test_STRICT_UINT_TO_FP_no_inf8(<8 x i32> %a) { ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: - %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f16.i32(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") - ret <8 x half> %vec + %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f16.i32(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <8 x half> %vec } define <16 x half> @test_UINT_TO_FP_no_inf16(<16 x i32> %a) { @@ -289,8 +289,8 @@ define <16 x half> @test_UINT_TO_FP_no_inf16(<16 x i32> %a) { ; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0 ; AVX512-NEXT: retq entry: - %vec = uitofp <16 x i32> %a to <16 x half> - ret <16 x half> %vec + %vec = uitofp <16 x i32> %a to <16 x half> + ret <16 x half> %vec } define <16 x half> @test_STRICT_UINT_TO_FP_no_inf16(<16 x i32> %a) { @@ -426,6 +426,6 @@ define <16 x half> @test_STRICT_UINT_TO_FP_no_inf16(<16 x i32> %a) { ; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0 ; AVX512-NEXT: retq entry: - %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f16.i32(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") - ret <16 x half> %vec + %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f16.i32(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <16 x half> %vec } From d7809f2e04db758093380802cf3ba17c8aed8b23 Mon Sep 17 00:00:00 2001 From: abhishek-kaushik22 Date: Tue, 7 Jan 2025 16:07:11 +0530 Subject: [PATCH 08/11] Update test with `+f16c` --- .../X86/test_UINT_TO_FP_no_inf_corei7_avx.ll | 402 +++--------------- 1 file changed, 61 insertions(+), 341 deletions(-) diff --git a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll index 43517b3871cf8..74d2e042c1090 100644 --- a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll +++ b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll @@ -1,71 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s -check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+f16c | FileCheck %s -check-prefix=AVX ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s -check-prefix=AVX512 define <8 x half> @test_UINT_TO_FP_no_inf8(<8 x i32> %a) { ; AVX-LABEL: test_UINT_TO_FP_no_inf8: ; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $88, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 96 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %ymm1 ; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %ymm1 ; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %ymm0 ; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,0] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,1,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[3,3,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,0] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 ; AVX-NEXT: vzeroupper -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,1,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: addq $88, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_UINT_TO_FP_no_inf8: @@ -82,8 +38,6 @@ entry: define <8 x half> @test_STRICT_UINT_TO_FP_no_inf8(<8 x i32> %a) { ; AVX-LABEL: test_STRICT_UINT_TO_FP_no_inf8: ; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $88, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 96 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 @@ -93,56 +47,8 @@ define <8 x half> @test_STRICT_UINT_TO_FP_no_inf8(<8 x i32> %a) { ; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,0] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,1,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[3,3,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,0] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 ; AVX-NEXT: vzeroupper -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,1,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: addq $88, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_STRICT_UINT_TO_FP_no_inf8: @@ -159,128 +65,41 @@ entry: define <16 x half> @test_UINT_TO_FP_no_inf16(<16 x i32> %a) { ; AVX-LABEL: test_UINT_TO_FP_no_inf16: ; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $120, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 128 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpsrld $16, %xmm1, %xmm0 -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 -; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm3 +; AVX-NEXT: vcvtdq2ps %ymm3, %ymm3 +; AVX-NEXT: vcvtps2ph $4, %ymm3, %xmm3 +; AVX-NEXT: vcvtph2ps %xmm3, %ymm3 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm4 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,0] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,1,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[3,3,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,0] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,1,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vpsrld $16, %xmm2, %xmm0 -; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %ymm0 +; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf] +; AVX-NEXT: vmulps %ymm4, %ymm0, %ymm0 +; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %ymm0 +; AVX-NEXT: vaddps %ymm3, %ymm0, %ymm0 +; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX-NEXT: vandps %ymm2, %ymm1, %ymm2 +; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX-NEXT: vcvtps2ph $4, %ymm2, %xmm2 +; AVX-NEXT: vcvtph2ps %xmm2, %ymm2 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm3 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 -; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,0] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,1,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[3,3,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,0] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,1,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX-NEXT: addq $120, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %ymm1 +; AVX-NEXT: vmulps %ymm4, %ymm1, %ymm1 +; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %ymm1 +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_UINT_TO_FP_no_inf16: @@ -296,128 +115,29 @@ entry: define <16 x half> @test_STRICT_UINT_TO_FP_no_inf16(<16 x i32> %a) { ; AVX-LABEL: test_STRICT_UINT_TO_FP_no_inf16: ; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $120, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 128 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpsrld $16, %xmm1, %xmm0 -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-NEXT: vpsrld $16, %xmm3, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX-NEXT: vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] +; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX-NEXT: vpsrld $16, %xmm5, %xmm5 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 -; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,0] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,1,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[3,3,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,0] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,1,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vpsrld $16, %xmm2, %xmm0 -; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 -; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 -; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,0] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,1,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[3,3,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,0] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[1,1,3,3] -; AVX-NEXT: callq __truncsfhf2@PLT -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX-NEXT: addq $120, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 8 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_STRICT_UINT_TO_FP_no_inf16: From edb53f6723fca777543c8e9c9ed48fab70e57ff0 Mon Sep 17 00:00:00 2001 From: abhishek-kaushik22 Date: Tue, 7 Jan 2025 16:54:01 +0530 Subject: [PATCH 09/11] Fix `+f16c` case --- .../SelectionDAG/LegalizeVectorOps.cpp | 9 ++-- .../X86/test_UINT_TO_FP_no_inf_corei7_avx.ll | 52 ++++++------------- 2 files changed, 23 insertions(+), 38 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 1cbee72cfc7ce..234dbacb6d2df 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1780,9 +1780,12 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node, // If STRICT_/FMUL is not supported by the target (in case of f16) replace the // UINT_TO_FP with a larger float and round to the smaller type if ((!IsStrict && - TLI.getOperationAction(ISD::FMUL, DstVT) == TargetLowering::Expand) || - (IsStrict && TLI.getOperationAction(ISD::STRICT_FMUL, DstVT) == - TargetLowering::Expand)) { + (TLI.getOperationAction(ISD::FMUL, DstVT) == TargetLowering::Expand || + TLI.getOperationAction(ISD::FMUL, DstVT) == TargetLowering::Promote)) || + (IsStrict && (TLI.getOperationAction(ISD::STRICT_FMUL, DstVT) == + TargetLowering::Expand || + TLI.getOperationAction(ISD::STRICT_FMUL, DstVT) == + TargetLowering::Promote))) { EVT FPVT = BW == 32 ? MVT::f32 : MVT::f64; SDValue UIToFP; SDValue Result; diff --git a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll index 74d2e042c1090..76eecdd72f882 100644 --- a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll +++ b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll @@ -10,15 +10,9 @@ define <8 x half> @test_UINT_TO_FP_no_inf8(<8 x i32> %a) { ; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 -; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %ymm1 ; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %ymm1 ; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %ymm0 ; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 ; AVX-NEXT: vzeroupper @@ -65,39 +59,27 @@ entry: define <16 x half> @test_UINT_TO_FP_no_inf16(<16 x i32> %a) { ; AVX-LABEL: test_UINT_TO_FP_no_inf16: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX-NEXT: vandps %ymm2, %ymm0, %ymm3 -; AVX-NEXT: vcvtdq2ps %ymm3, %ymm3 -; AVX-NEXT: vcvtps2ph $4, %ymm3, %xmm3 -; AVX-NEXT: vcvtph2ps %xmm3, %ymm3 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm4 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-NEXT: vpsrld $16, %xmm3, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX-NEXT: vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] +; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX-NEXT: vaddps %ymm0, %ymm2, %ymm0 ; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %ymm0 -; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf] -; AVX-NEXT: vmulps %ymm4, %ymm0, %ymm0 -; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; AVX-NEXT: vcvtph2ps %xmm0, %ymm0 -; AVX-NEXT: vaddps %ymm3, %ymm0, %ymm0 -; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; AVX-NEXT: vandps %ymm2, %ymm1, %ymm2 +; AVX-NEXT: vpsrld $16, %xmm1, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX-NEXT: vpsrld $16, %xmm5, %xmm5 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 ; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2 -; AVX-NEXT: vcvtps2ph $4, %ymm2, %xmm2 -; AVX-NEXT: vcvtph2ps %xmm2, %ymm2 -; AVX-NEXT: vpsrld $16, %xmm1, %xmm3 -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 -; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %ymm1 -; AVX-NEXT: vmulps %ymm4, %ymm1, %ymm1 -; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1 -; AVX-NEXT: vcvtph2ps %xmm1, %ymm1 -; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vaddps %ymm1, %ymm2, %ymm1 ; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: retq From 25850f3f5181c7bc51dc1c927867c7fbaf018410 Mon Sep 17 00:00:00 2001 From: abhishek-kaushik22 Date: Tue, 7 Jan 2025 19:28:19 +0530 Subject: [PATCH 10/11] Address review comments --- .../SelectionDAG/LegalizeVectorOps.cpp | 9 +- .../X86/test_UINT_TO_FP_no_inf_corei7_avx.ll | 133 ------------ llvm/test/CodeGen/X86/uint_to_half.ll | 200 ++++++++++++++++++ 3 files changed, 202 insertions(+), 140 deletions(-) delete mode 100644 llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll create mode 100644 llvm/test/CodeGen/X86/uint_to_half.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 234dbacb6d2df..89a00c5a4f043 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1779,13 +1779,8 @@ void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node, // If STRICT_/FMUL is not supported by the target (in case of f16) replace the // UINT_TO_FP with a larger float and round to the smaller type - if ((!IsStrict && - (TLI.getOperationAction(ISD::FMUL, DstVT) == TargetLowering::Expand || - TLI.getOperationAction(ISD::FMUL, DstVT) == TargetLowering::Promote)) || - (IsStrict && (TLI.getOperationAction(ISD::STRICT_FMUL, DstVT) == - TargetLowering::Expand || - TLI.getOperationAction(ISD::STRICT_FMUL, DstVT) == - TargetLowering::Promote))) { + if ((!IsStrict && !TLI.isOperationLegalOrCustom(ISD::FMUL, DstVT)) || + (IsStrict && !TLI.isOperationLegalOrCustom(ISD::STRICT_FMUL, DstVT))) { EVT FPVT = BW == 32 ? MVT::f32 : MVT::f64; SDValue UIToFP; SDValue Result; diff --git a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll b/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll deleted file mode 100644 index 76eecdd72f882..0000000000000 --- a/llvm/test/CodeGen/X86/test_UINT_TO_FP_no_inf_corei7_avx.ll +++ /dev/null @@ -1,133 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+f16c | FileCheck %s -check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s -check-prefix=AVX512 - -define <8 x half> @test_UINT_TO_FP_no_inf8(<8 x i32> %a) { -; AVX-LABEL: test_UINT_TO_FP_no_inf8: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 -; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq -; -; AVX512-LABEL: test_UINT_TO_FP_no_inf8: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtudq2ps %ymm0, %ymm0 -; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -entry: - %vec = uitofp <8 x i32> %a to <8 x half> - ret <8 x half> %vec -} - -define <8 x half> @test_STRICT_UINT_TO_FP_no_inf8(<8 x i32> %a) { -; AVX-LABEL: test_STRICT_UINT_TO_FP_no_inf8: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 -; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq -; -; AVX512-LABEL: test_STRICT_UINT_TO_FP_no_inf8: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtudq2ps %ymm0, %ymm0 -; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -entry: - %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f16.i32(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") - ret <8 x half> %vec -} - -define <16 x half> @test_UINT_TO_FP_no_inf16(<16 x i32> %a) { -; AVX-LABEL: test_UINT_TO_FP_no_inf16: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpsrld $16, %xmm0, %xmm2 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-NEXT: vpsrld $16, %xmm3, %xmm3 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2 -; AVX-NEXT: vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] -; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2 -; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm1, %xmm2 -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX-NEXT: vpsrld $16, %xmm5, %xmm5 -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2 -; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2 -; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 -; AVX-NEXT: vaddps %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: retq -; -; AVX512-LABEL: test_UINT_TO_FP_no_inf16: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0 -; AVX512-NEXT: retq -entry: - %vec = uitofp <16 x i32> %a to <16 x half> - ret <16 x half> %vec -} - -define <16 x half> @test_STRICT_UINT_TO_FP_no_inf16(<16 x i32> %a) { -; AVX-LABEL: test_STRICT_UINT_TO_FP_no_inf16: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpsrld $16, %xmm0, %xmm2 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-NEXT: vpsrld $16, %xmm3, %xmm3 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2 -; AVX-NEXT: vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] -; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2 -; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm1, %xmm2 -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX-NEXT: vpsrld $16, %xmm5, %xmm5 -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2 -; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm2 -; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 -; AVX-NEXT: vaddps %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: retq -; -; AVX512-LABEL: test_STRICT_UINT_TO_FP_no_inf16: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0 -; AVX512-NEXT: retq -entry: - %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f16.i32(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") - ret <16 x half> %vec -} diff --git a/llvm/test/CodeGen/X86/uint_to_half.ll b/llvm/test/CodeGen/X86/uint_to_half.ll new file mode 100644 index 0000000000000..32339745ba75f --- /dev/null +++ b/llvm/test/CodeGen/X86/uint_to_half.ll @@ -0,0 +1,200 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+f16c | FileCheck %s -check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+f16c | FileCheck %s -check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s -check-prefixes=AVX512 + +define <8 x half> @test_uitofp_v8i32_v8f16(<8 x i32> %a) { +; AVX1-LABEL: test_uitofp_v8i32_v8f16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_uitofp_v8i32_v8f16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; AVX2-NEXT: vsubps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_uitofp_v8i32_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtudq2ps %ymm0, %ymm0 +; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %vec = uitofp <8 x i32> %a to <8 x half> + ret <8 x half> %vec +} + +define <8 x half> @test_strict_uitofp_v8i32_v8f16(<8 x i32> %a) { +; AVX1-LABEL: test_strict_uitofp_v8i32_v8f16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_strict_uitofp_v8i32_v8f16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; AVX2-NEXT: vsubps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_strict_uitofp_v8i32_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtudq2ps %ymm0, %ymm0 +; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %vec = tail call <8 x half> @llvm.experimental.constrained.uitofp.f16.i32(<8 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <8 x half> %vec +} + +define <16 x half> @test_uitofp_v16i32_v16f16(<16 x i32> %a) { +; AVX1-LABEL: test_uitofp_v16i32_v16f16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] +; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpsrld $16, %xmm5, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm2 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vcvtps2ph $4, %ymm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_uitofp_v16i32_v16f16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; AVX2-NEXT: vsubps %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7],ymm1[8],ymm4[9],ymm1[10],ymm4[11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] +; AVX2-NEXT: vsubps %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vcvtps2ph $4, %ymm1, %xmm1 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_uitofp_v16i32_v16f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; AVX512-NEXT: retq + %vec = uitofp <16 x i32> %a to <16 x half> + ret <16 x half> %vec +} + +define <16 x half> @test_strict_uitofp_v16i32_v16f16(<16 x i32> %a) { +; AVX1-LABEL: test_strict_uitofp_v16i32_v16f16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsrld $16, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] +; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX1-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpsrld $16, %xmm5, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2 +; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm2 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vcvtps2ph $4, %ymm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_strict_uitofp_v16i32_v16f16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7],ymm0[8],ymm4[9],ymm0[10],ymm4[11],ymm0[12],ymm4[13],ymm0[14],ymm4[15] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; AVX2-NEXT: vsubps %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7],ymm1[8],ymm4[9],ymm1[10],ymm4[11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] +; AVX2-NEXT: vsubps %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vcvtps2ph $4, %ymm1, %xmm1 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_strict_uitofp_v16i32_v16f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtudq2ps %zmm0, %zmm0 +; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0 +; AVX512-NEXT: retq + %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f16.i32(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <16 x half> %vec +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} From bf3b19224b9188c38c64d184af32fae0732817ae Mon Sep 17 00:00:00 2001 From: abhishek-kaushik22 Date: Tue, 7 Jan 2025 20:51:52 +0530 Subject: [PATCH 11/11] Update uint_to_half.ll --- llvm/test/CodeGen/X86/uint_to_half.ll | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/X86/uint_to_half.ll b/llvm/test/CodeGen/X86/uint_to_half.ll index 32339745ba75f..b62a07eec1ce6 100644 --- a/llvm/test/CodeGen/X86/uint_to_half.ll +++ b/llvm/test/CodeGen/X86/uint_to_half.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+f16c | FileCheck %s -check-prefixes=AVX,AVX1 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+f16c | FileCheck %s -check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+f16c | FileCheck %s -check-prefixes=AVX1 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+f16c | FileCheck %s -check-prefixes=AVX2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s -check-prefixes=AVX512 define <8 x half> @test_uitofp_v8i32_v8f16(<8 x i32> %a) { @@ -196,5 +196,3 @@ define <16 x half> @test_strict_uitofp_v16i32_v16f16(<16 x i32> %a) { %vec = tail call <16 x half> @llvm.experimental.constrained.uitofp.f16.i32(<16 x i32> %a, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <16 x half> %vec } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; AVX: {{.*}}