diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index dba38f3e1a0bc..dd682e061e646 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -58574,6 +58574,7 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); SDLoc DL(N); @@ -58641,6 +58642,16 @@ static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp); } + if (VT == MVT::v4i32) { + SDValue HalfSrc; + // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16)))))) + // to remove XMM->GPR->XMM moves. + if (sd_match(Src, m_AnyExt(m_BitCast( + m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc)))))) + return DAG.getBitcast( + VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc)); + } + // See if we're broadcasting the scalar value, in which case just reuse that. // Ensure the same SDValue from the SDNode use is being used. if (VT.getScalarType() == Src.getValueType()) diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index a6b3e3fd1fd16..d67cd6b62c2b9 100644 --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -708,10 +708,8 @@ define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) { ; ; BF16-LABEL: pr62997: ; BF16: # %bb.0: -; BF16-NEXT: vpextrw $0, %xmm0, %eax -; BF16-NEXT: vpextrw $0, %xmm1, %ecx -; BF16-NEXT: vmovd %eax, %xmm0 -; BF16-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; BF16-NEXT: vpextrw $0, %xmm1, %eax +; BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ; BF16-NEXT: retq ; ; FP16-LABEL: pr62997: @@ -1652,66 +1650,63 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind { ; AVXNC-NEXT: pushq %r12 ; AVXNC-NEXT: pushq %rbx ; AVXNC-NEXT: subq $168, %rsp -; AVXNC-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVXNC-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVXNC-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVXNC-NEXT: vzeroupper ; AVXNC-NEXT: callq __truncdfbf2@PLT ; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVXNC-NEXT: # xmm0 = mem[1,0] +; AVXNC-NEXT: callq __truncdfbf2@PLT +; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVXNC-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVXNC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVXNC-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] ; AVXNC-NEXT: vzeroupper ; AVXNC-NEXT: callq __truncdfbf2@PLT -; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVXNC-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; AVXNC-NEXT: # xmm0 = mem[1,0] -; AVXNC-NEXT: callq __truncdfbf2@PLT -; AVXNC-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVXNC-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVXNC-NEXT: vzeroupper ; AVXNC-NEXT: callq __truncdfbf2@PLT ; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVXNC-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload ; AVXNC-NEXT: # xmm0 = mem[1,0] ; AVXNC-NEXT: callq __truncdfbf2@PLT ; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVXNC-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVXNC-NEXT: vzeroupper ; AVXNC-NEXT: callq __truncdfbf2@PLT -; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVXNC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVXNC-NEXT: # xmm0 = mem[1,0] ; AVXNC-NEXT: callq __truncdfbf2@PLT -; AVXNC-NEXT: vpextrw $0, %xmm0, %eax -; AVXNC-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx +; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVXNC-NEXT: vpextrw $0, %xmm0, %ebp ; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVXNC-NEXT: vpextrw $0, %xmm0, %r14d ; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVXNC-NEXT: vpextrw $0, %xmm0, %r15d -; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVXNC-NEXT: vpextrw $0, %xmm0, %r12d ; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVXNC-NEXT: vpextrw $0, %xmm0, %r13d ; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx -; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVXNC-NEXT: # xmm0 = mem[1,0] ; AVXNC-NEXT: callq __truncdfbf2@PLT ; AVXNC-NEXT: vpextrw $0, %xmm0, %eax -; AVXNC-NEXT: vmovd %ebx, %xmm0 -; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; AVXNC-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0 +; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vpinsrw $1, %r13d, %xmm0, %xmm0 +; AVXNC-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0 -; AVXNC-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVXNC-NEXT: vpinsrw $7, %ebx, %xmm0, %xmm0 ; AVXNC-NEXT: addq $168, %rsp ; AVXNC-NEXT: popq %rbx ; AVXNC-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll index fdf0bf3f692d6..e911a24d830f7 100644 --- a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll +++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll @@ -133,11 +133,7 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind { ; ; AVX512-LABEL: complex_canonicalize_fmul_half: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpextrw $0, %xmm1, %eax -; AVX512-NEXT: vpextrw $0, %xmm0, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vmovd %eax, %xmm1 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/cvt16.ll b/llvm/test/CodeGen/X86/cvt16.ll index c7ef353f7f603..efc457e35e7f3 100644 --- a/llvm/test/CodeGen/X86/cvt16.ll +++ b/llvm/test/CodeGen/X86/cvt16.ll @@ -59,8 +59,7 @@ define float @test2(ptr nocapture %src) nounwind { ; ; F16C-LABEL: test2: ; F16C: # %bb.0: -; F16C-NEXT: movzwl (%rdi), %eax -; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: retq ; @@ -119,8 +118,7 @@ define double @test4(ptr nocapture %src) nounwind { ; ; F16C-LABEL: test4: ; F16C: # %bb.0: -; F16C-NEXT: movzwl (%rdi), %eax -; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; F16C-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fp-roundeven.ll b/llvm/test/CodeGen/X86/fp-roundeven.ll index 8037c783dd8e6..7d1c52cd65451 100644 --- a/llvm/test/CodeGen/X86/fp-roundeven.ll +++ b/llvm/test/CodeGen/X86/fp-roundeven.ll @@ -50,8 +50,6 @@ define half @roundeven_f16(half %h) { ; ; AVX512F-LABEL: roundeven_f16: ; AVX512F: ## %bb.0: ## %entry -; AVX512F-NEXT: vpextrw $0, %xmm0, %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512F-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll index 1515cd1366bc6..0d8290b120fa4 100644 --- a/llvm/test/CodeGen/X86/fp16-libcalls.ll +++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll @@ -9,8 +9,6 @@ define void @test_half_ceil(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_ceil: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -107,8 +105,6 @@ define void @test_half_cos(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq cosf@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -168,8 +164,6 @@ define void @test_half_exp(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq expf@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -229,8 +223,6 @@ define void @test_half_exp2(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq exp2f@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -290,8 +282,6 @@ define void @test_half_exp10(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq exp10f@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -349,8 +339,6 @@ define void @test_half_exp10(half %a0, ptr %p0) nounwind { define void @test_half_fabs(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_fabs: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -392,8 +380,6 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind { define void @test_half_floor(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_floor: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -447,14 +433,8 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm2, %eax -; F16C-NEXT: vpextrw $0, %xmm1, %ecx -; F16C-NEXT: vpextrw $0, %xmm0, %edx -; F16C-NEXT: vmovd %edx, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: vmovd %ecx, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; F16C-NEXT: vmovd %eax, %xmm2 ; F16C-NEXT: vcvtph2ps %xmm2, %xmm2 ; F16C-NEXT: callq fmaf@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -542,8 +522,6 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind { define void @test_half_fneg(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_fneg: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -587,8 +565,6 @@ define void @test_half_log(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq logf@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -648,8 +624,6 @@ define void @test_half_log2(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq log2f@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -709,8 +683,6 @@ define void @test_half_log10(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq log10f@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -768,8 +740,6 @@ define void @test_half_log10(half %a0, ptr %p0) nounwind { define void @test_half_nearbyint(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_nearbyint: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -823,11 +793,7 @@ define void @test_half_pow(half %a0, half %a1, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm1, %eax -; F16C-NEXT: vpextrw $0, %xmm0, %ecx -; F16C-NEXT: vmovd %ecx, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: vmovd %eax, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 ; F16C-NEXT: callq powf@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -907,8 +873,6 @@ define void @test_half_powi(half %a0, i32 %a1, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rsi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq __powisf2@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -976,8 +940,6 @@ define void @test_half_powi(half %a0, i32 %a1, ptr %p0) nounwind { define void @test_half_rint(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_rint: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -1031,8 +993,6 @@ define void @test_half_sin(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq sinf@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -1090,8 +1050,6 @@ define void @test_half_sin(half %a0, ptr %p0) nounwind { define void @test_half_sqrt(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_sqrt: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -1146,8 +1104,6 @@ define void @test_half_tan(half %a0, ptr %p0) nounwind { ; F16C: # %bb.0: ; F16C-NEXT: pushq %rbx ; F16C-NEXT: movq %rdi, %rbx -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: callq tanf@PLT ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -1205,8 +1161,6 @@ define void @test_half_tan(half %a0, ptr %p0) nounwind { define void @test_half_trunc(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_trunc: ; F16C: # %bb.0: -; F16C-NEXT: vpextrw $0, %xmm0, %eax -; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/half-darwin.ll b/llvm/test/CodeGen/X86/half-darwin.ll index 1d2f4eb39bbe6..ec099db4e7ca7 100644 --- a/llvm/test/CodeGen/X86/half-darwin.ll +++ b/llvm/test/CodeGen/X86/half-darwin.ll @@ -76,8 +76,7 @@ define float @extendhfsf(ptr %ptr) nounwind { ; ; CHECK-F16C-LABEL: extendhfsf: ; CHECK-F16C: ## %bb.0: -; CHECK-F16C-NEXT: movzwl (%rdi), %eax -; CHECK-F16C-NEXT: vmovd %eax, %xmm0 +; CHECK-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-F16C-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/half-fp80-darwin.ll b/llvm/test/CodeGen/X86/half-fp80-darwin.ll index 0ba734e66c7b2..65a26187c5857 100644 --- a/llvm/test/CodeGen/X86/half-fp80-darwin.ll +++ b/llvm/test/CodeGen/X86/half-fp80-darwin.ll @@ -19,8 +19,7 @@ define void @extendhfxf(ptr %outptr, ptr %inptr) nounwind { ; ; CHECK-F16C-LABEL: extendhfxf: ; CHECK-F16C: ## %bb.0: -; CHECK-F16C-NEXT: movzwl (%rsi), %eax -; CHECK-F16C-NEXT: vmovd %eax, %xmm0 +; CHECK-F16C-NEXT: vpinsrw $0, (%rsi), %xmm0, %xmm0 ; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-F16C-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-F16C-NEXT: flds -{{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll index 033cadae6a1e7..7bac075e48680 100644 --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -81,8 +81,7 @@ define float @test_extend32(ptr %addr) #0 { ; ; BWON-F16C-LABEL: test_extend32: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl (%rdi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: retq ; @@ -113,8 +112,7 @@ define double @test_extend64(ptr %addr) #0 { ; ; BWON-F16C-LABEL: test_extend64: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl (%rdi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; BWON-F16C-NEXT: retq @@ -220,8 +218,7 @@ define i64 @test_fptosi_i64(ptr %p) #0 { ; ; BWON-F16C-LABEL: test_fptosi_i64: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl (%rdi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvttss2si %xmm0, %rax ; BWON-F16C-NEXT: retq @@ -312,8 +309,7 @@ define i64 @test_fptoui_i64(ptr %p) #0 { ; ; BWON-F16C-LABEL: test_fptoui_i64: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl (%rdi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 +; BWON-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvttss2si %xmm0, %rcx ; BWON-F16C-NEXT: movq %rcx, %rdx @@ -851,13 +847,12 @@ define float @test_sitofp_fadd_i32(i32 %a, ptr %b) #0 { ; ; BWON-F16C-LABEL: test_sitofp_fadd_i32: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: movzwl (%rsi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm1 +; BWON-F16C-NEXT: vpinsrw $0, (%rsi), %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtsi2ss %edi, %xmm1, %xmm1 +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; BWON-F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: retq @@ -916,8 +911,6 @@ define half @PR40273(half) #0 { ; ; BWON-F16C-LABEL: PR40273: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: xorl %eax, %eax ; BWON-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 @@ -969,8 +962,6 @@ define void @brcond(half %0) #0 { ; ; BWON-F16C-LABEL: brcond: ; BWON-F16C: # %bb.0: # %entry -; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; BWON-F16C-NEXT: vucomiss %xmm1, %xmm0 @@ -1024,8 +1015,6 @@ define half @test_sqrt(half %0) #0 { ; ; BWON-F16C-LABEL: test_sqrt: ; BWON-F16C: # %bb.0: # %entry -; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; BWON-F16C-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -1136,9 +1125,7 @@ define void @main.45() #0 { ; CHECK-LIBCALL-NEXT: pushq %rbx ; CHECK-LIBCALL-NEXT: pushq %rax ; CHECK-LIBCALL-NEXT: pinsrw $0, (%rax), %xmm0 -; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax -; CHECK-LIBCALL-NEXT: movd %eax, %xmm1 -; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] ; CHECK-LIBCALL-NEXT: movq %xmm1, %rbx ; CHECK-LIBCALL-NEXT: movq %rbx, %r14 ; CHECK-LIBCALL-NEXT: shrq $48, %r14 @@ -1167,15 +1154,12 @@ define void @main.45() #0 { ; BWON-F16C-LABEL: main.45: ; BWON-F16C: # %bb.0: # %entry ; BWON-F16C-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm0 -; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax ; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; BWON-F16C-NEXT: vmovd %eax, %xmm1 -; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm1 ; BWON-F16C-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; BWON-F16C-NEXT: vcmpunordps %xmm2, %xmm0, %xmm0 -; BWON-F16C-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; BWON-F16C-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; BWON-F16C-NEXT: vcmpunordps %xmm2, %xmm1, %xmm1 +; BWON-F16C-NEXT: vpackssdw %xmm1, %xmm1, %xmm1 +; BWON-F16C-NEXT: vpblendvb %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BWON-F16C-NEXT: vmovq %xmm0, (%rax) ; BWON-F16C-NEXT: retq ; @@ -1185,12 +1169,11 @@ define void @main.45() #0 { ; CHECK-I686-NEXT: pushl %esi ; CHECK-I686-NEXT: subl $20, %esp ; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 -; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax -; CHECK-I686-NEXT: movd %eax, %xmm0 -; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-I686-NEXT: movd %xmm0, %esi +; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; CHECK-I686-NEXT: movd %xmm1, %esi ; CHECK-I686-NEXT: movl %esi, %edi ; CHECK-I686-NEXT: shrl $16, %edi +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax ; CHECK-I686-NEXT: movw %ax, (%esp) ; CHECK-I686-NEXT: calll __extendhfsf2 ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) @@ -1336,13 +1319,9 @@ define half @pr61271(half %0, half %1) #0 { ; ; BWON-F16C-LABEL: pr61271: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vpextrw $0, %xmm0, %eax -; BWON-F16C-NEXT: vpextrw $0, %xmm1, %ecx -; BWON-F16C-NEXT: vmovd %ecx, %xmm0 -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: vmovd %eax, %xmm1 ; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; BWON-F16C-NEXT: vminss %xmm0, %xmm1, %xmm0 +; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: vminss %xmm1, %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; BWON-F16C-NEXT: vmovd %xmm0, %eax ; BWON-F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr114520.ll b/llvm/test/CodeGen/X86/pr114520.ll index c557da6b3ab8c..9bd1f49ff67c9 100644 --- a/llvm/test/CodeGen/X86/pr114520.ll +++ b/llvm/test/CodeGen/X86/pr114520.ll @@ -5,7 +5,6 @@ define half @test1(half %x) { ; CHECK-LABEL: test1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpextrw $0, %xmm0, %eax -; CHECK-NEXT: vmovd %eax, %xmm0 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movl $64512, %ecx # imm = 0xFC00 diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll index ce37622c476db..1c9c8e40c009d 100644 --- a/llvm/test/CodeGen/X86/pr31088.ll +++ b/llvm/test/CodeGen/X86/pr31088.ll @@ -51,17 +51,7 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind { ; ; F16C-O0-LABEL: ir_fadd_v1f16: ; F16C-O0: # %bb.0: -; F16C-O0-NEXT: vpextrw $0, %xmm1, %eax -; F16C-O0-NEXT: movw %ax, %cx -; F16C-O0-NEXT: # implicit-def: $eax -; F16C-O0-NEXT: movw %cx, %ax -; F16C-O0-NEXT: vmovd %eax, %xmm1 ; F16C-O0-NEXT: vcvtph2ps %xmm1, %xmm1 -; F16C-O0-NEXT: vpextrw $0, %xmm0, %eax -; F16C-O0-NEXT: movw %ax, %cx -; F16C-O0-NEXT: # implicit-def: $eax -; F16C-O0-NEXT: movw %cx, %ax -; F16C-O0-NEXT: vmovd %eax, %xmm0 ; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-O0-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; F16C-O0-NEXT: vcvtps2ph $4, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr86305.ll b/llvm/test/CodeGen/X86/pr86305.ll index 79b42bb2532ca..0d2e1abe8e5fc 100644 --- a/llvm/test/CodeGen/X86/pr86305.ll +++ b/llvm/test/CodeGen/X86/pr86305.ll @@ -28,17 +28,16 @@ define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind { ; CHECK-LABEL: fptrunc_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: callq __truncsfbf2@PLT ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] ; CHECK-NEXT: callq __truncsfbf2@PLT -; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = mem[3,3,3,3] ; CHECK-NEXT: callq __truncsfbf2@PLT @@ -49,24 +48,21 @@ define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind { ; CHECK-NEXT: vpextrw $0, %xmm0, %ebp ; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vpextrw $0, %xmm0, %r14d -; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vpextrw $0, %xmm0, %r15d -; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] ; CHECK-NEXT: callq __truncsfbf2@PLT ; CHECK-NEXT: vpextrw $0, %xmm0, %eax -; CHECK-NEXT: vmovd %r15d, %xmm0 -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; CHECK-NEXT: vpinsrw $2, %r14d, %xmm0, %xmm0 +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vpinsrw $1, %r14d, %xmm0, %xmm0 +; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ; CHECK-NEXT: vpinsrw $3, %ebp, %xmm0, %xmm0 ; CHECK-NEXT: vpinsrw $4, %ebx, %xmm0, %xmm0 ; CHECK-NEXT: vpinsrw $5, %ebx, %xmm0, %xmm0 ; CHECK-NEXT: vpinsrw $6, %ebx, %xmm0, %xmm0 ; CHECK-NEXT: vpinsrw $7, %ebx, %xmm0, %xmm0 -; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: addq $64, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq %b = fptrunc <4 x float> %a to <4 x bfloat> diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll index 54acd012d1fe4..ec916060563a7 100644 --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -814,15 +814,13 @@ define float @load_cvt_i16_to_f32(ptr %a0) nounwind { ; ; F16C-LABEL: load_cvt_i16_to_f32: ; F16C: # %bb.0: -; F16C-NEXT: movzwl (%rdi), %eax -; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: retq ; ; AVX512-LABEL: load_cvt_i16_to_f32: ; AVX512: # %bb.0: -; AVX512-NEXT: movzwl (%rdi), %eax -; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = load i16, ptr %a0 @@ -1830,16 +1828,14 @@ define double @load_cvt_i16_to_f64(ptr %a0) nounwind { ; ; F16C-LABEL: load_cvt_i16_to_f64: ; F16C: # %bb.0: -; F16C-NEXT: movzwl (%rdi), %eax -; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; F16C-NEXT: retq ; ; AVX512-LABEL: load_cvt_i16_to_f64: ; AVX512: # %bb.0: -; AVX512-NEXT: movzwl (%rdi), %eax -; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 ; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq