diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index de29a76882148..27d9388f379ec 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -7395,6 +7395,36 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( } } break; + case X86::VEXTRACTF128rri: + case X86::VEXTRACTI128rri: + // Replaces subvector extraction with a load. + // TODO: Add AVX512 variants. + if (OpNum == 1) { + unsigned Idx = MI.getOperand(MI.getNumOperands() - 1).getImm(); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), 0, &RI, MF); + unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; + assert((RCSize == 16) && "Unexpected dst register size"); + int PtrOffset = Idx * RCSize; + + unsigned NewOpCode; + switch (MI.getOpcode()) { + case X86::VEXTRACTF128rri: + NewOpCode = Alignment < Align(RCSize) ? X86::VMOVUPSrm : X86::VMOVAPSrm; + break; + case X86::VEXTRACTI128rri: + NewOpCode = Alignment < Align(RCSize) ? X86::VMOVDQUrm : X86::VMOVDQArm; + break; + default: + llvm_unreachable("Unexpected EXTRACT_SUBVECTOR instruction"); + } + + MachineInstr *NewMI = + fuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset); + NewMI->removeOperand(NewMI->getNumOperands() - 1); + return NewMI; + } + break; case X86::MOV32r0: if (auto *NewMI = makeM0Inst(*this, (Size == 4) ? X86::MOV32mi : X86::MOV64mi32, MOs, diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index 684e2921b789e..817a2a1ec0d87 100644 --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -1743,11 +1743,9 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind { ; AVXNC-NEXT: # xmm0 = mem[1,0] ; AVXNC-NEXT: callq __truncdfbf2@PLT ; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVXNC-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVXNC-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVXNC-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVXNC-NEXT: vzeroupper ; AVXNC-NEXT: callq __truncdfbf2@PLT ; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVXNC-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload @@ -1759,10 +1757,8 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind { ; AVXNC-NEXT: # xmm0 = mem[1,0] ; AVXNC-NEXT: callq __truncdfbf2@PLT ; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVXNC-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVXNC-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVXNC-NEXT: vzeroupper ; AVXNC-NEXT: callq __truncdfbf2@PLT ; AVXNC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll index f26960b069b0e..f776190e9cd16 100644 --- a/llvm/test/CodeGen/X86/fma.ll +++ b/llvm/test/CodeGen/X86/fma.ll @@ -1221,23 +1221,20 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; 
FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x0c,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0x70,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0x90,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60] -; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03] -; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill @@ -1981,21 +1978,18 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x45,0x20] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0x10,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm1 ## 
encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 32-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xf0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x20] -; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x15,0xc1] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[1],xmm1[1] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 67c9e7cc22236..9c5b9e39b9864 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -290,11 +290,9 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) { ; CHECK-AVX2-NEXT: # xmm0 = mem[3,3,3,3] ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vzeroupper ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/frem.ll b/llvm/test/CodeGen/X86/frem.ll index 959265d08299a..c117cb0787460 100644 --- a/llvm/test/CodeGen/X86/frem.ll +++ b/llvm/test/CodeGen/X86/frem.ll @@ -130,14 +130,10 @@ define void @frem_v16f32(<16 x float> %a0, <16 x float> %a1, ptr%p3) nounwind { ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm2, %xmm0 -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq fmodf@PLT ; 
CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -195,14 +191,10 @@ define void @frem_v16f32(<16 x float> %a0, <16 x float> %a1, ptr%p3) nounwind { ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm2, %xmm0 -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -280,14 +272,10 @@ define void @frem_v8f32(<8 x float> %a0, <8 x float> %a1, ptr%p3) nounwind { ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm2, %xmm0 -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -389,14 +377,10 @@ define void @frem_v8f64(<8 x double> %a0, <8 x double> %a1, ptr%p3) nounwind { ; CHECK-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload ; CHECK-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm2, %xmm0 -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -422,14 +406,10 @@ define void @frem_v8f64(<8 x double> %a0, <8 x double> %a1, ptr%p3) nounwind { ; CHECK-NEXT: 
vmovapd (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm2, %xmm0 -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -475,14 +455,10 @@ define void @frem_v4f64(<4 x double> %a0, <4 x double> %a1, ptr%p3) nounwind { ; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovaps %xmm2, %xmm0 -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -545,11 +521,9 @@ define void @frem_v32f16(<32 x half> %a0, <32 x half> %a1, ptr%p3) nounwind { ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: vmovd %xmm0, (%rsp) # 4-byte Folded Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: vmovss (%rsp), %xmm1 # 4-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero @@ -773,18 +747,15 @@ define void @frem_v32f16(<32 x half> %a0, <32 x half> %a1, ptr%p3) nounwind { ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: 
callq __extendhfsf2@PLT ; CHECK-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero @@ -1033,11 +1004,9 @@ define void @frem_v16f16(<16 x half> %a0, <16 x half> %a1, ptr%p3) nounwind { ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero diff --git a/llvm/test/CodeGen/X86/vec-libcalls.ll b/llvm/test/CodeGen/X86/vec-libcalls.ll index b107b1c2749cc..80704fd297c8b 100644 --- a/llvm/test/CodeGen/X86/vec-libcalls.ll +++ b/llvm/test/CodeGen/X86/vec-libcalls.ll @@ -202,8 +202,7 @@ define <5 x float> @sin_v5f32(<5 x float> %x) nounwind { ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -274,8 +273,7 @@ define <3 x double> @sin_v3f64(<3 x double> %x) nounwind { ; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -396,8 +394,7 @@ define <5 x float> @tan_v5f32(<5 x float> %x) nounwind { ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq tanf@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -468,8 +465,7 @@ define <3 x double> @tan_v3f64(<3 x double> %x) nounwind { ; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq tan@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -590,8 +586,7 @@ define <5 x float> @acos_v5f32(<5 x float> %x) nounwind { ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq acosf@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -662,8 +657,7 @@ define <3 x double> @acos_v3f64(<3 x double> %x) nounwind { ; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq acos@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -784,8 +778,7 @@ define <5 x float> @asin_v5f32(<5 x float> %x) nounwind { ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq asinf@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -856,8 +849,7 @@ define <3 x double> @asin_v3f64(<3 x double> %x) nounwind { ; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq asin@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -978,8 +970,7 @@ define <5 x float> @atan_v5f32(<5 x float> %x) nounwind { ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq atanf@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -1050,8 +1041,7 @@ define <3 x double> @atan_v3f64(<3 x double> %x) nounwind { ; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq atan@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -1172,8 +1162,7 @@ define <5 x float> 
@cosh_v5f32(<5 x float> %x) nounwind { ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq coshf@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -1244,8 +1233,7 @@ define <3 x double> @cosh_v3f64(<3 x double> %x) nounwind { ; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq cosh@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -1366,8 +1354,7 @@ define <5 x float> @sinh_v5f32(<5 x float> %x) nounwind { ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq sinhf@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -1438,8 +1425,7 @@ define <3 x double> @sinh_v3f64(<3 x double> %x) nounwind { ; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq sinh@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -1560,8 +1546,7 @@ define <5 x float> @tanh_v5f32(<5 x float> %x) nounwind { ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq tanhf@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -1632,8 +1617,7 @@ define <3 x double> @tanh_v3f64(<3 x double> %x) nounwind { ; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq tanh@PLT ; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll index 8510b1031d717..9880d20e1d80d 100644 --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -199,211 +199,107 @@ define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind { 
} define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind { -; AVX1-LABEL: cvt_16i16_to_16f32: -; AVX1: # %bb.0: -; AVX1-NEXT: subq $104, %rsp -; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[2,3,0,1] -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[3,3,3,3] -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[1,1,3,3] -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = 
mem[2,3,0,1] -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[3,3,3,3] -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[1,1,3,3] -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: addq $104, %rsp -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_16i16_to_16f32: -; AVX2: # %bb.0: -; AVX2-NEXT: subq $104, %rsp -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[2,3,0,1] -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpsrldq 
{{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[1,1,3,3] -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[2,3,0,1] -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[1,1,3,3] -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: addq $104, %rsp -; AVX2-NEXT: retq +; AVX-LABEL: cvt_16i16_to_16f32: +; AVX: # %bb.0: +; AVX-NEXT: subq $104, %rsp +; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[2,3,0,1] +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload +; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[2,3,0,1] +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: addq $104, %rsp +; AVX-NEXT: retq ; ; F16C-LABEL: cvt_16i16_to_16f32: ; F16C: # %bb.0: @@ -571,211 +467,107 @@ define <8 x float> @cvt_8i16_to_8f32_constrained(<8 x i16> %a0) nounwind strictf declare <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half>, metadata) strictfp define <16 x float> @cvt_16i16_to_16f32_constrained(<16 x i16> %a0) nounwind strictfp { -; AVX1-LABEL: cvt_16i16_to_16f32_constrained: -; AVX1: # %bb.0: -; 
AVX1-NEXT: subq $104, %rsp -; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[2,3,0,1] -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[3,3,3,3] -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[1,1,3,3] -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[2,3,0,1] -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; 
AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[3,3,3,3] -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[1,1,3,3] -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: addq $104, %rsp -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_16i16_to_16f32_constrained: -; AVX2: # %bb.0: -; AVX2-NEXT: subq $104, %rsp -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[2,3,0,1] -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: 
vzeroupper -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[1,1,3,3] -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[2,3,0,1] -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; 
AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[1,1,3,3] -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX2-NEXT: callq __extendhfsf2@PLT -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: addq $104, %rsp -; AVX2-NEXT: retq +; AVX-LABEL: cvt_16i16_to_16f32_constrained: +; AVX: # %bb.0: +; AVX-NEXT: subq $104, %rsp +; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[2,3,0,1] +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __extendhfsf2@PLT +; 
AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[2,3,0,1] +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilps $245, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,1,3,3] +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX-NEXT: callq __extendhfsf2@PLT +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: addq $104, %rsp +; AVX-NEXT: retq ; ; F16C-LABEL: cvt_16i16_to_16f32_constrained: ; F16C: # %bb.0: @@ -2388,11 +2180,9 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind { ; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte 
Reload ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2468,11 +2258,9 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind { ; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2868,11 +2656,9 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind { ; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2953,11 +2739,9 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind { ; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -3150,67 +2934,34 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind { } define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { -; AVX1-LABEL: cvt_4f64_to_4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: subq $88, %rsp -; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; 
AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero -; AVX1-NEXT: addq $88, %rsp -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_4f64_to_4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: subq $88, %rsp -; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero -; AVX2-NEXT: addq $88, %rsp -; AVX2-NEXT: retq +; AVX-LABEL: cvt_4f64_to_4i16: +; AVX: # %bb.0: +; AVX-NEXT: subq $88, %rsp +; AVX-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload +; AVX-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = 
xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; AVX-NEXT: addq $88, %rsp +; AVX-NEXT: retq ; ; F16C-LABEL: cvt_4f64_to_4i16: ; F16C: # %bb.0: @@ -3282,67 +3033,34 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { } define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { -; AVX1-LABEL: cvt_4f64_to_8i16_undef: -; AVX1: # %bb.0: -; AVX1-NEXT: subq $88, %rsp -; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero -; AVX1-NEXT: addq $88, %rsp -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_4f64_to_8i16_undef: -; AVX2: # %bb.0: -; AVX2-NEXT: subq $88, %rsp -; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero -; AVX2-NEXT: addq $88, %rsp -; AVX2-NEXT: retq +; AVX-LABEL: cvt_4f64_to_8i16_undef: +; AVX: # %bb.0: +; AVX-NEXT: subq $88, %rsp +; AVX-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vshufpd 
{{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload +; AVX-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; AVX-NEXT: addq $88, %rsp +; AVX-NEXT: retq ; ; F16C-LABEL: cvt_4f64_to_8i16_undef: ; F16C: # %bb.0: @@ -3415,67 +3133,34 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { } define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { -; AVX1-LABEL: cvt_4f64_to_8i16_zero: -; AVX1: # %bb.0: -; AVX1-NEXT: subq $88, %rsp -; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero -; AVX1-NEXT: addq $88, %rsp -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_4f64_to_8i16_zero: -; AVX2: # %bb.0: -; AVX2-NEXT: subq $88, %rsp -; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero -; AVX2-NEXT: addq $88, %rsp -; AVX2-NEXT: retq +; AVX-LABEL: cvt_4f64_to_8i16_zero: +; AVX: # %bb.0: +; AVX-NEXT: subq $88, %rsp +; AVX-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload +; AVX-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; AVX-NEXT: addq $88, %rsp +; AVX-NEXT: retq ; ; F16C-LABEL: cvt_4f64_to_8i16_zero: ; F16C: # %bb.0: @@ -3572,10 +3257,8 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind { ; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vzeroupper ; AVX-NEXT: callq __truncdfhf2@PLT ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload @@ -3630,10 +3313,8 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind { ; F16C-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; F16C-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; F16C-NEXT: vextractf128 
$1, %ymm0, %xmm0 +; F16C-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; F16C-NEXT: vzeroupper ; F16C-NEXT: callq __truncdfhf2@PLT ; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload @@ -3659,63 +3340,119 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind { ; F16C-NEXT: addq $104, %rsp ; F16C-NEXT: retq ; -; AVX512-LABEL: cvt_8f64_to_8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: subq $120, %rsp -; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,0] -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,0] -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,0] -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,0] -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512-NEXT: 
vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: addq $120, %rsp -; AVX512-NEXT: retq +; AVX512F-LABEL: cvt_8f64_to_8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: subq $120, %rsp +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512F-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: callq __truncdfhf2@PLT +; AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512F-NEXT: # xmm0 = mem[1,0] +; AVX512F-NEXT: callq __truncdfhf2@PLT +; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: callq __truncdfhf2@PLT +; AVX512F-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512F-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512F-NEXT: # xmm0 = mem[1,0] +; AVX512F-NEXT: callq __truncdfhf2@PLT +; AVX512F-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: callq __truncdfhf2@PLT +; AVX512F-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512F-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512F-NEXT: # xmm0 = mem[1,0] +; AVX512F-NEXT: callq __truncdfhf2@PLT +; AVX512F-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512F-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload +; AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: callq __truncdfhf2@PLT +; AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512F-NEXT: # xmm0 = mem[1,0] +; AVX512F-NEXT: callq __truncdfhf2@PLT +; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512F-NEXT: addq $120, %rsp +; AVX512F-NEXT: retq +; +; AVX512-FASTLANE-LABEL: cvt_8f64_to_8i16: +; AVX512-FASTLANE: # %bb.0: +; AVX512-FASTLANE-NEXT: subq $120, %rsp +; AVX512-FASTLANE-NEXT: vmovups 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FASTLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512-FASTLANE-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-FASTLANE-NEXT: vzeroupper +; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT +; AVX512-FASTLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FASTLANE-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-FASTLANE-NEXT: # xmm0 = mem[1,0] +; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT +; AVX512-FASTLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-FASTLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-FASTLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FASTLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FASTLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; AVX512-FASTLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FASTLANE-NEXT: vzeroupper +; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT +; AVX512-FASTLANE-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-FASTLANE-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-FASTLANE-NEXT: # xmm0 = mem[1,0] +; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT +; AVX512-FASTLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-FASTLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-FASTLANE-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-FASTLANE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-FASTLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FASTLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FASTLANE-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-FASTLANE-NEXT: vzeroupper +; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT +; AVX512-FASTLANE-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-FASTLANE-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-FASTLANE-NEXT: # xmm0 = mem[1,0] +; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT +; AVX512-FASTLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-FASTLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-FASTLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-FASTLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FASTLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-FASTLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FASTLANE-NEXT: vzeroupper +; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT +; AVX512-FASTLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FASTLANE-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-FASTLANE-NEXT: # xmm0 = mem[1,0] +; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT +; AVX512-FASTLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-FASTLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-FASTLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-FASTLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-FASTLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-FASTLANE-NEXT: # xmm0 = xmm0[0],mem[0] +; 
AVX512-FASTLANE-NEXT: addq $120, %rsp +; AVX512-FASTLANE-NEXT: retq %1 = fptrunc <8 x double> %a0 to <8 x half> %2 = bitcast <8 x half> %1 to <8 x i16> ret <8 x i16> %2 @@ -3827,75 +3564,38 @@ define void @store_cvt_2f64_to_2i16(<2 x double> %a0, ptr %a1) nounwind { } define void @store_cvt_4f64_to_4i16(<4 x double> %a0, ptr %a1) nounwind { -; AVX1-LABEL: store_cvt_4f64_to_4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: subq $80, %rsp -; AVX1-NEXT: movq %rdi, %rbx -; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vpextrw $0, %xmm0, 4(%rbx) -; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpextrw $0, %xmm0, (%rbx) -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpextrw $0, %xmm0, 6(%rbx) -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpextrw $0, %xmm0, 2(%rbx) -; AVX1-NEXT: addq $80, %rsp -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_cvt_4f64_to_4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $80, %rsp -; AVX2-NEXT: movq %rdi, %rbx -; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vpextrw $0, %xmm0, 4(%rbx) -; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-NEXT: vpextrw $0, %xmm0, (%rbx) -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vpextrw $0, %xmm0, 6(%rbx) -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-NEXT: vpextrw $0, %xmm0, 2(%rbx) -; AVX2-NEXT: addq $80, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: retq +; AVX-LABEL: store_cvt_4f64_to_4i16: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $80, %rsp +; AVX-NEXT: movq %rdi, %rbx +; AVX-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; 
AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload +; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vpextrw $0, %xmm0, 4(%rbx) +; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpextrw $0, %xmm0, 6(%rbx) +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpextrw $0, %xmm0, 2(%rbx) +; AVX-NEXT: addq $80, %rsp +; AVX-NEXT: popq %rbx +; AVX-NEXT: retq ; ; F16C-LABEL: store_cvt_4f64_to_4i16: ; F16C: # %bb.0: @@ -3971,75 +3671,38 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, ptr %a1) nounwind { } define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind { -; AVX1-LABEL: store_cvt_4f64_to_8i16_undef: -; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: subq $80, %rsp -; AVX1-NEXT: movq %rdi, %rbx -; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero -; AVX1-NEXT: vmovaps %xmm0, (%rbx) -; AVX1-NEXT: addq $80, %rsp -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_cvt_4f64_to_8i16_undef: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $80, %rsp -; AVX2-NEXT: movq %rdi, %rbx -; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; 
AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero -; AVX2-NEXT: vmovaps %xmm0, (%rbx) -; AVX2-NEXT: addq $80, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: retq +; AVX-LABEL: store_cvt_4f64_to_8i16_undef: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $80, %rsp +; AVX-NEXT: movq %rdi, %rbx +; AVX-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload +; AVX-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; AVX-NEXT: vmovaps %xmm0, (%rbx) +; AVX-NEXT: addq $80, %rsp +; AVX-NEXT: popq %rbx +; AVX-NEXT: retq ; ; F16C-LABEL: store_cvt_4f64_to_8i16_undef: ; F16C: # %bb.0: @@ -4121,75 +3784,38 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind { } define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind { -; AVX1-LABEL: store_cvt_4f64_to_8i16_zero: -; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: subq $80, %rsp -; AVX1-NEXT: movq %rdi, %rbx -; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovups (%rsp), 
%ymm0 # 32-byte Reload -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero -; AVX1-NEXT: vmovaps %xmm0, (%rbx) -; AVX1-NEXT: addq $80, %rsp -; AVX1-NEXT: popq %rbx -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_cvt_4f64_to_8i16_zero: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $80, %rsp -; AVX2-NEXT: movq %rdi, %rbx -; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero -; AVX2-NEXT: vmovaps %xmm0, (%rbx) -; AVX2-NEXT: addq $80, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: retq +; AVX-LABEL: store_cvt_4f64_to_8i16_zero: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $80, %rsp +; AVX-NEXT: movq %rdi, %rbx +; AVX-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload +; AVX-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: 
callq __truncdfhf2@PLT +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; AVX-NEXT: vmovaps %xmm0, (%rbx) +; AVX-NEXT: addq $80, %rsp +; AVX-NEXT: popq %rbx +; AVX-NEXT: retq ; ; F16C-LABEL: store_cvt_4f64_to_8i16_zero: ; F16C: # %bb.0: @@ -4297,10 +3923,8 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, ptr %a1) nounwind { ; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vzeroupper ; AVX-NEXT: callq __truncdfhf2@PLT ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload @@ -4359,10 +3983,8 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, ptr %a1) nounwind { ; F16C-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; F16C-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; F16C-NEXT: vextractf128 $1, %ymm0, %xmm0 +; F16C-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload ; F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; F16C-NEXT: vzeroupper ; F16C-NEXT: callq __truncdfhf2@PLT ; F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload @@ -4390,67 +4012,127 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, ptr %a1) nounwind { ; F16C-NEXT: popq %rbx ; F16C-NEXT: retq ; -; AVX512-LABEL: store_cvt_8f64_to_8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $112, %rsp -; AVX512-NEXT: movq %rdi, %rbx -; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,0] -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,0] 
-; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,0] -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = mem[1,0] -; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: vmovdqa %xmm0, (%rbx) -; AVX512-NEXT: addq $112, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: retq +; AVX512F-LABEL: store_cvt_8f64_to_8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: subq $112, %rsp +; AVX512F-NEXT: movq %rdi, %rbx +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512F-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: callq __truncdfhf2@PLT +; AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512F-NEXT: # xmm0 = mem[1,0] +; AVX512F-NEXT: callq __truncdfhf2@PLT +; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: callq __truncdfhf2@PLT +; AVX512F-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512F-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512F-NEXT: # xmm0 = mem[1,0] +; AVX512F-NEXT: callq __truncdfhf2@PLT +; AVX512F-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: callq __truncdfhf2@PLT
+; AVX512F-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512F-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512F-NEXT: # xmm0 = mem[1,0]
+; AVX512F-NEXT: callq __truncdfhf2@PLT
+; AVX512F-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512F-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
+; AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-NEXT: callq __truncdfhf2@PLT
+; AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512F-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512F-NEXT: # xmm0 = mem[1,0]
+; AVX512F-NEXT: callq __truncdfhf2@PLT
+; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512F-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512F-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512F-NEXT: vmovdqa %xmm0, (%rbx)
+; AVX512F-NEXT: addq $112, %rsp
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: retq
+;
+; AVX512-FASTLANE-LABEL: store_cvt_8f64_to_8i16:
+; AVX512-FASTLANE: # %bb.0:
+; AVX512-FASTLANE-NEXT: pushq %rbx
+; AVX512-FASTLANE-NEXT: subq $112, %rsp
+; AVX512-FASTLANE-NEXT: movq %rdi, %rbx
+; AVX512-FASTLANE-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FASTLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; AVX512-FASTLANE-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-FASTLANE-NEXT: vzeroupper
+; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT
+; AVX512-FASTLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FASTLANE-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512-FASTLANE-NEXT: # xmm0 = mem[1,0]
+; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT
+; AVX512-FASTLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FASTLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FASTLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FASTLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FASTLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; AVX512-FASTLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FASTLANE-NEXT: vzeroupper
+; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT
+; AVX512-FASTLANE-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-FASTLANE-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-FASTLANE-NEXT: # xmm0 = mem[1,0]
+; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT
+; AVX512-FASTLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-FASTLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FASTLANE-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-FASTLANE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-FASTLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FASTLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FASTLANE-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-FASTLANE-NEXT: vzeroupper
+; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT
+; AVX512-FASTLANE-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-FASTLANE-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-FASTLANE-NEXT: # xmm0 = mem[1,0]
+; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT
+; AVX512-FASTLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-FASTLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FASTLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512-FASTLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FASTLANE-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-FASTLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FASTLANE-NEXT: vzeroupper
+; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT
+; AVX512-FASTLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FASTLANE-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512-FASTLANE-NEXT: # xmm0 = mem[1,0]
+; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT
+; AVX512-FASTLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FASTLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FASTLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-FASTLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-FASTLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-FASTLANE-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX512-FASTLANE-NEXT: vmovdqa %xmm0, (%rbx)
+; AVX512-FASTLANE-NEXT: addq $112, %rsp
+; AVX512-FASTLANE-NEXT: popq %rbx
+; AVX512-FASTLANE-NEXT: retq
%1 = fptrunc <8 x double> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
store <8 x i16> %2, ptr %a1
@@ -4515,11 +4197,9 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -4565,8 +4245,7 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-NEXT: vzeroupper
@@ -4614,11 +4293,9 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -4703,11 +4380,9 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -4778,8 +4453,7 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-NEXT: vzeroupper
@@ -4802,11 +4476,9 @@ define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind {
; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncsfhf2@PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
index 13410fb5cc4b8..0b5731f530960 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
@@ -5936,8 +5936,7 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vbroadcastss 916(%rdi), %ymm0
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
@@ -6557,8 +6556,7 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vbroadcastss 916(%rdi), %ymm0
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
@@ -7178,8 +7176,7 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vbroadcastss 916(%rdi), %ymm0
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
@@ -12495,8 +12492,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
; AVX2-NEXT: # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm14
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
@@ -12509,8 +12505,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
; AVX2-NEXT: # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm13
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
@@ -12522,8 +12517,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vbroadcastss 660(%rdi), %ymm12
; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm8, %xmm8
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
@@ -12535,8 +12529,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vbroadcastss 916(%rdi), %ymm4
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
@@ -12548,8 +12541,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vbroadcastss 1172(%rdi), %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
@@ -12562,8 +12554,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-NEXT: # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
@@ -12577,8 +12568,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7]
; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
@@ -12590,8 +12580,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vbroadcastss 1940(%rdi), %ymm0
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5]
; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
@@ -12749,8 +12738,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15
@@ -12762,8 +12750,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15
@@ -12775,8 +12762,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15
@@ -13847,8 +13833,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm14
+; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
@@ -13861,8 +13846,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm13
+; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
@@ -13874,8 +13858,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vbroadcastss 660(%rdi), %ymm12
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT: vextractf128 $1, %ymm8, %xmm8
+; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
@@ -13887,8 +13870,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vbroadcastss 916(%rdi), %ymm4
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
@@ -13900,8 +13882,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vbroadcastss 1172(%rdi), %ymm3
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
@@ -13914,8 +13895,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
@@ -13929,8 +13909,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7]
; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
@@ -13942,8 +13921,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vbroadcastss 1940(%rdi), %ymm0
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5]
; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
@@ -14101,8 +14079,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15
@@ -14114,8 +14091,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15
@@ -14127,8 +14103,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15
@@ -15199,8 +15174,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
; AVX2-FCP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm14
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
@@ -15213,8 +15187,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm13
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
@@ -15226,8 +15199,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vbroadcastss 660(%rdi), %ymm12
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm8, %xmm8
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
@@ -15239,8 +15211,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vbroadcastss 916(%rdi), %ymm4
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
@@ -15252,8 +15223,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vbroadcastss 1172(%rdi), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
@@ -15266,8 +15236,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
@@ -15281,8 +15250,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7]
; AVX2-FCP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5]
; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
@@ -15294,8 +15262,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vbroadcastss 1940(%rdi), %ymm0
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5]
; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
@@ -15453,8 +15420,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm15
@@ -15466,8 +15432,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm15
@@ -15479,8 +15444,7 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 32-byte Reload
; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm15