diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp index 33dc0a232815c..23cd5b5f8f7dd 100644 --- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp +++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp @@ -54,6 +54,7 @@ class X86FixupInstTuningPass : public MachineFunctionPass { private: const X86InstrInfo *TII = nullptr; + const X86RegisterInfo *TRI = nullptr; const X86Subtarget *ST = nullptr; const MCSchedModel *SM = nullptr; }; @@ -277,6 +278,18 @@ bool X86FixupInstTuningPass::processInstruction( return true; }; + auto ProcessMOVToBLEND = [&](unsigned BlendOpc, unsigned BlendImm) -> bool { + if (OptSize || !NewOpcPreferable(BlendOpc, /*ReplaceInTie*/ false)) + return false; + LLVM_DEBUG(dbgs() << "Replacing: " << MI); + { + MI.setDesc(TII->get(BlendOpc)); + MI.addOperand(MachineOperand::CreateImm(BlendImm)); + } + LLVM_DEBUG(dbgs() << " With: " << MI); + return true; + }; + switch (Opc) { case X86::BLENDPDrri: return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1); @@ -296,6 +309,24 @@ bool X86FixupInstTuningPass::processInstruction( // TODO: Add X86::VPBLENDWYrmi handling return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4); + case X86::VMOVSDZrr: + if (TRI->getEncodingValue(MI.getOperand(0).getReg()) >= 16 || + TRI->getEncodingValue(MI.getOperand(1).getReg()) >= 16 || + TRI->getEncodingValue(MI.getOperand(2).getReg()) >= 16) + return false; + [[fallthrough]]; + case X86::VMOVSDrr: + return ProcessMOVToBLEND(X86::VBLENDPDrri, 0x01); + + case X86::VMOVSSZrr: + if (TRI->getEncodingValue(MI.getOperand(0).getReg()) >= 16 || + TRI->getEncodingValue(MI.getOperand(1).getReg()) >= 16 || + TRI->getEncodingValue(MI.getOperand(2).getReg()) >= 16) + return false; + [[fallthrough]]; + case X86::VMOVSSrr: + return ProcessMOVToBLEND(X86::VBLENDPSrri, 0x01); + case X86::VPERMILPDri: return ProcessVPERMILPDri(X86::VSHUFPDrri); case X86::VPERMILPDYri: @@ -573,6 +604,7 @@ bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; ST = &MF.getSubtarget(); TII = ST->getInstrInfo(); + TRI = ST->getRegisterInfo(); SM = &ST->getSchedModel(); for (MachineBasicBlock &MBB : MF) { diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 0ab94cca41425..d369a2d8e9f68 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3904,13 +3904,12 @@ def : Pat<(f64 (bitconvert VK64:$src)), multiclass avx512_move_scalar { - let Predicates = !if (!eq (prd, HasFP16), [HasFP16], [prd, OptForSize]) in + let Predicates = [prd] in { def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))], _.ExeDomain>, EVEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>; - let Predicates = [prd] in { def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|", @@ -4394,7 +4393,7 @@ def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"# (VMOVSDZrrkz_REV VR128X:$dst, VK1WM:$mask, VR128X:$src1, VR128X:$src2), 0>; -let Predicates = [HasAVX512, OptForSize] in { +let Predicates = [HasAVX512] in { def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))), (VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>; def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))), @@ -4420,21 +4419,6 @@ let Predicates = [HasAVX512, OptForSize] in { (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>; } -// Use 128-bit blends for OptForSpeed since BLENDs have better throughput than -// VMOVSS/SD. Unfortunately, loses the ability to use XMM16-31. -let Predicates = [HasAVX512, OptForSpeed] in { - def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))), - (SUBREG_TO_REG (i32 0), - (v4f32 (VBLENDPSrri (v4f32 (V_SET0)), - (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)), - (i8 1))), sub_xmm)>; - def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))), - (SUBREG_TO_REG (i32 0), - (v4i32 (VPBLENDWrri (v4i32 (V_SET0)), - (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), - (i8 3))), sub_xmm)>; -} - let Predicates = [HasAVX512] in { def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), (VMOVSSZrm addr:$src)>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index abf365eedec39..d97f424f808e5 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -9073,6 +9073,30 @@ uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const { case X86::VPBLENDWYrmi: case X86::VPBLENDWYrri: return GetBlendDomains(8, false); + case X86::VMOVSSZrr: + // Only convert to BLEND if we are VEX compatible. + if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16 || + RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16 || + RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16) + return 0; + [[fallthrough]]; + case X86::MOVSSrr: + case X86::VMOVSSrr: + if (Subtarget.hasSSE41()) + return 0x2 | 0x8; // PackedSingle | PackedInt + return 0x2; // PackedSingle + case X86::VMOVSDZrr: + // Only convert to BLEND if we are VEX compatible. + if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16 || + RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16 || + RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16) + return 0; + [[fallthrough]]; + case X86::MOVSDrr: + case X86::VMOVSDrr: + if (Subtarget.hasSSE41()) + return 0x2 | 0x4 | 0x8; // PackedSingle | PackedDouble | PackedInt + return 0x4; // PackedDouble case X86::VPANDDZ128rr: case X86::VPANDDZ128rm: case X86::VPANDDZ256rr: @@ -9213,6 +9237,39 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI, case X86::VPBLENDWYrmi: case X86::VPBLENDWYrri: return SetBlendDomain(16, true); + case X86::MOVSSrr: + case X86::VMOVSSrr: + case X86::VMOVSSZrr: + if (Domain == 3) { // PackedInt + MI.setDesc( + get(Opcode == X86::MOVSSrr ? X86::PBLENDWrri : X86::VPBLENDWrri)); + MI.addOperand(MachineOperand::CreateImm(0x03)); + if (Opcode == X86::VMOVSSZrr) + MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX); + return true; + } + return Domain == 1; // PackedSingle + case X86::MOVSDrr: + case X86::VMOVSDrr: + case X86::VMOVSDZrr: + if (Domain == 1) { // PackedSingle + MI.setDesc( + get(Opcode == X86::MOVSDrr ? X86::BLENDPSrri : X86::VBLENDPSrri)); + MI.addOperand(MachineOperand::CreateImm(0x03)); + if (Opcode == X86::VMOVSDZrr) + MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX); + return true; + } else if (Domain == 2) { // PackedDouble + return true; + } else if (Domain == 3) { // PackedInt + MI.setDesc( + get(Opcode == X86::MOVSDrr ? X86::PBLENDWrri : X86::VPBLENDWrri)); + MI.addOperand(MachineOperand::CreateImm(0x0F)); + if (Opcode == X86::VMOVSDZrr) + MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX); + return true; + } + return false; case X86::VPANDDZ128rr: case X86::VPANDDZ128rm: case X86::VPANDDZ256rr: diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 1acc0cd8da205..cbff8ffd4d761 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -209,10 +209,8 @@ multiclass sse12_move_rr { + X86MemOperand x86memop, string OpcodeStr, Domain d> { // AVX - let Predicates = [UseAVX, OptForSize] in defm V#NAME : sse12_move_rr, VEX, VVVV, VEX_LIG, WIG; @@ -223,7 +221,6 @@ multiclass sse12_move, WIG; // SSE1 & 2 let Constraints = "$src1 = $dst" in { - let Predicates = [pred, NoSSE41_Or_OptForSize] in defm NAME : sse12_move_rr; } @@ -268,9 +265,9 @@ multiclass sse12_move_rm, TB, XS; + SSEPackedSingle>, TB, XS; defm MOVSD : sse12_move, TB, XD; + SSEPackedDouble>, TB, XD; let canFoldAsLoad = 1, isReMaterializable = 1 in { defm MOVSS : sse12_move_rm; def : Pat<(v4f64 (X86vzload64 addr:$src)), (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; -} -let Predicates = [UseAVX, OptForSize] in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSS to the lower bits. def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), @@ -313,22 +308,21 @@ let Predicates = [UseAVX, OptForSize] in { (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>; } -let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in { -// Move scalar to XMM zero-extended, zeroing a VR128 then do a -// MOVSS to the lower bits. -def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>; -def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>; -} - let Predicates = [UseSSE2] in def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), (MOVSDrm addr:$src)>; -let Predicates = [UseSSE1] in -def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), - (MOVSSrm addr:$src)>; +let Predicates = [UseSSE1] in { + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (MOVSSrm addr:$src)>; + + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVSS to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>; +} //===----------------------------------------------------------------------===// // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions @@ -6382,61 +6376,25 @@ let Predicates = [HasAVX] in { (VBLENDVPDYrrr VR256:$src2, VR256:$src1, VR256:$mask)>; } -// Prefer a movss or movsd over a blendps when optimizing for size. these were -// changed to use blends because blends have better throughput on sandybridge -// and haswell, but movs[s/d] are 1-2 byte shorter instructions. -let Predicates = [HasAVX, OptForSpeed] in { - def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; - def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; - - def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), - (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>; +// TODO: Remove these and let foldMemoryOperandCustom handle it? +let Predicates = [HasAVX] in { def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))), (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>; def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)), (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>; - def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), - (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>; def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))), (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>; def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)), (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>; - - // Move low f32 and clear high bits. - def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (v4f32 (VBLENDPSrri (v4f32 (V_SET0)), - (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), - (i8 1))), sub_xmm)>; - def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (v4i32 (VPBLENDWrri (v4i32 (V_SET0)), - (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), - (i8 3))), sub_xmm)>; } -// Prefer a movss or movsd over a blendps when optimizing for size. these were -// changed to use blends because blends have better throughput on sandybridge -// and haswell, but movs[s/d] are 1-2 byte shorter instructions. -let Predicates = [UseSSE41, OptForSpeed] in { - // With SSE41 we can use blends for these patterns. - def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; - def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; - - def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), - (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>; +let Predicates = [UseSSE41] in { def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))), (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>; def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)), (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>; - def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), - (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>; def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))), (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>; def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)), diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll index f0203b3b889e4..87ea43f87b2f2 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -298,11 +298,17 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) { -; CHECK-LABEL: test_x86_sse41_blendpd: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0] -; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[1] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX-LABEL: test_x86_sse41_blendpd: +; AVX: # %bb.0: +; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0] +; AVX-NEXT: # xmm0 = xmm0[0],xmm1[1] +; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_sse41_blendpd: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovsd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf3,0x10,0xc0] +; AVX512VL-NEXT: # xmm0 = xmm0[0],xmm1[1] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1] ret <2 x double> %res } diff --git a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll index a2af7df44010e..361dccf741aee 100644 --- a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll @@ -11,7 +11,7 @@ define <4 x i32> @test_mm_move_epi32(<4 x i32> %a0) nounwind { ; NOAVX512MOVZXC-LABEL: test_mm_move_epi32: ; NOAVX512MOVZXC: # %bb.0: ; NOAVX512MOVZXC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9] -; NOAVX512MOVZXC-NEXT: vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0] +; NOAVX512MOVZXC-NEXT: vmovss %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf2,0x10,0xc0] ; NOAVX512MOVZXC-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3] ; NOAVX512MOVZXC-NEXT: retq # encoding: [0xc3] %res = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll index 3aa77c3955c63..1608c421ed548 100644 --- a/llvm/test/CodeGen/X86/dpbusd.ll +++ b/llvm/test/CodeGen/X86/dpbusd.ll @@ -345,7 +345,6 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] ; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLVNNI-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 ; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax ; AVX512VLVNNI-NEXT: addl %edx, %eax diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll index 456e6e8f263aa..c32d674a84435 100644 --- a/llvm/test/CodeGen/X86/dpbusd_const.ll +++ b/llvm/test/CodeGen/X86/dpbusd_const.ll @@ -48,7 +48,6 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) { ; AVX512VLVNNI: # %bb.0: # %entry ; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax ; AVX512VLVNNI-NEXT: addl %edi, %eax @@ -130,10 +129,9 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { ; AVX512VLVNNI: # %bb.0: # %entry ; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2 -; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax +; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm2, %xmm1 +; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax ; AVX512VLVNNI-NEXT: addl %edi, %eax ; AVX512VLVNNI-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index 2e2e78a6da51e..1fca9b78352ec 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -187,12 +187,19 @@ define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind ; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; -; AVX-LABEL: test_mm_cmpge_ss: -; AVX: # %bb.0: -; AVX-NEXT: vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02] -; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1] -; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX1-LABEL: test_mm_cmpge_ss: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02] +; AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1] +; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512-LABEL: test_mm_cmpge_ss: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02] +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1] +; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] +; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2) %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> ret <4 x float> %res @@ -229,12 +236,19 @@ define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind ; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; -; AVX-LABEL: test_mm_cmpgt_ss: -; AVX: # %bb.0: -; AVX-NEXT: vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01] -; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1] -; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX1-LABEL: test_mm_cmpgt_ss: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01] +; AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1] +; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512-LABEL: test_mm_cmpgt_ss: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01] +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1] +; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] +; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1) %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> ret <4 x float> %res @@ -379,12 +393,19 @@ define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind ; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; -; AVX-LABEL: test_mm_cmpnge_ss: -; AVX: # %bb.0: -; AVX-NEXT: vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06] -; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1] -; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX1-LABEL: test_mm_cmpnge_ss: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06] +; AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1] +; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512-LABEL: test_mm_cmpnge_ss: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06] +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1] +; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] +; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6) %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> ret <4 x float> %res @@ -421,12 +442,19 @@ define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind ; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; -; AVX-LABEL: test_mm_cmpngt_ss: -; AVX: # %bb.0: -; AVX-NEXT: vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05] -; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1] -; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX1-LABEL: test_mm_cmpngt_ss: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05] +; AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1] +; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512-LABEL: test_mm_cmpngt_ss: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05] +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1] +; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] +; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5) %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> ret <4 x float> %res @@ -1601,11 +1629,17 @@ define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) { ; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; -; AVX-LABEL: test_mm_move_ss: -; AVX: # %bb.0: -; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1] -; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX1-LABEL: test_mm_move_ss: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1] +; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512-LABEL: test_mm_move_ss: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1] +; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3] +; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> ret <4 x float> %res } @@ -2227,8 +2261,8 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind { ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] -; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9] -; X86-AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0] +; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] +; X86-AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf2,0x10,0xc0] ; X86-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; @@ -2240,12 +2274,19 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind { ; X64-SSE-NEXT: movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1] ; X64-SSE-NEXT: retq # encoding: [0xc3] ; -; X64-AVX-LABEL: test_mm_set_ss: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9] -; X64-AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0] -; X64-AVX-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3] -; X64-AVX-NEXT: retq # encoding: [0xc3] +; X64-AVX1-LABEL: test_mm_set_ss: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9] +; X64-AVX1-NEXT: vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0] +; X64-AVX1-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3] +; X64-AVX1-NEXT: retq # encoding: [0xc3] +; +; X64-AVX512-LABEL: test_mm_set_ss: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] +; X64-AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf2,0x10,0xc0] +; X64-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3] +; X64-AVX512-NEXT: retq # encoding: [0xc3] %res0 = insertelement <4 x float> undef, float %a0, i32 0 %res1 = insertelement <4 x float> %res0, float 0.0, i32 1 %res2 = insertelement <4 x float> %res1, float 0.0, i32 2 diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index 3f48b22e2b9ff..79adbb5a54248 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -628,12 +628,19 @@ define <2 x double> @test_mm_cmpge_sd(<2 x double> %a0, <2 x double> %a1) nounwi ; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; -; AVX-LABEL: test_mm_cmpge_sd: -; AVX: # %bb.0: -; AVX-NEXT: vcmplesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x02] -; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1] -; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX1-LABEL: test_mm_cmpge_sd: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmplesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x02] +; AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1] +; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1] +; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512-LABEL: test_mm_cmpge_sd: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmplesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x02] +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1] +; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1] +; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 2) %ext0 = extractelement <2 x double> %cmp, i32 0 %ins0 = insertelement <2 x double> undef, double %ext0, i32 0 @@ -745,12 +752,19 @@ define <2 x double> @test_mm_cmpgt_sd(<2 x double> %a0, <2 x double> %a1) nounwi ; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; -; AVX-LABEL: test_mm_cmpgt_sd: -; AVX: # %bb.0: -; AVX-NEXT: vcmpltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x01] -; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1] -; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX1-LABEL: test_mm_cmpgt_sd: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x01] +; AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1] +; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1] +; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512-LABEL: test_mm_cmpgt_sd: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x01] +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1] +; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1] +; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 1) %ext0 = extractelement <2 x double> %cmp, i32 0 %ins0 = insertelement <2 x double> undef, double %ext0, i32 0 @@ -973,12 +987,19 @@ define <2 x double> @test_mm_cmpnge_sd(<2 x double> %a0, <2 x double> %a1) nounw ; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; -; AVX-LABEL: test_mm_cmpnge_sd: -; AVX: # %bb.0: -; AVX-NEXT: vcmpnlesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x06] -; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1] -; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX1-LABEL: test_mm_cmpnge_sd: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpnlesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x06] +; AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1] +; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1] +; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512-LABEL: test_mm_cmpnge_sd: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpnlesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x06] +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1] +; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1] +; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 6) %ext0 = extractelement <2 x double> %cmp, i32 0 %ins0 = insertelement <2 x double> undef, double %ext0, i32 0 @@ -1018,12 +1039,19 @@ define <2 x double> @test_mm_cmpngt_sd(<2 x double> %a0, <2 x double> %a1) nounw ; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; -; AVX-LABEL: test_mm_cmpngt_sd: -; AVX: # %bb.0: -; AVX-NEXT: vcmpnltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x05] -; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1] -; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX1-LABEL: test_mm_cmpngt_sd: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpnltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x05] +; AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1] +; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1] +; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512-LABEL: test_mm_cmpngt_sd: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpnltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x05] +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1] +; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1] +; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 5) %ext0 = extractelement <2 x double> %cmp, i32 0 %ins0 = insertelement <2 x double> undef, double %ext0, i32 0 @@ -3008,11 +3036,17 @@ define <2 x double> @test_mm_move_sd(<2 x double> %a0, <2 x double> %a1) nounwin ; SSE-NEXT: # xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; -; AVX-LABEL: test_mm_move_sd: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1] -; AVX-NEXT: # xmm0 = xmm1[0],xmm0[1] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; AVX1-LABEL: test_mm_move_sd: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1] +; AVX1-NEXT: # xmm0 = xmm1[0],xmm0[1] +; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512-LABEL: test_mm_move_sd: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1] +; AVX512-NEXT: # xmm0 = xmm1[0],xmm0[1] +; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %ext0 = extractelement <2 x double> %a1, i32 0 %res0 = insertelement <2 x double> undef, double %ext0, i32 0 %ext1 = extractelement <2 x double> %a0, i32 1 diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll index 413b4e79257a0..423e298b11faa 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -734,7 +734,7 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) { ; X86-AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x08] ; X86-AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0xc9] -; X86-AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1] +; X86-AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1] ; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; @@ -761,7 +761,7 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) { ; X64-AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0f] ; X64-AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0xc9] -; X64-AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1] +; X64-AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1] ; X64-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %a1 = load <4 x float>, ptr %p1 diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll index 137606b7cfeed..0d360ba7c005e 100644 --- a/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll @@ -16,11 +16,17 @@ define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) ; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3] ; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX-LABEL: test_x86_sse41_blendpd: -; AVX: ## %bb.0: -; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf3,0x10,0xc0] -; AVX-NEXT: ## xmm0 = xmm0[0],xmm1[1] -; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; AVX1-LABEL: test_x86_sse41_blendpd: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmovsd %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf3,0x10,0xc0] +; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse41_blendpd: +; AVX512: ## %bb.0: +; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf3,0x10,0xc0] +; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 6) ; <<2 x double>> [#uses=1] ret <2 x double> %res } diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll index 4f5b7ee0eaea0..3c6d220bc0ffa 100644 --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -3,8 +3,8 @@ ; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX1,X86-AVX1 ; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX512,X86-AVX512 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -show-mc-encoding | FileCheck %s --check-prefixes=SSE,X64-SSE -; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX1,X64-AVX1 -; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX512,X64-AVX512 +; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX1,X64-AVX1 +; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX512,X64-AVX512 @g16 = external global i16 @@ -361,7 +361,7 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero ; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04] -; X86-AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1] +; X86-AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1] ; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; @@ -371,11 +371,17 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind ; X64-SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] ; X64-SSE-NEXT: retq ## encoding: [0xc3] ; -; X64-AVX-LABEL: blendps_not_insertps_1: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1] -; X64-AVX-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] -; X64-AVX-NEXT: retq ## encoding: [0xc3] +; X64-AVX1-LABEL: blendps_not_insertps_1: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1] +; X64-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] +; X64-AVX1-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512-LABEL: blendps_not_insertps_1: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1] +; X64-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] +; X64-AVX512-NEXT: retq ## encoding: [0xc3] %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0 ret <4 x float> %tmp1 } @@ -438,11 +444,17 @@ define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nou ; SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] ; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; AVX-LABEL: blendps_not_insertps_2: -; AVX: ## %bb.0: -; AVX-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1] -; AVX-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; AVX1-LABEL: blendps_not_insertps_2: +; AVX1: ## %bb.0: +; AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1] +; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: blendps_not_insertps_2: +; AVX512: ## %bb.0: +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1] +; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %tmp2 = extractelement <4 x float> %t2, i32 0 %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0 ret <4 x float> %tmp1 @@ -1217,8 +1229,8 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) { ; ; AVX512-LABEL: i32_shuf_X00A: ; AVX512: ## %bb.0: -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] -; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm0 ## encoding: [0xc5,0xea,0x10,0xc0] +; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x57,0xd2] +; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xea,0x10,0xc0] ; AVX512-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3] ; AVX512-NEXT: vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9] ; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] diff --git a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll index e73d345d0fcd4..23cf271c2bb8f 100644 --- a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll +++ b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefixes=X86 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefixes=X64 -; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefixes=X86_AVX,X86_AVX1 -; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefixes=X64_AVX,X64_AVX1 -; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefixes=X86_AVX,X86_AVX512 -; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefixes=X64_AVX,X64_AVX512 +; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefixes=X86_AVX +; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefixes=X64_AVX +; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefixes=X86_AVX +; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefixes=X64_AVX define i16 @test1(float %f) nounwind { ; X86-LABEL: test1: @@ -32,57 +32,30 @@ define i16 @test1(float %f) nounwind { ; X64-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; -; X86_AVX1-LABEL: test1: -; X86_AVX1: ## %bb.0: -; X86_AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86_AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86_AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86_AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X86_AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; X86_AVX1-NEXT: vminss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86_AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm0 -; X86_AVX1-NEXT: vcvttss2si %xmm0, %eax -; X86_AVX1-NEXT: ## kill: def $ax killed $ax killed $eax -; X86_AVX1-NEXT: retl -; -; X64_AVX1-LABEL: test1: -; X64_AVX1: ## %bb.0: -; X64_AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64_AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64_AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64_AVX1-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; X64_AVX1-NEXT: vminss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64_AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm0 -; X64_AVX1-NEXT: vcvttss2si %xmm0, %eax -; X64_AVX1-NEXT: ## kill: def $ax killed $ax killed $eax -; X64_AVX1-NEXT: retq -; -; X86_AVX512-LABEL: test1: -; X86_AVX512: ## %bb.0: -; X86_AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86_AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86_AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X86_AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; X86_AVX512-NEXT: vminss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X86_AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 -; X86_AVX512-NEXT: vcvttss2si %xmm0, %eax -; X86_AVX512-NEXT: ## kill: def $ax killed $ax killed $eax -; X86_AVX512-NEXT: retl +; X86_AVX-LABEL: test1: +; X86_AVX: ## %bb.0: +; X86_AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86_AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86_AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86_AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; X86_AVX-NEXT: vminss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86_AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; X86_AVX-NEXT: vcvttss2si %xmm0, %eax +; X86_AVX-NEXT: ## kill: def $ax killed $ax killed $eax +; X86_AVX-NEXT: retl ; -; X64_AVX512-LABEL: test1: -; X64_AVX512: ## %bb.0: -; X64_AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64_AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64_AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; X64_AVX512-NEXT: vminss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64_AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0 -; X64_AVX512-NEXT: vcvttss2si %xmm0, %eax -; X64_AVX512-NEXT: ## kill: def $ax killed $ax killed $eax -; X64_AVX512-NEXT: retq +; X64_AVX-LABEL: test1: +; X64_AVX: ## %bb.0: +; X64_AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64_AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64_AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; X64_AVX-NEXT: vminss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64_AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; X64_AVX-NEXT: vcvttss2si %xmm0, %eax +; X64_AVX-NEXT: ## kill: def $ax killed $ax killed $eax +; X64_AVX-NEXT: retq %tmp = insertelement <4 x float> undef, float %f, i32 0 ; <<4 x float>> [#uses=1] %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1] %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1] diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll index 01159d4135d8e..00b60893fc783 100644 --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -3272,7 +3272,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX512-NEXT: addq $72, %rsp ; AVX512-NEXT: retq @@ -3404,7 +3404,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX512-NEXT: addq $72, %rsp ; AVX512-NEXT: retq @@ -4107,9 +4107,9 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind { ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX512-NEXT: vmovdqa %xmm0, (%rbx) +; AVX512-NEXT: vmovaps %xmm0, (%rbx) ; AVX512-NEXT: addq $64, %rsp ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll index 983ae594e3ab1..73537b4c0db76 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -113,7 +113,6 @@ define i64 @test_v4i64_v4i16(<4 x i64> %a0) { ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll index d99b200385585..890246467ef86 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll @@ -240,7 +240,6 @@ define i32 @test_v4i32(<4 x i8> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll index aed4e023e340c..437df521c3117 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll @@ -1047,7 +1047,6 @@ define i8 @test_v4i8(<4 x i8> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax