Commit 3fc9f7b

[X86] Remove MOVSS/D -> BLENDPS/D conversions from DAG/ISEL
This patch removes as much of the MOVSS/D vs BLENDPS/D OptForSize/OptForSpeed instruction selection as possible and lets the later domain-switching and X86FixupInstTuning passes handle it. (V)MOVSS/D instructions are now created in all cases, which also avoids AVX512 targets getting stuck with VEX-encoded VBLENDPS/D instructions that restrict register usage to XMM0-15. getExecutionDomainCustom can now convert MOVSS/D to PBLENDW/BLENDPS to support domain switches, and X86FixupInstTuning can convert VMOVSS/D back to VBLENDPS/D if the scheduler model prefers it (and we are not building for OptSize). Fixes #142972
1 parent 329dfa1 commit 3fc9f7b

18 files changed: +335, -238 lines
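For context, here is a minimal LLVM IR sketch of the zero-extending low-element move this patch targets, adapted from the test_mm_move_epi32 case in llvm/test/CodeGen/X86/avx512copy-intrinsics.ll updated below; the function name is illustrative only. With this change ISel selects (V)MOVSS for it in all cases, and X86FixupInstTuning may later turn the VEX form back into VBLENDPS when the scheduler model prefers the blend and the function is not optimized for size.

; Illustrative sketch: keep lane 0 of %a0 and zero lanes 1-3 (an X86vzmovl pattern).
define <4 x i32> @move_low_and_zero_upper(<4 x i32> %a0) nounwind {
  %res = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
  ret <4 x i32> %res
}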

llvm/lib/Target/X86/X86FixupInstTuning.cpp

Lines changed: 32 additions & 0 deletions
@@ -54,6 +54,7 @@ class X86FixupInstTuningPass : public MachineFunctionPass {
 
 private:
   const X86InstrInfo *TII = nullptr;
+  const X86RegisterInfo *TRI = nullptr;
   const X86Subtarget *ST = nullptr;
   const MCSchedModel *SM = nullptr;
 };
@@ -277,6 +278,18 @@ bool X86FixupInstTuningPass::processInstruction(
     return true;
   };
 
+  auto ProcessMOVToBLEND = [&](unsigned BlendOpc, unsigned BlendImm) -> bool {
+    if (OptSize || !NewOpcPreferable(BlendOpc, /*ReplaceInTie*/ false))
+      return false;
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      MI.setDesc(TII->get(BlendOpc));
+      MI.addOperand(MachineOperand::CreateImm(BlendImm));
+    }
+    LLVM_DEBUG(dbgs() << " With: " << MI);
+    return true;
+  };
+
   switch (Opc) {
   case X86::BLENDPDrri:
     return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
@@ -296,6 +309,24 @@ bool X86FixupInstTuningPass::processInstruction(
     // TODO: Add X86::VPBLENDWYrmi handling
     return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4);
 
+  case X86::VMOVSDZrr:
+    if (TRI->getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
+        TRI->getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
+        TRI->getEncodingValue(MI.getOperand(2).getReg()) >= 16)
+      return false;
+    [[fallthrough]];
+  case X86::VMOVSDrr:
+    return ProcessMOVToBLEND(X86::VBLENDPDrri, 0x01);
+
+  case X86::VMOVSSZrr:
+    if (TRI->getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
+        TRI->getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
+        TRI->getEncodingValue(MI.getOperand(2).getReg()) >= 16)
+      return false;
+    [[fallthrough]];
+  case X86::VMOVSSrr:
+    return ProcessMOVToBLEND(X86::VBLENDPSrri, 0x01);
+
   case X86::VPERMILPDri:
     return ProcessVPERMILPDri(X86::VSHUFPDrri);
   case X86::VPERMILPDYri:
@@ -573,6 +604,7 @@ bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
   ST = &MF.getSubtarget<X86Subtarget>();
   TII = ST->getInstrInfo();
+  TRI = ST->getRegisterInfo();
   SM = &ST->getSchedModel();
 
   for (MachineBasicBlock &MBB : MF) {

llvm/lib/Target/X86/X86InstrAVX512.td

Lines changed: 2 additions & 18 deletions
@@ -3904,13 +3904,12 @@ def : Pat<(f64 (bitconvert VK64:$src)),
 
 multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
                               X86VectorVTInfo _, Predicate prd = HasAVX512> {
-  let Predicates = !if (!eq (prd, HasFP16), [HasFP16], [prd, OptForSize]) in
+  let Predicates = [prd] in {
   def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
                     (ins _.RC:$src1, _.RC:$src2),
                     !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))],
                     _.ExeDomain>, EVEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>;
-  let Predicates = [prd] in {
   def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
                       (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
                       !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
@@ -4394,7 +4393,7 @@ def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
                 (VMOVSDZrrkz_REV VR128X:$dst, VK1WM:$mask,
                                  VR128X:$src1, VR128X:$src2), 0>;
 
-let Predicates = [HasAVX512, OptForSize] in {
+let Predicates = [HasAVX512] in {
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
             (VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
@@ -4420,21 +4419,6 @@ let Predicates = [HasAVX512, OptForSize] in {
               (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;
 }
 
-// Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
-// VMOVSS/SD. Unfortunately, loses the ability to use XMM16-31.
-let Predicates = [HasAVX512, OptForSpeed] in {
-  def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
-                                 (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)),
-                                 (i8 1))), sub_xmm)>;
-  def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
-                                 (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)),
-                                 (i8 3))), sub_xmm)>;
-}
-
 let Predicates = [HasAVX512] in {
   def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
             (VMOVSSZrm addr:$src)>;

llvm/lib/Target/X86/X86InstrInfo.cpp

Lines changed: 57 additions & 0 deletions
@@ -9073,6 +9073,30 @@ uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
   case X86::VPBLENDWYrmi:
   case X86::VPBLENDWYrri:
     return GetBlendDomains(8, false);
+  case X86::VMOVSSZrr:
+    // Only convert to BLEND if we are VEX compatible.
+    if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
+        RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
+        RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
+      return 0;
+    [[fallthrough]];
+  case X86::MOVSSrr:
+  case X86::VMOVSSrr:
+    if (Subtarget.hasSSE41())
+      return 0x2 | 0x8; // PackedSingle | PackedInt
+    return 0x2; // PackedSingle
+  case X86::VMOVSDZrr:
+    // Only convert to BLEND if we are VEX compatible.
+    if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
+        RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
+        RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
+      return 0;
+    [[fallthrough]];
+  case X86::MOVSDrr:
+  case X86::VMOVSDrr:
+    if (Subtarget.hasSSE41())
+      return 0x2 | 0x4 | 0x8; // PackedSingle | PackedDouble | PackedInt
+    return 0x4; // PackedDouble
   case X86::VPANDDZ128rr:
   case X86::VPANDDZ128rm:
   case X86::VPANDDZ256rr:
@@ -9213,6 +9237,39 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
   case X86::VPBLENDWYrmi:
   case X86::VPBLENDWYrri:
     return SetBlendDomain(16, true);
+  case X86::MOVSSrr:
+  case X86::VMOVSSrr:
+  case X86::VMOVSSZrr:
+    if (Domain == 3) { // PackedInt
+      MI.setDesc(
+          get(Opcode == X86::MOVSSrr ? X86::PBLENDWrri : X86::VPBLENDWrri));
+      MI.addOperand(MachineOperand::CreateImm(0x03));
+      if (Opcode == X86::VMOVSSZrr)
+        MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
+      return true;
+    }
+    return Domain == 1; // PackedSingle
+  case X86::MOVSDrr:
+  case X86::VMOVSDrr:
+  case X86::VMOVSDZrr:
+    if (Domain == 1) { // PackedSingle
+      MI.setDesc(
+          get(Opcode == X86::MOVSDrr ? X86::BLENDPSrri : X86::VBLENDPSrri));
+      MI.addOperand(MachineOperand::CreateImm(0x03));
+      if (Opcode == X86::VMOVSDZrr)
+        MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
+      return true;
+    } else if (Domain == 2) { // PackedDouble
+      return true;
+    } else if (Domain == 3) { // PackedInt
+      MI.setDesc(
+          get(Opcode == X86::MOVSDrr ? X86::PBLENDWrri : X86::VPBLENDWrri));
+      MI.addOperand(MachineOperand::CreateImm(0x0F));
+      if (Opcode == X86::VMOVSDZrr)
+        MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
+      return true;
+    }
+    return false;
   case X86::VPANDDZ128rr:
   case X86::VPANDDZ128rm:
   case X86::VPANDDZ256rr:
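
To make the blend immediates above concrete, here is a minimal LLVM IR sketch of the merge pattern a MOVSD covers; the function name is illustrative only. The low 64-bit lane of the result comes from %b and the high lane from %a. In the PackedSingle domain those 64 bits span two 32-bit lanes, hence the BLENDPS immediate 0x03; in the PackedInt domain they span four 16-bit lanes, hence the PBLENDW immediate 0x0F used by setExecutionDomainCustom.

; Illustrative sketch of the movsd merge pattern: result = < %b[0], %a[1] >.
define <2 x double> @movsd_merge(<2 x double> %a, <2 x double> %b) {
  %r = shufflevector <2 x double> %b, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %r
}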

llvm/lib/Target/X86/X86InstrSSE.td

Lines changed: 17 additions & 59 deletions
@@ -209,10 +209,8 @@ multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
 }
 
 multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
-                      X86MemOperand x86memop, string OpcodeStr,
-                      Domain d, Predicate pred> {
+                      X86MemOperand x86memop, string OpcodeStr, Domain d> {
   // AVX
-  let Predicates = [UseAVX, OptForSize] in
   defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
                               VEX, VVVV, VEX_LIG, WIG;
@@ -223,7 +221,6 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                   VEX, VEX_LIG, Sched<[WriteFStore]>, WIG;
   // SSE1 & 2
   let Constraints = "$src1 = $dst" in {
-  let Predicates = [pred, NoSSE41_Or_OptForSize] in
   defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                             "\t{$src2, $dst|$dst, $src2}", d>;
   }
@@ -268,9 +265,9 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
 }
 
 defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
-                        SSEPackedSingle, UseSSE1>, TB, XS;
+                        SSEPackedSingle>, TB, XS;
 defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
-                        SSEPackedDouble, UseSSE2>, TB, XD;
+                        SSEPackedDouble>, TB, XD;
 
 let canFoldAsLoad = 1, isReMaterializable = 1 in {
   defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
@@ -292,9 +289,7 @@ let Predicates = [UseAVX] in {
             (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
   def : Pat<(v4f64 (X86vzload64 addr:$src)),
             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
-}
 
-let Predicates = [UseAVX, OptForSize] in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSS to the lower bits.
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
@@ -313,22 +308,21 @@ let Predicates = [UseAVX, OptForSize] in {
                 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
 }
 
-let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
-  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
-  // MOVSS to the lower bits.
-  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
-            (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-            (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
-}
-
 let Predicates = [UseSSE2] in
   def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
             (MOVSDrm addr:$src)>;
 
-let Predicates = [UseSSE1] in
-  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
-            (MOVSSrm addr:$src)>;
+let Predicates = [UseSSE1] in {
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (MOVSSrm addr:$src)>;
+
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVSS to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
+}
 
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
@@ -6382,61 +6376,25 @@ let Predicates = [HasAVX] in {
             (VBLENDVPDYrrr VR256:$src2, VR256:$src1, VR256:$mask)>;
 }
 
-// Prefer a movss or movsd over a blendps when optimizing for size. these were
-// changed to use blends because blends have better throughput on sandybridge
-// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [HasAVX, OptForSpeed] in {
-  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
-            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
-  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
-
-  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
-            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
+// TODO: Remove these and let foldMemoryOperandCustom handle it?
+let Predicates = [HasAVX] in {
   def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
             (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
 
-  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
-            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
   def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
             (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
   def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
-
-  // Move low f32 and clear high bits.
-  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
-                                 (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
-                                 (i8 1))), sub_xmm)>;
-  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
-                                 (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
-                                 (i8 3))), sub_xmm)>;
 }
 
-// Prefer a movss or movsd over a blendps when optimizing for size. these were
-// changed to use blends because blends have better throughput on sandybridge
-// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [UseSSE41, OptForSpeed] in {
-  // With SSE41 we can use blends for these patterns.
-  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
-            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
-  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
-
-  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
-            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
+let Predicates = [UseSSE41] in {
  def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
 
-  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
-            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),

llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll

Lines changed: 11 additions & 5 deletions
@@ -298,11 +298,17 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
 
 
 define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse41_blendpd:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0]
-; CHECK-NEXT:    # xmm0 = xmm0[0],xmm1[1]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+; AVX-LABEL: test_x86_sse41_blendpd:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0]
+; AVX-NEXT:    # xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_sse41_blendpd:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf3,0x10,0xc0]
+; AVX512VL-NEXT:    # xmm0 = xmm0[0],xmm1[1]
+; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }

llvm/test/CodeGen/X86/avx512copy-intrinsics.ll

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ define <4 x i32> @test_mm_move_epi32(<4 x i32> %a0) nounwind {
 ; NOAVX512MOVZXC-LABEL: test_mm_move_epi32:
 ; NOAVX512MOVZXC:       # %bb.0:
 ; NOAVX512MOVZXC-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; NOAVX512MOVZXC-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
+; NOAVX512MOVZXC-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf2,0x10,0xc0]
 ; NOAVX512MOVZXC-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
 ; NOAVX512MOVZXC-NEXT:    retq # encoding: [0xc3]
   %res = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>

llvm/test/CodeGen/X86/dpbusd.ll

Lines changed: 0 additions & 1 deletion
@@ -345,7 +345,6 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
 ; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLVNNI-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
 ; AVX512VLVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLVNNI-NEXT:    vpdpbusd %xmm1, %xmm0, %xmm2
 ; AVX512VLVNNI-NEXT:    vmovd %xmm2, %eax
 ; AVX512VLVNNI-NEXT:    addl %edx, %eax

llvm/test/CodeGen/X86/dpbusd_const.ll

Lines changed: 3 additions & 5 deletions
@@ -48,7 +48,6 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
 ; AVX512VLVNNI:       # %bb.0: # %entry
 ; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VLVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VLVNNI-NEXT:    vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
 ; AVX512VLVNNI-NEXT:    vmovd %xmm1, %eax
 ; AVX512VLVNNI-NEXT:    addl %edi, %eax
@@ -130,10 +129,9 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
 ; AVX512VLVNNI:       # %bb.0: # %entry
 ; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VLVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512VLVNNI-NEXT:    vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLVNNI-NEXT:    vpdpbusd %xmm0, %xmm1, %xmm2
-; AVX512VLVNNI-NEXT:    vmovd %xmm2, %eax
+; AVX512VLVNNI-NEXT:    vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVNNI-NEXT:    vpdpbusd %xmm0, %xmm2, %xmm1
+; AVX512VLVNNI-NEXT:    vmovd %xmm1, %eax
 ; AVX512VLVNNI-NEXT:    addl %edi, %eax
 ; AVX512VLVNNI-NEXT:    retq
 entry:
