32 changes: 32 additions & 0 deletions llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -54,6 +54,7 @@ class X86FixupInstTuningPass : public MachineFunctionPass {

private:
const X86InstrInfo *TII = nullptr;
const X86RegisterInfo *TRI = nullptr;
const X86Subtarget *ST = nullptr;
const MCSchedModel *SM = nullptr;
};
@@ -277,6 +278,18 @@ bool X86FixupInstTuningPass::processInstruction(
return true;
};

auto ProcessMOVToBLEND = [&](unsigned BlendOpc, unsigned BlendImm) -> bool {
if (OptSize || !NewOpcPreferable(BlendOpc, /*ReplaceInTie*/ false))
return false;
LLVM_DEBUG(dbgs() << "Replacing: " << MI);
{
MI.setDesc(TII->get(BlendOpc));
MI.addOperand(MachineOperand::CreateImm(BlendImm));
}
LLVM_DEBUG(dbgs() << " With: " << MI);
return true;
};

switch (Opc) {
case X86::BLENDPDrri:
return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
@@ -296,6 +309,24 @@
// TODO: Add X86::VPBLENDWYrmi handling
return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4);

case X86::VMOVSDZrr:
if (TRI->getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
TRI->getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
TRI->getEncodingValue(MI.getOperand(2).getReg()) >= 16)
return false;
[[fallthrough]];
case X86::VMOVSDrr:
return ProcessMOVToBLEND(X86::VBLENDPDrri, 0x01);

case X86::VMOVSSZrr:
if (TRI->getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
TRI->getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
TRI->getEncodingValue(MI.getOperand(2).getReg()) >= 16)
return false;
[[fallthrough]];
case X86::VMOVSSrr:
return ProcessMOVToBLEND(X86::VBLENDPSrri, 0x01);

case X86::VPERMILPDri:
return ProcessVPERMILPDri(X86::VSHUFPDrri);
case X86::VPERMILPDYri:
@@ -573,6 +604,7 @@ bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
ST = &MF.getSubtarget<X86Subtarget>();
TII = ST->getInstrInfo();
TRI = ST->getRegisterInfo();
SM = &ST->getSchedModel();

for (MachineBasicBlock &MBB : MF) {
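The new ProcessMOVToBLEND helper only swaps the opcode and appends the blend immediate, so it relies on a reg-reg scalar move and a single-lane blend producing the same merge: imm 0x01 takes just the lowest element from the second source. A minimal standalone sketch of that equivalence with SSE4.1 intrinsics (not part of the patch; values are arbitrary):

// Sketch only: shows that a scalar double move and a blend with immediate
// 0x01 produce the same merge; not part of the patch.
#include <smmintrin.h> // SSE4.1 blend intrinsics (pulls in SSE2 as well)
#include <cassert>

int main() {
  __m128d a = _mm_set_pd(2.0, 1.0);        // lanes: a[0] = 1.0, a[1] = 2.0
  __m128d b = _mm_set_pd(4.0, 3.0);        // lanes: b[0] = 3.0, b[1] = 4.0

  // movsd reg-reg merge: low lane from b, high lane kept from a.
  __m128d mov = _mm_move_sd(a, b);         // [3.0, 2.0]
  // blendpd imm 0x01: bit 0 set -> lane 0 from b, lane 1 from a.
  __m128d blend = _mm_blend_pd(a, b, 0x1); // [3.0, 2.0]

  alignas(16) double m[2], bl[2];
  _mm_store_pd(m, mov);
  _mm_store_pd(bl, blend);
  assert(m[0] == bl[0] && m[1] == bl[1]);  // identical results
  return 0;
}

The VMOVSS case is analogous with VBLENDPSrri and the same 0x01 immediate. The pass only performs the swap when the scheduler model says the blend is at least as fast (NewOpcPreferable) and never under optsize, since the mov forms are the shorter encodings.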
20 changes: 2 additions & 18 deletions llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3904,13 +3904,12 @@ def : Pat<(f64 (bitconvert VK64:$src)),

multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
X86VectorVTInfo _, Predicate prd = HasAVX512> {
let Predicates = !if (!eq (prd, HasFP16), [HasFP16], [prd, OptForSize]) in
let Predicates = [prd] in {
def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))],
_.ExeDomain>, EVEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>;
let Predicates = [prd] in {
def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
@@ -4394,7 +4393,7 @@ def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
(VMOVSDZrrkz_REV VR128X:$dst, VK1WM:$mask,
VR128X:$src1, VR128X:$src2), 0>;

let Predicates = [HasAVX512, OptForSize] in {
let Predicates = [HasAVX512] in {
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
(VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
@@ -4420,21 +4419,6 @@ let Predicates = [HasAVX512, OptForSize] in {
(v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;
}

// Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
// VMOVSS/SD. Unfortunately, loses the ability to use XMM16-31.
let Predicates = [HasAVX512, OptForSpeed] in {
def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
(v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
(v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)),
(i8 1))), sub_xmm)>;
def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
(v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
(v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)),
(i8 3))), sub_xmm)>;
}

let Predicates = [HasAVX512] in {
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
(VMOVSSZrm addr:$src)>;
57 changes: 57 additions & 0 deletions llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -9073,6 +9073,30 @@ uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
case X86::VPBLENDWYrmi:
case X86::VPBLENDWYrri:
return GetBlendDomains(8, false);
case X86::VMOVSSZrr:
// Only convert to BLEND if we are VEX compatible.
if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
return 0;
[[fallthrough]];
case X86::MOVSSrr:
case X86::VMOVSSrr:
if (Subtarget.hasSSE41())
return 0x2 | 0x8; // PackedSingle | PackedInt
return 0x2; // PackedSingle
case X86::VMOVSDZrr:
// Only convert to BLEND if we are VEX compatible.
if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
return 0;
[[fallthrough]];
case X86::MOVSDrr:
case X86::VMOVSDrr:
if (Subtarget.hasSSE41())
return 0x2 | 0x4 | 0x8; // PackedSingle | PackedDouble | PackedInt
return 0x4; // PackedDouble
case X86::VPANDDZ128rr:
case X86::VPANDDZ128rm:
case X86::VPANDDZ256rr:
@@ -9213,6 +9237,39 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
case X86::VPBLENDWYrmi:
case X86::VPBLENDWYrri:
return SetBlendDomain(16, true);
case X86::MOVSSrr:
case X86::VMOVSSrr:
case X86::VMOVSSZrr:
if (Domain == 3) { // PackedInt
MI.setDesc(
get(Opcode == X86::MOVSSrr ? X86::PBLENDWrri : X86::VPBLENDWrri));
MI.addOperand(MachineOperand::CreateImm(0x03));
if (Opcode == X86::VMOVSSZrr)
MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
return true;
}
return Domain == 1; // PackedSingle
case X86::MOVSDrr:
case X86::VMOVSDrr:
case X86::VMOVSDZrr:
if (Domain == 1) { // PackedSingle
MI.setDesc(
get(Opcode == X86::MOVSDrr ? X86::BLENDPSrri : X86::VBLENDPSrri));
MI.addOperand(MachineOperand::CreateImm(0x03));
if (Opcode == X86::VMOVSDZrr)
MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
return true;
} else if (Domain == 2) { // PackedDouble
return true;
} else if (Domain == 3) { // PackedInt
MI.setDesc(
get(Opcode == X86::MOVSDrr ? X86::PBLENDWrri : X86::VPBLENDWrri));
MI.addOperand(MachineOperand::CreateImm(0x0F));
if (Opcode == X86::VMOVSDZrr)
MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
return true;
}
return false;
case X86::VPANDDZ128rr:
case X86::VPANDDZ128rm:
case X86::VPANDDZ256rr:
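The execution-domain hooks cover the case where the domain-fixing pass wants to move the scalar move into another unit: getExecutionDomainCustom reports the legal domains as a bitmask (per the comments above: 0x2 PackedSingle, 0x4 PackedDouble, 0x8 PackedInt), and setExecutionDomainCustom rewrites the move into the matching blend, using PBLENDW with immediate 0x03 (two 16-bit words = the low 32 bits) or 0x0F (four words = the low 64 bits) for the integer domain. A minimal standalone sketch (not part of the patch) of why the 0x03 word mask reproduces a MOVSS merge:

// Sketch only: a MOVSS merge and a 16-bit-word blend with mask 0x03 select
// the same bits (the low 32), so the integer-domain rewrite is lossless.
#include <smmintrin.h> // SSE4.1 (_mm_blend_epi16 / PBLENDW)
#include <cassert>
#include <cstring>

int main() {
  __m128 a = _mm_set_ps(4.f, 3.f, 2.f, 1.f); // lanes: [1, 2, 3, 4]
  __m128 b = _mm_set_ps(8.f, 7.f, 6.f, 5.f); // lanes: [5, 6, 7, 8]

  // movss merge: low 32-bit lane from b, the rest from a -> [5, 2, 3, 4].
  __m128 mov = _mm_move_ss(a, b);

  // pblendw imm 0x03: words 0 and 1 (i.e. the low float lane) from b.
  __m128i blend = _mm_blend_epi16(_mm_castps_si128(a), _mm_castps_si128(b), 0x03);

  float m[4], bl[4];
  _mm_storeu_ps(m, mov);
  _mm_storeu_ps(bl, _mm_castsi128_ps(blend));
  assert(std::memcmp(m, bl, sizeof(m)) == 0); // bitwise identical
  return 0;
}

The EVEX forms (VMOVSSZrr/VMOVSDZrr) are only rewritten when all three operands encode below register 16, since the VEX blend opcodes cannot address XMM16-31; the AC_EVEX_2_VEX asm-printer flag then marks the result as an EVEX-to-VEX compression, matching the updated test expectations below.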
76 changes: 17 additions & 59 deletions llvm/lib/Target/X86/X86InstrSSE.td
@@ -209,10 +209,8 @@ multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
}

multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
X86MemOperand x86memop, string OpcodeStr,
Domain d, Predicate pred> {
X86MemOperand x86memop, string OpcodeStr, Domain d> {
// AVX
let Predicates = [UseAVX, OptForSize] in
defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
VEX, VVVV, VEX_LIG, WIG;
@@ -223,7 +221,6 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
VEX, VEX_LIG, Sched<[WriteFStore]>, WIG;
// SSE1 & 2
let Constraints = "$src1 = $dst" in {
let Predicates = [pred, NoSSE41_Or_OptForSize] in
defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
"\t{$src2, $dst|$dst, $src2}", d>;
}
@@ -268,9 +265,9 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
SSEPackedSingle, UseSSE1>, TB, XS;
SSEPackedSingle>, TB, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
SSEPackedDouble, UseSSE2>, TB, XD;
SSEPackedDouble>, TB, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
@@ -292,9 +289,7 @@ let Predicates = [UseAVX] in {
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}

let Predicates = [UseAVX, OptForSize] in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
@@ -313,22 +308,21 @@ let Predicates = [UseAVX, OptForSize] in {
(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
}

let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

let Predicates = [UseSSE2] in
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
(MOVSDrm addr:$src)>;

let Predicates = [UseSSE1] in
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
(MOVSSrm addr:$src)>;
let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
(MOVSSrm addr:$src)>;

// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
@@ -6382,61 +6376,25 @@ let Predicates = [HasAVX] in {
(VBLENDVPDYrrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. these were
// changed to use blends because blends have better throughput on sandybridge
// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [HasAVX, OptForSpeed] in {
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
(VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
// TODO: Remove these and let foldMemoryOperandCustom handle it?
let Predicates = [HasAVX] in {
def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
(VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
(VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
(VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
(VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
(VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;

// Move low f32 and clear high bits.
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
(SUBREG_TO_REG (i32 0),
(v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
(v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
(i8 1))), sub_xmm)>;
def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
(SUBREG_TO_REG (i32 0),
(v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
(i8 3))), sub_xmm)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. these were
// changed to use blends because blends have better throughput on sandybridge
// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseSSE41, OptForSpeed] in {
// With SSE41 we can use blends for these patterns.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
(BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
let Predicates = [UseSSE41] in {
def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
(BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
(BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
(BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
(BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
16 changes: 11 additions & 5 deletions llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -298,11 +298,17 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone


define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse41_blendpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0]
; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
; AVX-LABEL: test_x86_sse41_blendpd:
; AVX: # %bb.0:
; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0]
; AVX-NEXT: # xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_sse41_blendpd:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovsd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf3,0x10,0xc0]
; AVX512VL-NEXT: # xmm0 = xmm0[0],xmm1[1]
; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
@@ -11,7 +11,7 @@ define <4 x i32> @test_mm_move_epi32(<4 x i32> %a0) nounwind {
; NOAVX512MOVZXC-LABEL: test_mm_move_epi32:
; NOAVX512MOVZXC: # %bb.0:
; NOAVX512MOVZXC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
; NOAVX512MOVZXC-NEXT: vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
; NOAVX512MOVZXC-NEXT: vmovss %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf2,0x10,0xc0]
; NOAVX512MOVZXC-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
; NOAVX512MOVZXC-NEXT: retq # encoding: [0xc3]
%res = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
1 change: 0 additions & 1 deletion llvm/test/CodeGen/X86/dpbusd.ll
@@ -345,7 +345,6 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2
; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax
; AVX512VLVNNI-NEXT: addl %edx, %eax
8 changes: 3 additions & 5 deletions llvm/test/CodeGen/X86/dpbusd_const.ll
@@ -48,7 +48,6 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
; AVX512VLVNNI: # %bb.0: # %entry
; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax
; AVX512VLVNNI-NEXT: addl %edi, %eax
@@ -130,10 +129,9 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVX512VLVNNI: # %bb.0: # %entry
; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2
; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax
; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm2, %xmm1
; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax
; AVX512VLVNNI-NEXT: addl %edi, %eax
; AVX512VLVNNI-NEXT: retq
entry: