32 changes: 32 additions & 0 deletions llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -54,6 +54,7 @@ class X86FixupInstTuningPass : public MachineFunctionPass {

private:
const X86InstrInfo *TII = nullptr;
const X86RegisterInfo *TRI = nullptr;
const X86Subtarget *ST = nullptr;
const MCSchedModel *SM = nullptr;
};
@@ -277,6 +278,18 @@ bool X86FixupInstTuningPass::processInstruction(
return true;
};

auto ProcessMOVToBLEND = [&](unsigned BlendOpc, unsigned BlendImm) -> bool {
if (OptSize || !NewOpcPreferable(BlendOpc, /*ReplaceInTie*/ false))
return false;
LLVM_DEBUG(dbgs() << "Replacing: " << MI);
{
MI.setDesc(TII->get(BlendOpc));
MI.addOperand(MachineOperand::CreateImm(BlendImm));
}
LLVM_DEBUG(dbgs() << " With: " << MI);
return true;
};

switch (Opc) {
case X86::BLENDPDrri:
return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
@@ -296,6 +309,24 @@
// TODO: Add X86::VPBLENDWYrmi handling
return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4);

case X86::VMOVSDZrr:
if (TRI->getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
TRI->getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
TRI->getEncodingValue(MI.getOperand(2).getReg()) >= 16)
return false;
[[fallthrough]];
case X86::VMOVSDrr:
return ProcessMOVToBLEND(X86::VBLENDPDrri, 0x01);

case X86::VMOVSSZrr:
if (TRI->getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
TRI->getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
TRI->getEncodingValue(MI.getOperand(2).getReg()) >= 16)
return false;
[[fallthrough]];
case X86::VMOVSSrr:
return ProcessMOVToBLEND(X86::VBLENDPSrri, 0x01);

case X86::VPERMILPDri:
return ProcessVPERMILPDri(X86::VSHUFPDrri);
case X86::VPERMILPDYri:
@@ -573,6 +604,7 @@ bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
ST = &MF.getSubtarget<X86Subtarget>();
TII = ST->getInstrInfo();
TRI = ST->getRegisterInfo();
SM = &ST->getSchedModel();

for (MachineBasicBlock &MBB : MF) {
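The new ProcessMOVToBLEND helper only swaps the opcode and appends the blend immediate, so it relies on a reg-reg scalar move and a single-lane blend producing the same merge: imm 0x01 takes just the lowest element from the second source. A minimal standalone sketch of that equivalence with SSE4.1 intrinsics (not part of the patch; values are arbitrary):

// Sketch only: shows that a scalar double move and a blend with immediate
// 0x01 produce the same merge; not part of the patch.
#include <smmintrin.h> // SSE4.1 blend intrinsics (pulls in SSE2 as well)
#include <cassert>

int main() {
  __m128d a = _mm_set_pd(2.0, 1.0);        // lanes: a[0] = 1.0, a[1] = 2.0
  __m128d b = _mm_set_pd(4.0, 3.0);        // lanes: b[0] = 3.0, b[1] = 4.0

  // movsd reg-reg merge: low lane from b, high lane kept from a.
  __m128d mov = _mm_move_sd(a, b);         // [3.0, 2.0]
  // blendpd imm 0x01: bit 0 set -> lane 0 from b, lane 1 from a.
  __m128d blend = _mm_blend_pd(a, b, 0x1); // [3.0, 2.0]

  alignas(16) double m[2], bl[2];
  _mm_store_pd(m, mov);
  _mm_store_pd(bl, blend);
  assert(m[0] == bl[0] && m[1] == bl[1]);  // identical results
  return 0;
}

The VMOVSS case is analogous with VBLENDPSrri and the same 0x01 immediate. The pass only performs the swap when the scheduler model says the blend is at least as fast (NewOpcPreferable) and never under optsize, since the mov forms are the shorter encodings.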
20 changes: 2 additions & 18 deletions llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3904,13 +3904,12 @@ def : Pat<(f64 (bitconvert VK64:$src)),

multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
X86VectorVTInfo _, Predicate prd = HasAVX512> {
let Predicates = !if (!eq (prd, HasFP16), [HasFP16], [prd, OptForSize]) in
let Predicates = [prd] in {
def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))],
_.ExeDomain>, EVEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>;
let Predicates = [prd] in {
def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
@@ -4394,7 +4393,7 @@ def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
(VMOVSDZrrkz_REV VR128X:$dst, VK1WM:$mask,
VR128X:$src1, VR128X:$src2), 0>;

let Predicates = [HasAVX512, OptForSize] in {
let Predicates = [HasAVX512] in {
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
(VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
@@ -4420,21 +4419,6 @@ let Predicates = [HasAVX512, OptForSize] in {
(v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;
}

// Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
// VMOVSS/SD. Unfortunately, loses the ability to use XMM16-31.
let Predicates = [HasAVX512, OptForSpeed] in {
def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
(v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
(v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)),
(i8 1))), sub_xmm)>;
def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
(v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
(v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)),
(i8 3))), sub_xmm)>;
}

let Predicates = [HasAVX512] in {
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
(VMOVSSZrm addr:$src)>;
57 changes: 57 additions & 0 deletions llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -9073,6 +9073,30 @@ uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
case X86::VPBLENDWYrmi:
case X86::VPBLENDWYrri:
return GetBlendDomains(8, false);
case X86::VMOVSSZrr:
// Only convert to BLEND if we are VEX compatible.
if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
return 0;
[[fallthrough]];
case X86::MOVSSrr:
case X86::VMOVSSrr:
if (Subtarget.hasSSE41())
return 0x2 | 0x8; // PackedSingle | PackedInt
return 0x2; // PackedSingle
case X86::VMOVSDZrr:
// Only convert to BLEND if we are VEX compatible.
if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16 ||
RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16 ||
RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
return 0;
[[fallthrough]];
case X86::MOVSDrr:
case X86::VMOVSDrr:
if (Subtarget.hasSSE41())
return 0x2 | 0x4 | 0x8; // PackedSingle | PackedDouble | PackedInt
return 0x4; // PackedDouble
case X86::VPANDDZ128rr:
case X86::VPANDDZ128rm:
case X86::VPANDDZ256rr:
@@ -9213,6 +9237,39 @@ bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
case X86::VPBLENDWYrmi:
case X86::VPBLENDWYrri:
return SetBlendDomain(16, true);
case X86::MOVSSrr:
case X86::VMOVSSrr:
case X86::VMOVSSZrr:
if (Domain == 3) { // PackedInt
MI.setDesc(
get(Opcode == X86::MOVSSrr ? X86::PBLENDWrri : X86::VPBLENDWrri));
MI.addOperand(MachineOperand::CreateImm(0x03));
if (Opcode == X86::VMOVSSZrr)
MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
return true;
}
return Domain == 1; // PackedSingle
case X86::MOVSDrr:
case X86::VMOVSDrr:
case X86::VMOVSDZrr:
if (Domain == 1) { // PackedSingle
MI.setDesc(
get(Opcode == X86::MOVSDrr ? X86::BLENDPSrri : X86::VBLENDPSrri));
MI.addOperand(MachineOperand::CreateImm(0x03));
if (Opcode == X86::VMOVSDZrr)
MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
return true;
} else if (Domain == 2) { // PackedDouble
return true;
} else if (Domain == 3) { // PackedInt
MI.setDesc(
get(Opcode == X86::MOVSDrr ? X86::PBLENDWrri : X86::VPBLENDWrri));
MI.addOperand(MachineOperand::CreateImm(0x0F));
if (Opcode == X86::VMOVSDZrr)
MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
return true;
}
return false;
case X86::VPANDDZ128rr:
case X86::VPANDDZ128rm:
case X86::VPANDDZ256rr:
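The execution-domain hooks cover the case where the domain-fixing pass wants to move the scalar move into another unit: getExecutionDomainCustom reports the legal domains as a bitmask (per the comments above: 0x2 PackedSingle, 0x4 PackedDouble, 0x8 PackedInt), and setExecutionDomainCustom rewrites the move into the matching blend, using PBLENDW with immediate 0x03 (two 16-bit words = the low 32 bits) or 0x0F (four words = the low 64 bits) for the integer domain. A minimal standalone sketch (not part of the patch) of why the 0x03 word mask reproduces a MOVSS merge:

// Sketch only: a MOVSS merge and a 16-bit-word blend with mask 0x03 select
// the same bits (the low 32), so the integer-domain rewrite is lossless.
#include <smmintrin.h> // SSE4.1 (_mm_blend_epi16 / PBLENDW)
#include <cassert>
#include <cstring>

int main() {
  __m128 a = _mm_set_ps(4.f, 3.f, 2.f, 1.f); // lanes: [1, 2, 3, 4]
  __m128 b = _mm_set_ps(8.f, 7.f, 6.f, 5.f); // lanes: [5, 6, 7, 8]

  // movss merge: low 32-bit lane from b, the rest from a -> [5, 2, 3, 4].
  __m128 mov = _mm_move_ss(a, b);

  // pblendw imm 0x03: words 0 and 1 (i.e. the low float lane) from b.
  __m128i blend = _mm_blend_epi16(_mm_castps_si128(a), _mm_castps_si128(b), 0x03);

  float m[4], bl[4];
  _mm_storeu_ps(m, mov);
  _mm_storeu_ps(bl, _mm_castsi128_ps(blend));
  assert(std::memcmp(m, bl, sizeof(m)) == 0); // bitwise identical
  return 0;
}

The EVEX forms (VMOVSSZrr/VMOVSDZrr) are only rewritten when all three operands encode below register 16, since the VEX blend opcodes cannot address XMM16-31; the AC_EVEX_2_VEX asm-printer flag then marks the result as an EVEX-to-VEX compression, matching the updated test expectations below.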
76 changes: 17 additions & 59 deletions llvm/lib/Target/X86/X86InstrSSE.td
@@ -209,10 +209,8 @@ multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
}

multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
X86MemOperand x86memop, string OpcodeStr,
Domain d, Predicate pred> {
X86MemOperand x86memop, string OpcodeStr, Domain d> {
// AVX
let Predicates = [UseAVX, OptForSize] in
defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
VEX, VVVV, VEX_LIG, WIG;
@@ -223,7 +221,6 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
VEX, VEX_LIG, Sched<[WriteFStore]>, WIG;
// SSE1 & 2
let Constraints = "$src1 = $dst" in {
let Predicates = [pred, NoSSE41_Or_OptForSize] in
defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
"\t{$src2, $dst|$dst, $src2}", d>;
}
@@ -268,9 +265,9 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
SSEPackedSingle, UseSSE1>, TB, XS;
SSEPackedSingle>, TB, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
SSEPackedDouble, UseSSE2>, TB, XD;
SSEPackedDouble>, TB, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
@@ -292,9 +289,7 @@ let Predicates = [UseAVX] in {
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}

let Predicates = [UseAVX, OptForSize] in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
@@ -313,22 +308,21 @@ let Predicates = [UseAVX, OptForSize] in {
(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
}

let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

let Predicates = [UseSSE2] in
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
(MOVSDrm addr:$src)>;

let Predicates = [UseSSE1] in
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
(MOVSSrm addr:$src)>;
let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
(MOVSSrm addr:$src)>;

// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
@@ -6382,61 +6376,25 @@ let Predicates = [HasAVX] in {
(VBLENDVPDYrrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. these were
// changed to use blends because blends have better throughput on sandybridge
// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [HasAVX, OptForSpeed] in {
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
(VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
// TODO: Remove these and let foldMemoryOperandCustom handle it?
let Predicates = [HasAVX] in {
def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
(VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
(VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
(VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
(VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
(VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;

// Move low f32 and clear high bits.
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
(SUBREG_TO_REG (i32 0),
(v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
(v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
(i8 1))), sub_xmm)>;
def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
(SUBREG_TO_REG (i32 0),
(v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
(i8 3))), sub_xmm)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. these were
// changed to use blends because blends have better throughput on sandybridge
// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseSSE41, OptForSpeed] in {
// With SSE41 we can use blends for these patterns.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
(BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
let Predicates = [UseSSE41] in {
def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
(BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
(BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
(BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
(BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
16 changes: 11 additions & 5 deletions llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -298,11 +298,17 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone


define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse41_blendpd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0]
; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[1]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
; AVX-LABEL: test_x86_sse41_blendpd:
; AVX: # %bb.0:
; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0]
; AVX-NEXT: # xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_sse41_blendpd:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovsd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf3,0x10,0xc0]
; AVX512VL-NEXT: # xmm0 = xmm0[0],xmm1[1]
; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
@@ -11,7 +11,7 @@ define <4 x i32> @test_mm_move_epi32(<4 x i32> %a0) nounwind {
; NOAVX512MOVZXC-LABEL: test_mm_move_epi32:
; NOAVX512MOVZXC: # %bb.0:
; NOAVX512MOVZXC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
; NOAVX512MOVZXC-NEXT: vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
; NOAVX512MOVZXC-NEXT: vmovss %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf2,0x10,0xc0]
; NOAVX512MOVZXC-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
; NOAVX512MOVZXC-NEXT: retq # encoding: [0xc3]
%res = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
1 change: 0 additions & 1 deletion llvm/test/CodeGen/X86/dpbusd.ll
@@ -345,7 +345,6 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2
; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax
; AVX512VLVNNI-NEXT: addl %edx, %eax
8 changes: 3 additions & 5 deletions llvm/test/CodeGen/X86/dpbusd_const.ll
@@ -48,7 +48,6 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
; AVX512VLVNNI: # %bb.0: # %entry
; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax
; AVX512VLVNNI-NEXT: addl %edi, %eax
@@ -130,10 +129,9 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVX512VLVNNI: # %bb.0: # %entry
; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2
; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax
; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm2, %xmm1
; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax
; AVX512VLVNNI-NEXT: addl %edi, %eax
; AVX512VLVNNI-NEXT: retq
entry: