llvm
diff --git a/‎llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp‎
Lines changed: 88 additions & 54 deletions b/‎llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp‎
Lines changed: 88 additions & 54 deletions
diff --git a/‎llvm/test/CodeGen/RISCV/double_reduct.ll‎
Lines changed: 6 additions & 12 deletions b/‎llvm/test/CodeGen/RISCV/double_reduct.ll‎
Lines changed: 6 additions & 12 deletions
diff --git a/‎llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll‎
Lines changed: 0 additions & 4 deletions b/‎llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll‎
Lines changed: 0 additions & 4 deletions
diff --git a/‎llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll‎
Lines changed: 11 additions & 15 deletions b/‎llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll‎
Lines changed: 11 additions & 15 deletions
@@ -50,6 +50,7 @@ class RISCVVLOptimizer : public MachineFunctionPass {
   StringRef getPassName() const override { return PASS_NAME; }
 
 private:
+  std::optional<const MachineOperand> getVLForUser(MachineOperand &UserOp);
   /// Returns the largest common VL MachineOperand that may be used to optimize
   /// MI. Returns std::nullopt if it failed to find a suitable VL.
   std::optional<const MachineOperand> checkUsers(MachineInstr &MI);
@@ -97,6 +98,8 @@ struct OperandInfo {
   OperandInfo(std::pair<unsigned, bool> EMUL, unsigned Log2EEW)
       : S(State::Known), EMUL(EMUL), Log2EEW(Log2EEW) {}
 
+  OperandInfo(unsigned Log2EEW) : S(State::Known), Log2EEW(Log2EEW) {}
+
   OperandInfo() : S(State::Unknown) {}
 
   bool isUnknown() const { return S == State::Unknown; }
@@ -109,6 +112,11 @@ struct OperandInfo {
            A.EMUL->second == B.EMUL->second;
   }
 
+  static bool EEWAreEqual(const OperandInfo &A, const OperandInfo &B) {
+    assert(A.isKnown() && B.isKnown() && "Both operands must be known");
+    return A.Log2EEW == B.Log2EEW;
+  }
+
   void print(raw_ostream &OS) const {
     if (isUnknown()) {
       OS << "Unknown";
@@ -720,8 +728,8 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
 
   // Vector Reduction Operations
   // Vector Single-Width Integer Reduction Instructions
-  // The Dest and VS1 only read element 0 of the vector register. Return unknown
-  // for these. VS2 has EEW=SEW and EMUL=LMUL.
+  // The Dest and VS1 only read element 0 of the vector register. Return just
+  // the EEW for these. VS2 has EEW=SEW and EMUL=LMUL.
   case RISCV::VREDAND_VS:
   case RISCV::VREDMAX_VS:
   case RISCV::VREDMAXU_VS:
@@ -732,7 +740,7 @@ static OperandInfo getOperandInfo(const MachineOperand &MO,
   case RISCV::VREDXOR_VS: {
     if (MO.getOperandNo() == 2)
       return OperandInfo(MIVLMul, MILog2SEW);
-    return {};
+    return OperandInfo(MILog2SEW);
   }
 
   default:
@@ -1047,48 +1055,64 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
   return true;
 }
 
+std::optional<const MachineOperand>
+RISCVVLOptimizer::getVLForUser(MachineOperand &UserOp) {
+  const MachineInstr &UserMI = *UserOp.getParent();
+  const MCInstrDesc &Desc = UserMI.getDesc();
+
+  // Instructions like reductions may use a vector register as a scalar
+  // register. In this case, we should treat it like a scalar register which
+  // does not impact the decision on whether to optimize VL. But if there is
+  // another user of MI and it may have VL=0, we need to be sure not to reduce
+  // the VL of MI to zero when the VLOp of UserOp is may be non-zero. The most
+  // we can reduce it to is one.
+  if (isVectorOpUsedAsScalarOp(UserOp)) {
+    [[maybe_unused]] Register R = UserOp.getReg();
+    [[maybe_unused]] const TargetRegisterClass *RC = MRI->getRegClass(R);
+    assert(RISCV::VRRegClass.hasSubClassEq(RC) &&
+           "Expect LMUL 1 register class for vector as scalar operands!");
+    LLVM_DEBUG(dbgs() << "    Used this operand as a scalar operand\n");
+    // VMV_X_S and VFMV_F_S do not have a VL opt which would cause an assert
+    // assert failure if we called getVLOpNum. Therefore, we will set the
+    // CommonVL in that case as 1, even if it could have been set to 0.
+    if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags))
+      return MachineOperand::CreateImm(1);
+
+    unsigned VLOpNum = RISCVII::getVLOpNum(Desc);
+    const MachineOperand &VLOp = UserMI.getOperand(VLOpNum);
+    if (VLOp.isReg() || (VLOp.isImm() && VLOp.getImm() != 0))
+      return MachineOperand::CreateImm(1);
+    LLVM_DEBUG(dbgs() << "    Abort because could not determine VL of vector "
+                         "operand used as scalar operand\n");
+
+    return std::nullopt;
+  }
+
+  if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) {
+    LLVM_DEBUG(dbgs() << "    Abort due to lack of VL, assume that"
+                         " use VLMAX\n");
+    return std::nullopt;
+  }
+
+  unsigned VLOpNum = RISCVII::getVLOpNum(Desc);
+  const MachineOperand &VLOp = UserMI.getOperand(VLOpNum);
+  // Looking for an immediate or a register VL that isn't X0.
+  assert((!VLOp.isReg() || VLOp.getReg() != RISCV::X0) &&
+         "Did not expect X0 VL");
+  return VLOp;
+}
+
 std::optional<const MachineOperand>
 RISCVVLOptimizer::checkUsers(MachineInstr &MI) {
   // FIXME: Avoid visiting each user for each time we visit something on the
   // worklist, combined with an extra visit from the outer loop. Restructure
   // along lines of an instcombine style worklist which integrates the outer
   // pass.
   bool CanReduceVL = true;
-  const MachineOperand *CommonVL = nullptr;
-  const MachineOperand One = MachineOperand::CreateImm(1);
+  std::optional<const MachineOperand> CommonVL;
   for (auto &UserOp : MRI->use_operands(MI.getOperand(0).getReg())) {
     const MachineInstr &UserMI = *UserOp.getParent();
     LLVM_DEBUG(dbgs() << "  Checking user: " << UserMI << "\n");
-
-    // Instructions like reductions may use a vector register as a scalar
-    // register. In this case, we should treat it like a scalar register which
-    // does not impact the decision on whether to optimize VL. But if there is
-    // another user of MI and it may have VL=0, we need to be sure not to reduce
-    // the VL of MI to zero when the VLOp of UserOp is may be non-zero. The most
-    // we can reduce it to is one.
-    if (isVectorOpUsedAsScalarOp(UserOp)) {
-      [[maybe_unused]] Register R = UserOp.getReg();
-      [[maybe_unused]] const TargetRegisterClass *RC = MRI->getRegClass(R);
-      assert(RISCV::VRRegClass.hasSubClassEq(RC) &&
-             "Expect LMUL 1 register class for vector as scalar operands!");
-      LLVM_DEBUG(dbgs() << "    Used this operand as a scalar operand\n");
-      const MCInstrDesc &Desc = UserMI.getDesc();
-      // VMV_X_S and VFMV_F_S do not have a VL opt which would cause an assert
-      // assert failure if we called getVLOpNum. Therefore, we will set the
-      // CommonVL in that case as 1, even if it could have been set to 0.
-      if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) {
-        CommonVL = &One;
-        continue;
-      }
-
-      unsigned VLOpNum = RISCVII::getVLOpNum(Desc);
-      const MachineOperand &VLOp = UserMI.getOperand(VLOpNum);
-      if (VLOp.isReg() || (VLOp.isImm() && VLOp.getImm() != 0)) {
-        CommonVL = &One;
-        continue;
-      }
-    }
-
     if (mayReadPastVL(UserMI)) {
       LLVM_DEBUG(dbgs() << "    Abort because used by unsafe instruction\n");
       CanReduceVL = false;
@@ -1102,45 +1126,55 @@ RISCVVLOptimizer::checkUsers(MachineInstr &MI) {
       break;
     }
 
-    const MCInstrDesc &Desc = UserMI.getDesc();
-    if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) {
-      LLVM_DEBUG(dbgs() << "    Abort due to lack of VL or SEW, assume that"
-                           " use VLMAX\n");
+    auto VLOp = getVLForUser(UserOp);
+    if (!VLOp) {
       CanReduceVL = false;
       break;
     }
 
-    unsigned VLOpNum = RISCVII::getVLOpNum(Desc);
-    const MachineOperand &VLOp = UserMI.getOperand(VLOpNum);
-
-    // Looking for an immediate or a register VL that isn't X0.
-    assert((!VLOp.isReg() || VLOp.getReg() != RISCV::X0) &&
-           "Did not expect X0 VL");
-
     // Use the largest VL among all the users. If we cannot determine this
     // statically, then we cannot optimize the VL.
-    if (!CommonVL || RISCV::isVLKnownLE(*CommonVL, VLOp)) {
-      CommonVL = &VLOp;
+    if (!CommonVL || RISCV::isVLKnownLE(*CommonVL, *VLOp)) {
+      CommonVL.emplace(*VLOp);
       LLVM_DEBUG(dbgs() << "    User VL is: " << VLOp << "\n");
-    } else if (!RISCV::isVLKnownLE(VLOp, *CommonVL)) {
+    } else if (!RISCV::isVLKnownLE(*VLOp, *CommonVL)) {
       LLVM_DEBUG(dbgs() << "    Abort because cannot determine a common VL\n");
       CanReduceVL = false;
       break;
     }
 
-    // The SEW and LMUL of destination and source registers need to match.
+    if (!RISCVII::hasSEWOp(UserMI.getDesc().TSFlags)) {
+      LLVM_DEBUG(dbgs() << "    Abort due to lack of SEW operand\n");
+      CanReduceVL = false;
+      break;
+    }
+
     OperandInfo ConsumerInfo = getOperandInfo(UserOp, MRI);
     OperandInfo ProducerInfo = getOperandInfo(MI.getOperand(0), MRI);
-    if (ConsumerInfo.isUnknown() || ProducerInfo.isUnknown() ||
-        !OperandInfo::EMULAndEEWAreEqual(ConsumerInfo, ProducerInfo)) {
-      LLVM_DEBUG(dbgs() << "    Abort due to incompatible or unknown "
-                           "information for EMUL or EEW.\n");
+    if (ConsumerInfo.isUnknown() || ProducerInfo.isUnknown()) {
+      LLVM_DEBUG(dbgs() << "    Abort due to unknown operand information.\n");
+      LLVM_DEBUG(dbgs() << "      ConsumerInfo is: " << ConsumerInfo << "\n");
+      LLVM_DEBUG(dbgs() << "      ProducerInfo is: " << ProducerInfo << "\n");
+      CanReduceVL = false;
+      break;
+    }
+
+    // If the operand is used as a scalar operand, then the EEW must be
+    // compatible. Otherwise, the EMUL *and* EEW must be compatible.
+    if ((isVectorOpUsedAsScalarOp(UserOp) &&
+         !OperandInfo::EEWAreEqual(ConsumerInfo, ProducerInfo)) ||
+        (!isVectorOpUsedAsScalarOp(UserOp) &&
+         !OperandInfo::EMULAndEEWAreEqual(ConsumerInfo, ProducerInfo))) {
+      LLVM_DEBUG(
+          dbgs()
+          << "    Abort due to incompatible information for EMUL or EEW.\n");
       LLVM_DEBUG(dbgs() << "      ConsumerInfo is: " << ConsumerInfo << "\n");
       LLVM_DEBUG(dbgs() << "      ProducerInfo is: " << ProducerInfo << "\n");
       CanReduceVL = false;
       break;
     }
   }
+
   return CanReduceVL && CommonVL
              ? std::make_optional<const MachineOperand>(*CommonVL)
              : std::nullopt;
 
@@ -171,9 +171,8 @@ define i32 @mul_i32(<4 x i32> %a, <4 x i32> %b) {
 define i32 @and_i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: and_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vredand.vs v8, v8, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
@@ -186,9 +185,8 @@ define i32 @and_i32(<4 x i32> %a, <4 x i32> %b) {
 define i32 @or_i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: or_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vredor.vs v8, v8, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
@@ -216,9 +214,8 @@ define i32 @xor_i32(<4 x i32> %a, <4 x i32> %b) {
 define i32 @umin_i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: umin_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vminu.vv v8, v8, v9
 ; CHECK-NEXT:    vredminu.vs v8, v8, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
@@ -231,9 +228,8 @@ define i32 @umin_i32(<4 x i32> %a, <4 x i32> %b) {
 define i32 @umax_i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: umax_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vmaxu.vv v8, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmaxu.vv v8, v8, v9
 ; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
@@ -246,9 +242,8 @@ define i32 @umax_i32(<4 x i32> %a, <4 x i32> %b) {
 define i32 @smin_i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: smin_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vmin.vv v8, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmin.vv v8, v8, v9
 ; CHECK-NEXT:    vredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
@@ -261,9 +256,8 @@ define i32 @smin_i32(<4 x i32> %a, <4 x i32> %b) {
 define i32 @smax_i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: smax_i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vmax.vv v8, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmax.vv v8, v8, v9
 ; CHECK-NEXT:    vredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
 
@@ -12,11 +12,9 @@ define i16 @ctz_v4i32(<4 x i32> %a) {
 ; RV32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vmv.v.i v8, 0
 ; RV32-NEXT:    vmerge.vim v8, v8, -1, v0
-; RV32-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV32-NEXT:    vid.v v9
 ; RV32-NEXT:    vrsub.vi v9, v9, 4
 ; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    li a1, 4
@@ -31,11 +29,9 @@ define i16 @ctz_v4i32(<4 x i32> %a) {
 ; RV64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vmv.v.i v8, 0
 ; RV64-NEXT:    vmerge.vim v8, v8, -1, v0
-; RV64-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64-NEXT:    vid.v v9
 ; RV64-NEXT:    vrsub.vi v9, v9, 4
 ; RV64-NEXT:    vand.vv v8, v8, v9
-; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV64-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    li a1, 4
 
@@ -481,7 +481,7 @@ declare <2 x i32> @llvm.masked.load.v2i32(ptr, i32, <2 x i1>, <2 x i32>)
 define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwind {
 ; RV32-SLOW-LABEL: masked_load_v2i32_align1:
 ; RV32-SLOW:       # %bb.0:
-; RV32-SLOW-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-SLOW-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-SLOW-NEXT:    vmseq.vi v8, v8, 0
 ; RV32-SLOW-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; RV32-SLOW-NEXT:    vmv.x.s a2, v8
@@ -499,7 +499,7 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
 ; RV32-SLOW-NEXT:    slli a6, a6, 24
 ; RV32-SLOW-NEXT:    or a4, a6, a5
 ; RV32-SLOW-NEXT:    or a3, a4, a3
-; RV32-SLOW-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32-SLOW-NEXT:    vmv.v.x v8, a3
 ; RV32-SLOW-NEXT:  .LBB8_2: # %else
 ; RV32-SLOW-NEXT:    andi a2, a2, 2
@@ -515,19 +515,17 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
 ; RV32-SLOW-NEXT:    slli a0, a0, 24
 ; RV32-SLOW-NEXT:    or a0, a0, a4
 ; RV32-SLOW-NEXT:    or a0, a0, a2
-; RV32-SLOW-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32-SLOW-NEXT:    vmv.s.x v9, a0
 ; RV32-SLOW-NEXT:    vslideup.vi v8, v9, 1
-; RV32-SLOW-NEXT:    vse32.v v8, (a1)
-; RV32-SLOW-NEXT:    ret
-; RV32-SLOW-NEXT:  .LBB8_4:
-; RV32-SLOW-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-SLOW-NEXT:  .LBB8_4: # %else2
+; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32-SLOW-NEXT:    vse32.v v8, (a1)
 ; RV32-SLOW-NEXT:    ret
 ;
 ; RV64-SLOW-LABEL: masked_load_v2i32_align1:
 ; RV64-SLOW:       # %bb.0:
-; RV64-SLOW-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-SLOW-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-SLOW-NEXT:    vmseq.vi v8, v8, 0
 ; RV64-SLOW-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; RV64-SLOW-NEXT:    vmv.x.s a2, v8
@@ -545,7 +543,7 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
 ; RV64-SLOW-NEXT:    slli a6, a6, 24
 ; RV64-SLOW-NEXT:    or a4, a6, a5
 ; RV64-SLOW-NEXT:    or a3, a4, a3
-; RV64-SLOW-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64-SLOW-NEXT:    vmv.v.x v8, a3
 ; RV64-SLOW-NEXT:  .LBB8_2: # %else
 ; RV64-SLOW-NEXT:    andi a2, a2, 2
@@ -561,13 +559,11 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
 ; RV64-SLOW-NEXT:    slli a0, a0, 24
 ; RV64-SLOW-NEXT:    or a0, a0, a4
 ; RV64-SLOW-NEXT:    or a0, a0, a2
-; RV64-SLOW-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64-SLOW-NEXT:    vmv.s.x v9, a0
 ; RV64-SLOW-NEXT:    vslideup.vi v8, v9, 1
-; RV64-SLOW-NEXT:    vse32.v v8, (a1)
-; RV64-SLOW-NEXT:    ret
-; RV64-SLOW-NEXT:  .LBB8_4:
-; RV64-SLOW-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-SLOW-NEXT:  .LBB8_4: # %else2
+; RV64-SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64-SLOW-NEXT:    vse32.v v8, (a1)
 ; RV64-SLOW-NEXT:    ret
 ;
@@ -589,7 +585,7 @@ declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>)
 define void @masked_store_v2i32_align2(<2 x i32> %val, ptr %a, <2 x i32> %m) nounwind {
 ; SLOW-LABEL: masked_store_v2i32_align2:
 ; SLOW:       # %bb.0:
-; SLOW-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; SLOW-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; SLOW-NEXT:    vmseq.vi v9, v9, 0
 ; SLOW-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; SLOW-NEXT:    vmv.x.s a1, v9