Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions llvm/include/llvm/CodeGen/SelectionDAGNodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -1885,6 +1885,12 @@ LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V);
/// If \p V is not an extracted subvector, it is returned as-is.
LLVM_ABI SDValue peekThroughExtractSubvectors(SDValue V);

/// Recursively peek through INSERT_VECTOR_ELT nodes, returning the source
/// vector operand of \p V, as long as \p V is an INSERT_VECTOR_ELT operation
/// that does not insert into any of the demanded vector elts.
LLVM_ABI SDValue peekThroughInsertVectorElt(SDValue V,
const APInt &DemandedElts);

/// Return the non-truncated source operand of \p V if it exists.
/// If \p V is not a truncation, it is returned as-is.
LLVM_ABI SDValue peekThroughTruncates(SDValue V);
Expand Down
38 changes: 31 additions & 7 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16832,6 +16832,7 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
bool AllowMultipleMaybePoisonOperands =
N0.getOpcode() == ISD::SELECT_CC || N0.getOpcode() == ISD::SETCC ||
N0.getOpcode() == ISD::BUILD_VECTOR ||
N0.getOpcode() == ISD::INSERT_SUBVECTOR ||
N0.getOpcode() == ISD::BUILD_PAIR ||
N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
N0.getOpcode() == ISD::CONCAT_VECTORS || N0.getOpcode() == ISD::FMUL;
Expand Down Expand Up @@ -23303,6 +23304,13 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
return InVec;

// Remove insert of UNDEF/POISON elements.
if (InVal.isUndef()) {
if (InVal.getOpcode() == ISD::POISON || InVec.getOpcode() == ISD::UNDEF)
return InVec;
return DAG.getFreeze(InVec);
}

if (!IndexC) {
// If this is variable insert to undef vector, it might be better to splat:
// inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
Expand Down Expand Up @@ -27792,18 +27800,34 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
SDValue N2 = N->getOperand(2);
uint64_t InsIdx = N->getConstantOperandVal(2);

// If inserting an UNDEF, just return the original vector.
if (N1.isUndef())
return N0;
// Remove insert of UNDEF/POISON.
if (N1.isUndef()) {
if (N1.getOpcode() == ISD::POISON || N0.getOpcode() == ISD::UNDEF)
return N0;
return DAG.getFreeze(N0);
}

// If this is an insert of an extracted vector into an undef vector, we can
// just use the input to the extract if the types match, and can simplify
// If this is an insert of an extracted vector into an undef/poison vector, we
// can just use the input to the extract if the types match, and can simplify
// in some cases even if they don't.
if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N1.getOperand(1) == N2) {
EVT N1VT = N1.getValueType();
EVT SrcVT = N1.getOperand(0).getValueType();
if (SrcVT == VT)
return N1.getOperand(0);
if (SrcVT == VT) {
// Need to ensure that result isn't more poisonous if skipping both the
// extract+insert.
if (N0.getOpcode() == ISD::POISON)
return N1.getOperand(0);
if (VT.isFixedLengthVector() && N1VT.isFixedLengthVector()) {
unsigned SubVecNumElts = N1VT.getVectorNumElements();
APInt EltMask = APInt::getBitsSet(VT.getVectorNumElements(), InsIdx,
InsIdx + SubVecNumElts);
if (DAG.isGuaranteedNotToBePoison(N1.getOperand(0), ~EltMask))
return N1.getOperand(0);
} else if (DAG.isGuaranteedNotToBePoison(N1.getOperand(0)))
return N1.getOperand(0);
}
// TODO: To remove the zero check, need to adjust the offset to
// a multiple of the new src type.
if (isNullConstant(N2)) {
Expand Down
87 changes: 75 additions & 12 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5525,8 +5525,9 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
APInt InVecDemandedElts = DemandedElts;
InVecDemandedElts.clearBit(IndexC->getZExtValue());
if (!!InVecDemandedElts &&
!isGuaranteedNotToBeUndefOrPoison(InVec, InVecDemandedElts,
PoisonOnly, Depth + 1))
!isGuaranteedNotToBeUndefOrPoison(
peekThroughInsertVectorElt(InVec, InVecDemandedElts),
InVecDemandedElts, PoisonOnly, Depth + 1))
return false;
return true;
}
Expand Down Expand Up @@ -8243,23 +8244,42 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF, except
// for scalable vectors where we will generate appropriate code to
// deal with out-of-bounds cases correctly.
if (N3C && N1.getValueType().isFixedLengthVector() &&
N3C->getZExtValue() >= N1.getValueType().getVectorNumElements())
if (N3C && VT.isFixedLengthVector() &&
N3C->getZExtValue() >= VT.getVectorNumElements())
return getUNDEF(VT);

// Undefined index can be assumed out-of-bounds, so that's UNDEF too.
if (N3.isUndef())
return getUNDEF(VT);

// If the inserted element is an UNDEF, just use the input vector.
if (N2.isUndef())
// If inserting poison, just use the input vector.
if (N2.getOpcode() == ISD::POISON)
return N1;

// Inserting undef into undef/poison is still undef.
if (N2.getOpcode() == ISD::UNDEF && N1.isUndef())
return getUNDEF(VT);

// If the inserted element is an UNDEF, just use the input vector.
// But not if skipping the insert could make the result more poisonous.
if (N2.isUndef()) {
if (N3C && VT.isFixedLengthVector()) {
APInt EltMask =
APInt::getOneBitSet(VT.getVectorNumElements(), N3C->getZExtValue());
if (isGuaranteedNotToBePoison(N1, EltMask))
return N1;
} else if (isGuaranteedNotToBePoison(N1))
return N1;
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How many regressions do we get it we drop all this from getNode() and just leave the combine to do it?

We've often ended up doing more then basic canonicalisation in getNode() just to avoid one specific test regression.....

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can remove the things releated to isGuaranteedNotToBePoison from INSERT_VECTOR_ELT and INSERT_SUBVECTOR in getNode without any diffs, if doing it after the changes I propose in "part 2" and "part 3". If doing it directly in this patch there is around 20 lit tests impacted.

break;
}
case ISD::INSERT_SUBVECTOR: {
// Inserting undef into undef is still undef.
if (N1.isUndef() && N2.isUndef())
// If inserting poison, just use the input vector.
if (N2.getOpcode() == ISD::POISON)
return N1;

// Inserting undef into undef/poison is still undef.
if (N2.getOpcode() == ISD::UNDEF && N1.isUndef())
return getUNDEF(VT);

EVT N2VT = N2.getValueType();
Expand Down Expand Up @@ -8288,11 +8308,37 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
if (VT == N2VT)
return N2;

// If this is an insert of an extracted vector into an undef vector, we
// can just use the input to the extract.
// If this is an insert of an extracted vector into an undef/poison vector,
// we can just use the input to the extract. But not if skipping the
// extract+insert could make the result more poisonous.
if (N1.isUndef() && N2.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT)
return N2.getOperand(0);
N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT) {
if (N1.getOpcode() == ISD::POISON)
return N2.getOperand(0);
if (VT.isFixedLengthVector() && N2VT.isFixedLengthVector()) {
unsigned LoBit = N3->getAsZExtVal();
unsigned HiBit = LoBit + N2VT.getVectorNumElements();
APInt EltMask =
APInt::getBitsSet(VT.getVectorNumElements(), LoBit, HiBit);
if (isGuaranteedNotToBePoison(N2.getOperand(0), ~EltMask))
return N2.getOperand(0);
} else if (isGuaranteedNotToBePoison(N2.getOperand(0)))
return N2.getOperand(0);
}

// If the inserted subvector is UNDEF, just use the input vector.
// But not if skipping the insert could make the result more poisonous.
if (N2.isUndef()) {
if (VT.isFixedLengthVector()) {
unsigned LoBit = N3->getAsZExtVal();
unsigned HiBit = LoBit + N2VT.getVectorNumElements();
APInt EltMask =
APInt::getBitsSet(VT.getVectorNumElements(), LoBit, HiBit);
if (isGuaranteedNotToBePoison(N1, EltMask))
return N1;
} else if (isGuaranteedNotToBePoison(N1))
return N1;
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

again - are we doing too much in getNode at this stage?

break;
}
case ISD::BITCAST:
Expand Down Expand Up @@ -12770,6 +12816,23 @@ SDValue llvm::peekThroughExtractSubvectors(SDValue V) {
return V;
}

SDValue llvm::peekThroughInsertVectorElt(SDValue V, const APInt &DemandedElts) {
  // Walk up a chain of INSERT_VECTOR_ELT nodes, skipping every insert whose
  // constant, in-bounds index falls outside the demanded elements. Stop at
  // the first node that is not such an insert and return it.
  for (;;) {
    if (V.getOpcode() != ISD::INSERT_VECTOR_ELT)
      return V;
    // Only a compile-time constant index can be proven un-demanded.
    auto *CIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
    if (!CIdx)
      return V;
    SDValue Src = V.getOperand(0);
    EVT SrcVT = Src.getValueType();
    // Bail on scalable vectors, out-of-bounds indices, or inserts that touch
    // a demanded element.
    if (!SrcVT.isFixedLengthVector() ||
        CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements()) ||
        DemandedElts[CIdx->getZExtValue()])
      return V;
    V = Src;
  }
}

SDValue llvm::peekThroughTruncates(SDValue V) {
while (V.getOpcode() == ISD::TRUNCATE)
V = V.getOperand(0);
Expand Down
8 changes: 6 additions & 2 deletions llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3439,8 +3439,8 @@ bool TargetLowering::SimplifyDemandedVectorElts(
break;
}
case ISD::INSERT_SUBVECTOR: {
// Demand any elements from the subvector and the remainder from the src its
// inserted into.
// Demand any elements from the subvector and the remainder from the src it
// is inserted into.
SDValue Src = Op.getOperand(0);
SDValue Sub = Op.getOperand(1);
uint64_t Idx = Op.getConstantOperandVal(2);
Expand All @@ -3449,6 +3449,10 @@ bool TargetLowering::SimplifyDemandedVectorElts(
APInt DemandedSrcElts = DemandedElts;
DemandedSrcElts.clearBits(Idx, Idx + NumSubElts);

// If none of the sub operand elements are demanded, bypass the insert.
if (!DemandedSubElts)
return TLO.CombineTo(Op, Src);

APInt SubUndef, SubZero;
if (SimplifyDemandedVectorElts(Sub, DemandedSubElts, SubUndef, SubZero, TLO,
Depth + 1))
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15402,7 +15402,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
if (!isIntOrFPConstant(V))
if (!isIntOrFPConstant(V) && !V.isUndef())
// Note that type legalization likely mucked about with the VT of the
// source operand, so we may have to convert it here before inserting.
Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Val, V, LaneIdx);
Expand Down
12 changes: 5 additions & 7 deletions llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -94,16 +94,14 @@ define i32 @combine_undef_add_8xi32(i32 %a, i32 %b, i32 %c, i32 %d) local_unname
; CHECK-LABEL: combine_undef_add_8xi32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: dup v0.4s, w8
; CHECK-NEXT: mov v1.s[1], w1
; CHECK-NEXT: uhadd v0.4h, v0.4h, v0.4h
; CHECK-NEXT: mov v1.s[2], w2
; CHECK-NEXT: mov v1.s[3], w3
; CHECK-NEXT: xtn v2.4h, v1.4s
; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: uhadd v1.4h, v2.4h, v1.4h
; CHECK-NEXT: mov v1.d[1], v0.d[0]
; CHECK-NEXT: uaddlv s0, v1.8h
; CHECK-NEXT: uzp2 v2.8h, v1.8h, v0.8h
; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
; CHECK-NEXT: uhadd v0.8h, v0.8h, v2.8h
; CHECK-NEXT: uaddlv s0, v0.8h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%a1 = insertelement <8 x i32> poison, i32 %a, i32 0
Expand Down
6 changes: 5 additions & 1 deletion llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1198,11 +1198,15 @@ define void @masked_gather_passthru(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ptrue p2.d, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x2]
; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1]
; CHECK-NEXT: punpklo p2.h, p1.b
; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: ld1w { z0.d }, p2/z, [z0.d]
; CHECK-NEXT: and z1.s, z1.s, #0x1
; CHECK-NEXT: cmpne p1.s, p1/z, z1.s, #0
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x2]
; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
Expand Down
90 changes: 60 additions & 30 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
Original file line number Diff line number Diff line change
@@ -1,27 +1,42 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=1 | FileCheck %s --check-prefixes=NO_FOLDING,NO_FOLDING1
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=2 | FileCheck %s --check-prefixes=NO_FOLDING,NO_FOLDING2
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFH
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFHMIN
; Check that the default value enables the web folding and
; that it is bigger than 3.
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING

define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a, <2 x half> %b, <2 x half> %b2) {
; NO_FOLDING-LABEL: vfwmul_v2f116_multiple_users:
; NO_FOLDING: # %bb.0:
; NO_FOLDING-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; NO_FOLDING-NEXT: vfwcvt.f.f.v v11, v8
; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9
; NO_FOLDING-NEXT: vfwcvt.f.f.v v9, v10
; NO_FOLDING-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; NO_FOLDING-NEXT: vfmul.vv v10, v11, v8
; NO_FOLDING-NEXT: vfadd.vv v11, v11, v9
; NO_FOLDING-NEXT: vfsub.vv v8, v8, v9
; NO_FOLDING-NEXT: vse32.v v10, (a0)
; NO_FOLDING-NEXT: vse32.v v11, (a1)
; NO_FOLDING-NEXT: vse32.v v8, (a2)
; NO_FOLDING-NEXT: ret
; NO_FOLDING1-LABEL: vfwmul_v2f116_multiple_users:
; NO_FOLDING1: # %bb.0:
; NO_FOLDING1-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; NO_FOLDING1-NEXT: vfwcvt.f.f.v v11, v8
; NO_FOLDING1-NEXT: vfwcvt.f.f.v v8, v9
; NO_FOLDING1-NEXT: vfwcvt.f.f.v v9, v10
; NO_FOLDING1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; NO_FOLDING1-NEXT: vfmul.vv v10, v11, v8
; NO_FOLDING1-NEXT: vfadd.vv v11, v11, v9
; NO_FOLDING1-NEXT: vfsub.vv v8, v8, v9
; NO_FOLDING1-NEXT: vse32.v v10, (a0)
; NO_FOLDING1-NEXT: vse32.v v11, (a1)
; NO_FOLDING1-NEXT: vse32.v v8, (a2)
; NO_FOLDING1-NEXT: ret
;
; NO_FOLDING2-LABEL: vfwmul_v2f116_multiple_users:
; NO_FOLDING2: # %bb.0:
; NO_FOLDING2-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; NO_FOLDING2-NEXT: vfwcvt.f.f.v v11, v8
; NO_FOLDING2-NEXT: vfwcvt.f.f.v v8, v9
; NO_FOLDING2-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; NO_FOLDING2-NEXT: vfmul.vv v9, v11, v8
; NO_FOLDING2-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; NO_FOLDING2-NEXT: vfwadd.wv v11, v11, v10
; NO_FOLDING2-NEXT: vfwsub.wv v8, v8, v10
; NO_FOLDING2-NEXT: vse32.v v9, (a0)
; NO_FOLDING2-NEXT: vse32.v v11, (a1)
; NO_FOLDING2-NEXT: vse32.v v8, (a2)
; NO_FOLDING2-NEXT: ret
;
; ZVFH-LABEL: vfwmul_v2f116_multiple_users:
; ZVFH: # %bb.0:
Expand Down Expand Up @@ -61,20 +76,35 @@ define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a,
}

define void @vfwmul_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a, <2 x float> %b, <2 x float> %b2) {
; NO_FOLDING-LABEL: vfwmul_v2f32_multiple_users:
; NO_FOLDING: # %bb.0:
; NO_FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; NO_FOLDING-NEXT: vfwcvt.f.f.v v11, v8
; NO_FOLDING-NEXT: vfwcvt.f.f.v v8, v9
; NO_FOLDING-NEXT: vfwcvt.f.f.v v9, v10
; NO_FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; NO_FOLDING-NEXT: vfmul.vv v10, v11, v8
; NO_FOLDING-NEXT: vfadd.vv v11, v11, v9
; NO_FOLDING-NEXT: vfsub.vv v8, v8, v9
; NO_FOLDING-NEXT: vse64.v v10, (a0)
; NO_FOLDING-NEXT: vse64.v v11, (a1)
; NO_FOLDING-NEXT: vse64.v v8, (a2)
; NO_FOLDING-NEXT: ret
; NO_FOLDING1-LABEL: vfwmul_v2f32_multiple_users:
; NO_FOLDING1: # %bb.0:
; NO_FOLDING1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; NO_FOLDING1-NEXT: vfwcvt.f.f.v v11, v8
; NO_FOLDING1-NEXT: vfwcvt.f.f.v v8, v9
; NO_FOLDING1-NEXT: vfwcvt.f.f.v v9, v10
; NO_FOLDING1-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; NO_FOLDING1-NEXT: vfmul.vv v10, v11, v8
; NO_FOLDING1-NEXT: vfadd.vv v11, v11, v9
; NO_FOLDING1-NEXT: vfsub.vv v8, v8, v9
; NO_FOLDING1-NEXT: vse64.v v10, (a0)
; NO_FOLDING1-NEXT: vse64.v v11, (a1)
; NO_FOLDING1-NEXT: vse64.v v8, (a2)
; NO_FOLDING1-NEXT: ret
;
; NO_FOLDING2-LABEL: vfwmul_v2f32_multiple_users:
; NO_FOLDING2: # %bb.0:
; NO_FOLDING2-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; NO_FOLDING2-NEXT: vfwcvt.f.f.v v11, v8
; NO_FOLDING2-NEXT: vfwcvt.f.f.v v8, v9
; NO_FOLDING2-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; NO_FOLDING2-NEXT: vfmul.vv v9, v11, v8
; NO_FOLDING2-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; NO_FOLDING2-NEXT: vfwadd.wv v11, v11, v10
; NO_FOLDING2-NEXT: vfwsub.wv v8, v8, v10
; NO_FOLDING2-NEXT: vse64.v v9, (a0)
; NO_FOLDING2-NEXT: vse64.v v11, (a1)
; NO_FOLDING2-NEXT: vse64.v v8, (a2)
; NO_FOLDING2-NEXT: ret
;
; FOLDING-LABEL: vfwmul_v2f32_multiple_users:
; FOLDING: # %bb.0:
Expand Down
Loading
Loading