
Commit 6c3db64

mshockwave, topperc, and lukel97 authored
[RISCV] Use slideup to lower build_vector when all operands are (extract_element X, 0) (#154450)

The general lowering of build_vector starts by splatting the first operand and then sliding the other operands down one by one. However, if every operand is an extract_element from element 0 of some vector, we can instead use the original _vector_ (the source of the extraction) of the last build_vector operand as the start value and slide the other operands up, in reverse order, one by one. This avoids the initial splat and eliminates the later vector-to-scalar movement, which is something we cannot do with vslidedown/vslide1down.

---------

Co-authored-by: Craig Topper <[email protected]>
Co-authored-by: Luke Lau <[email protected]>
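For intuition, here is a minimal C++ sketch (illustrative only, not part of the commit) contrasting the two orders for a four-element result whose values come from element 0 of source vectors. slide1down and slide1up below are simplified scalar models of vslide1down.vf/vslide1up.vf with VL equal to the vector length, and all names and values are made up:

#include <cassert>
#include <cstddef>
#include <vector>

// Simplified model of vslide1down: every element moves one slot toward index
// 0 and the scalar is inserted at the highest index.
static std::vector<int> slide1down(std::vector<int> v, int scalar) {
  for (std::size_t i = 0; i + 1 < v.size(); ++i)
    v[i] = v[i + 1];
  v.back() = scalar;
  return v;
}

// Simplified model of vslide1up: every element moves one slot toward the
// highest index and the scalar is inserted at index 0.
static std::vector<int> slide1up(std::vector<int> v, int scalar) {
  for (std::size_t i = v.size() - 1; i > 0; --i)
    v[i] = v[i - 1];
  v[0] = scalar;
  return v;
}

int main() {
  // Illustrative values: si is element 0 of source vector Vi (e.g. the
  // result vector of a reduction).
  int s0 = 10, s1 = 11, s2 = 12, s3 = 13;
  std::vector<int> V3 = {s3, 7, 7, 7}; // source vector of the last extract

  // Slide-down lowering: splat the first operand, then slide the rest down.
  std::vector<int> down = {s0, s0, s0, s0};
  down = slide1down(down, s1);
  down = slide1down(down, s2);
  down = slide1down(down, s3);

  // Slide-up lowering: start from V3 itself (no splat, no extract of s3)
  // and slide the remaining operands up in reverse order.
  std::vector<int> up = V3;
  up = slide1up(up, s2);
  up = slide1up(up, s1);
  up = slide1up(up, s0);

  assert(down == std::vector<int>({s0, s1, s2, s3}));
  assert(up == std::vector<int>({s0, s1, s2, s3}));
  return 0;
}

Both orders build the same vector; the slide-up order simply gets its start value from the last extraction's source vector, which is what lets the lowering drop the splat and the scalar move of the last operand.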
1 parent 3e5f49a commit 6c3db64

File tree

4 files changed: +444 -17 lines changed


llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 76 additions & 13 deletions
@@ -4518,41 +4518,104 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
 
   const unsigned Policy = RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC;
 
+  // General case: splat the first operand and slide other operands down one
+  // by one to form a vector. Alternatively, if every operand is an
+  // extraction from element 0 of a vector, we use that vector from the last
+  // extraction as the start value and slide up instead of slide down. Such that
+  // (1) we can avoid the initial splat (2) we can turn those vslide1up into
+  // vslideup of 1 later and eliminate the vector to scalar movement, which is
+  // something we cannot do with vslide1down/vslidedown.
+  // Of course, using vslide1up/vslideup might increase the register pressure,
+  // and that's why we conservatively limit to cases where every operand is an
+  // extraction from the first element.
+  SmallVector<SDValue> Operands(Op->op_begin(), Op->op_end());
+  SDValue EVec;
+  bool SlideUp = false;
+  auto getVSlide = [&](EVT ContainerVT, SDValue Passthru, SDValue Vec,
+                       SDValue Offset, SDValue Mask, SDValue VL) -> SDValue {
+    if (SlideUp)
+      return getVSlideup(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset,
+                         Mask, VL, Policy);
+    return getVSlidedown(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset,
+                         Mask, VL, Policy);
+  };
+
+  // The reason we don't use all_of here is because we're also capturing EVec
+  // from the last non-undef operand. If the std::execution_policy of the
+  // underlying std::all_of is anything but std::sequenced_policy we might
+  // capture the wrong EVec.
+  for (SDValue V : Operands) {
+    using namespace SDPatternMatch;
+    SlideUp = V.isUndef() || sd_match(V, m_ExtractElt(m_Value(EVec), m_Zero()));
+    if (!SlideUp)
+      break;
+  }
+
+  if (SlideUp) {
+    MVT EVecContainerVT = EVec.getSimpleValueType();
+    // Make sure the original vector has scalable vector type.
+    if (EVecContainerVT.isFixedLengthVector()) {
+      EVecContainerVT =
+          getContainerForFixedLengthVector(DAG, EVecContainerVT, Subtarget);
+      EVec = convertToScalableVector(EVecContainerVT, EVec, DAG, Subtarget);
+    }
+
+    // Adapt EVec's type into ContainerVT.
+    if (EVecContainerVT.getVectorMinNumElements() <
+        ContainerVT.getVectorMinNumElements())
+      EVec = DAG.getInsertSubvector(DL, DAG.getUNDEF(ContainerVT), EVec, 0);
+    else
+      EVec = DAG.getExtractSubvector(DL, ContainerVT, EVec, 0);
+
+    // Reverse the elements as we're going to slide up from the last element.
+    std::reverse(Operands.begin(), Operands.end());
+  }
+
   SDValue Vec;
   UndefCount = 0;
-  for (SDValue V : Op->ops()) {
+  for (SDValue V : Operands) {
     if (V.isUndef()) {
       UndefCount++;
       continue;
     }
 
-    // Start our sequence with a TA splat in the hopes that hardware is able to
-    // recognize there's no dependency on the prior value of our temporary
-    // register.
+    // Start our sequence with either a TA splat or extract source in the
+    // hopes that hardware is able to recognize there's no dependency on the
+    // prior value of our temporary register.
     if (!Vec) {
-      Vec = DAG.getSplatVector(VT, DL, V);
-      Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+      if (SlideUp) {
+        Vec = EVec;
+      } else {
+        Vec = DAG.getSplatVector(VT, DL, V);
+        Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+      }
+
       UndefCount = 0;
       continue;
     }
 
     if (UndefCount) {
       const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
-      Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
-                          Vec, Offset, Mask, VL, Policy);
+      Vec = getVSlide(ContainerVT, DAG.getUNDEF(ContainerVT), Vec, Offset, Mask,
+                      VL);
       UndefCount = 0;
     }
-    auto OpCode =
-      VT.isFloatingPoint() ? RISCVISD::VFSLIDE1DOWN_VL : RISCVISD::VSLIDE1DOWN_VL;
+
+    unsigned Opcode;
+    if (VT.isFloatingPoint())
+      Opcode = SlideUp ? RISCVISD::VFSLIDE1UP_VL : RISCVISD::VFSLIDE1DOWN_VL;
+    else
+      Opcode = SlideUp ? RISCVISD::VSLIDE1UP_VL : RISCVISD::VSLIDE1DOWN_VL;
+
     if (!VT.isFloatingPoint())
       V = DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getXLenVT(), V);
-    Vec = DAG.getNode(OpCode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
+    Vec = DAG.getNode(Opcode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
                       V, Mask, VL);
   }
   if (UndefCount) {
     const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
-    Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
-                        Vec, Offset, Mask, VL, Policy);
+    Vec = getVSlide(ContainerVT, DAG.getUNDEF(ContainerVT), Vec, Offset, Mask,
+                    VL);
   }
   return convertFromScalableVector(VT, Vec, DAG, Subtarget);
 }
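Note how runs of undef operands are still collapsed into a single slide by UndefCount, now routed through the getVSlide helper so it becomes a vslideup when SlideUp is set; that is what produces the final vslideup.vi by 4 in the leading-undef test below. A small scalar model of that plain slide, under the same illustrative assumptions as the earlier sketch (the slideup helper and the values are hypothetical, not the actual getVSlideup implementation):

#include <cassert>
#include <cstddef>
#include <vector>

// Simplified model of an unmasked vslideup.vi by n with an undef passthru:
// element i of the source lands at index i + n; the low n elements keep
// whatever the (undef) passthru held, shown here as the placeholder -1.
static std::vector<int> slideup(const std::vector<int> &v, std::size_t n) {
  std::vector<int> out(v.size(), -1);
  for (std::size_t i = 0; i + n < v.size(); ++i)
    out[i + n] = v[i];
  return out;
}

int main() {
  // After the vslide1up steps, the four defined values sit in the low half;
  // one slide by the undef count (4) moves them into elements 4..7.
  std::vector<int> low = {14, 15, 16, 17, -1, -1, -1, -1};
  assert(slideup(low, 4) == std::vector<int>({-1, -1, -1, -1, 14, 15, 16, 17}));
  return 0;
}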

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll

Lines changed: 166 additions & 0 deletions
@@ -1828,3 +1828,169 @@ define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, d
   %v7 = insertelement <8 x double> %v6, double %e7, i64 7
   ret <8 x double> %v7
 }
+
+define <4 x float> @buildvec_vfredusum_slideup(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
+; CHECK-LABEL: buildvec_vfredusum_slideup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfmv.s.f v16, fa0
+; CHECK-NEXT:    vfredusum.vs v8, v8, v16
+; CHECK-NEXT:    vfredusum.vs v9, v10, v16
+; CHECK-NEXT:    vfredusum.vs v10, v12, v16
+; CHECK-NEXT:    vfmv.f.s fa5, v8
+; CHECK-NEXT:    vfmv.f.s fa4, v9
+; CHECK-NEXT:    vfmv.f.s fa3, v10
+; CHECK-NEXT:    vfredusum.vs v8, v14, v16
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vfslide1up.vf v9, v8, fa3
+; CHECK-NEXT:    vfslide1up.vf v10, v9, fa4
+; CHECK-NEXT:    vfslide1up.vf v8, v10, fa5
+; CHECK-NEXT:    ret
+  %247 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
+  %248 = insertelement <4 x float> poison, float %247, i64 0
+  %250 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
+  %251 = insertelement <4 x float> %248, float %250, i64 1
+  %252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
+  %253 = insertelement <4 x float> %251, float %252, i64 2
+  %254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
+  %255 = insertelement <4 x float> %253, float %254, i64 3
+  ret <4 x float> %255
+}
+
+define <8 x float> @buildvec_vfredusum_slideup_leading_undef(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
+; CHECK-LABEL: buildvec_vfredusum_slideup_leading_undef:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfmv.s.f v16, fa0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v16
+; CHECK-NEXT:    vfredusum.vs v10, v10, v16
+; CHECK-NEXT:    vfredusum.vs v11, v12, v16
+; CHECK-NEXT:    vfredusum.vs v8, v14, v16
+; CHECK-NEXT:    vfmv.f.s fa5, v9
+; CHECK-NEXT:    vfmv.f.s fa4, v10
+; CHECK-NEXT:    vfmv.f.s fa3, v11
+; CHECK-NEXT:    vfslide1up.vf v10, v8, fa3
+; CHECK-NEXT:    vfslide1up.vf v8, v10, fa4
+; CHECK-NEXT:    vfslide1up.vf v10, v8, fa5
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    ret
+  %252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
+  %253 = insertelement <8 x float> poison, float %252, i64 4
+  %254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
+  %255 = insertelement <8 x float> %253, float %254, i64 5
+  %256 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
+  %257 = insertelement <8 x float> %255, float %256, i64 6
+  %258 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
+  %259 = insertelement <8 x float> %257, float %258, i64 7
+  ret <8 x float> %259
+}
+
+define <8 x float> @buildvec_vfredusum_slideup_trailing_undef(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
+; CHECK-LABEL: buildvec_vfredusum_slideup_trailing_undef:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfmv.s.f v16, fa0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v16
+; CHECK-NEXT:    vfredusum.vs v10, v10, v16
+; CHECK-NEXT:    vfredusum.vs v11, v12, v16
+; CHECK-NEXT:    vfredusum.vs v8, v14, v16
+; CHECK-NEXT:    vfmv.f.s fa5, v9
+; CHECK-NEXT:    vfmv.f.s fa4, v10
+; CHECK-NEXT:    vfmv.f.s fa3, v11
+; CHECK-NEXT:    vfslide1up.vf v10, v8, fa3
+; CHECK-NEXT:    vfslide1up.vf v12, v10, fa4
+; CHECK-NEXT:    vfslide1up.vf v8, v12, fa5
+; CHECK-NEXT:    ret
+  %252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
+  %253 = insertelement <8 x float> poison, float %252, i64 0
+  %254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
+  %255 = insertelement <8 x float> %253, float %254, i64 1
+  %256 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
+  %257 = insertelement <8 x float> %255, float %256, i64 2
+  %258 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
+  %259 = insertelement <8 x float> %257, float %258, i64 3
+  ret <8 x float> %259
+}
+
+; Negative test case checking if we generate slideup only when all build_vec operands are extraction from the first vector element.
+define <8 x float> @buildvec_vfredusum_slideup_not_extract_first(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
+; CHECK-LABEL: buildvec_vfredusum_slideup_not_extract_first:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfmv.s.f v10, fa0
+; CHECK-NEXT:    vfredusum.vs v8, v8, v10
+; CHECK-NEXT:    vfredusum.vs v9, v12, v10
+; CHECK-NEXT:    vfredusum.vs v10, v14, v10
+; CHECK-NEXT:    vfmv.f.s fa5, v9
+; CHECK-NEXT:    vfmv.f.s fa4, v10
+; CHECK-NEXT:    vrgather.vi v10, v8, 0
+; CHECK-NEXT:    vfslide1down.vf v8, v10, fa0
+; CHECK-NEXT:    vfslide1down.vf v8, v8, fa5
+; CHECK-NEXT:    vfslide1down.vf v8, v8, fa4
+; CHECK-NEXT:    vslidedown.vi v8, v8, 4
+; CHECK-NEXT:    ret
+  %252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
+  %253 = insertelement <8 x float> poison, float %252, i64 0
+  %255 = insertelement <8 x float> %253, float %start, i64 1
+  %256 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
+  %257 = insertelement <8 x float> %255, float %256, i64 2
+  %258 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
+  %259 = insertelement <8 x float> %257, float %258, i64 3
+  ret <8 x float> %259
+}
+
+define <8 x float> @buildvec_vfredusum_slideup_mid_undef(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
+; CHECK-LABEL: buildvec_vfredusum_slideup_mid_undef:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfmv.s.f v16, fa0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v16
+; CHECK-NEXT:    vfredusum.vs v10, v10, v16
+; CHECK-NEXT:    vfredusum.vs v11, v12, v16
+; CHECK-NEXT:    vfredusum.vs v8, v14, v16
+; CHECK-NEXT:    vfmv.f.s fa5, v9
+; CHECK-NEXT:    vfmv.f.s fa4, v10
+; CHECK-NEXT:    vfmv.f.s fa3, v11
+; CHECK-NEXT:    vfslide1up.vf v10, v8, fa3
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vfslide1up.vf v10, v8, fa4
+; CHECK-NEXT:    vfslide1up.vf v8, v10, fa5
+; CHECK-NEXT:    ret
+  %252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
+  %253 = insertelement <8 x float> poison, float %252, i64 0
+  %254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
+  %255 = insertelement <8 x float> %253, float %254, i64 1
+  %256 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
+  %257 = insertelement <8 x float> %255, float %256, i64 6
+  %258 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
+  %259 = insertelement <8 x float> %257, float %258, i64 7
+  ret <8 x float> %259
+}
+
+define <4 x float> @buildvec_vfredosum_slideup(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
+; CHECK-LABEL: buildvec_vfredosum_slideup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfmv.s.f v16, fa0
+; CHECK-NEXT:    vfredosum.vs v8, v8, v16
+; CHECK-NEXT:    vfredosum.vs v9, v10, v16
+; CHECK-NEXT:    vfredosum.vs v10, v12, v16
+; CHECK-NEXT:    vfmv.f.s fa5, v8
+; CHECK-NEXT:    vfmv.f.s fa4, v9
+; CHECK-NEXT:    vfmv.f.s fa3, v10
+; CHECK-NEXT:    vfredosum.vs v8, v14, v16
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vfslide1up.vf v9, v8, fa3
+; CHECK-NEXT:    vfslide1up.vf v10, v9, fa4
+; CHECK-NEXT:    vfslide1up.vf v8, v10, fa5
+; CHECK-NEXT:    ret
+  %247 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
+  %248 = insertelement <4 x float> poison, float %247, i64 0
+  %250 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
+  %251 = insertelement <4 x float> %248, float %250, i64 1
+  %252 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
+  %253 = insertelement <4 x float> %251, float %252, i64 2
+  %254 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
+  %255 = insertelement <4 x float> %253, float %254, i64 3
+  ret <4 x float> %255
+}
