89 changes: 76 additions & 13 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4513,41 +4513,104 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,

const unsigned Policy = RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC;

// General case: splat the first operand and slide the other operands down one
// by one to form the vector. Alternatively, if every operand is an
// extraction from element 0 of a vector, we use the vector from the last
// extraction as the start value and slide up instead of down, so that
// (1) we can avoid the initial splat and (2) we can later turn those vslide1up
// into a vslideup of 1 and eliminate the vector-to-scalar movement, which is
// something we cannot do with vslide1down/vslidedown.
// Of course, using vslide1up/vslideup might increase register pressure,
// which is why we conservatively limit this to cases where every operand is an
// extraction from the first element.
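// For example (an illustrative sketch), to build {a, b, c, d} where
// d = extract_elt(Vd, 0), the two strategies are roughly:
//   slide-down: Vec = splat(a); vslide1down b, c, d  ->  {a, b, c, d}
//   slide-up:   Vec = Vd;       vslide1up  c, b, a   ->  {a, b, c, d}
// where the slide-up variant starts from Vd, the source vector of the last
// extraction, and therefore needs no splat.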
SmallVector<SDValue> Operands(Op->op_begin(), Op->op_end());
Contributor:

Nit, can we get away with just copying the iterator and not the storage? Does something like this work?

Suggested change (replace the SmallVector copy with):

    auto Operands = Op->op_values();

Member Author:

Good question, and I guess it's related to your llvm::reverse comment earlier: we could keep everything as iterator ranges if we could write the following code:

    for (SDValue V : (IsSlideUp ? llvm::reverse(Op->op_values()) : Op->op_values()))

but unfortunately I don't think that is possible without some iterator type adaptation, given that llvm::reverse returns a different type than op_values().
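A minimal standalone sketch of that type mismatch, in plain C++ with std::ranges standing in for the LLVM range helpers (an illustration, not the patch's code): the two arms of a conditional expression need a common type, so a range and its reversed view cannot be selected inline; copying into local storage and reversing in place is one way around it.

    #include <algorithm>
    #include <ranges>
    #include <vector>

    void visit(const std::vector<int> &Ops, bool SlideUp) {
      // Does not compile: a range and its reversed view have no common type.
      // for (int V : (SlideUp ? std::views::reverse(Ops) : Ops)) { ... }

      // Workaround analogous to the patch: copy into local storage, then
      // reverse in place when sliding up.
      std::vector<int> Operands(Ops.begin(), Ops.end());
      if (SlideUp)
        std::reverse(Operands.begin(), Operands.end());
      for (int V : Operands)
        (void)V; // visit each operand in the chosen order
    }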

SDValue EVec;
bool SlideUp = false;
auto getVSlide = [&](EVT ContainerVT, SDValue Passthru, SDValue Vec,
SDValue Offset, SDValue Mask, SDValue VL) -> SDValue {
if (SlideUp)
return getVSlideup(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset,
Mask, VL, Policy);
return getVSlidedown(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset,
Mask, VL, Policy);
};

// The reason we don't use all_of here is because we're also capturing EVec
// from the last non-undef operand. If the std::execution_policy of the
// underlying std::all_of is anything but std::sequenced_policy we might
// capture the wrong EVec.
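// E.g. for operands (extract_elt(V0, 0), undef, extract_elt(V1, 0)) we want
// EVec to end up as V1, the source of the last non-undef operand, since that
// vector seeds the slide-up sequence below.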
for (SDValue V : Operands) {
using namespace SDPatternMatch;
SlideUp = V.isUndef() || sd_match(V, m_ExtractElt(m_Value(EVec), m_Zero()));
Member Author:

Note that even if we have interleaving undef "intervals", in the worst case the number of those intervals will only be one more than the number of non-undef values.

if (!SlideUp)
break;
}

if (SlideUp) {
MVT EVecContainerVT = EVec.getSimpleValueType();
// Make sure the original vector has scalable vector type.
if (EVecContainerVT.isFixedLengthVector()) {
EVecContainerVT =
getContainerForFixedLengthVector(DAG, EVecContainerVT, Subtarget);
EVec = convertToScalableVector(EVecContainerVT, EVec, DAG, Subtarget);
}

// Adapt EVec's type into ContainerVT.
if (EVecContainerVT.getVectorMinNumElements() <
ContainerVT.getVectorMinNumElements())
EVec = DAG.getInsertSubvector(DL, DAG.getUNDEF(ContainerVT), EVec, 0);
else
EVec = DAG.getExtractSubvector(DL, ContainerVT, EVec, 0);

// Reverse the elements as we're going to slide up from the last element.
std::reverse(Operands.begin(), Operands.end());
Contributor:

Nit, does llvm::reverse work here?

Suggested change (replace the std::reverse call with):

    reverse(Operands);

Member Author:

That was actually what I thought, but unfortunately llvm::reverse returns an iterator range instead of reversing the content in place.
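To illustrate the distinction (plain C++; std::views::reverse plays the role of llvm::reverse in this sketch): a reversed view changes only the iteration order, while std::reverse mutates the container itself.

    #include <algorithm>
    #include <cassert>
    #include <ranges>
    #include <vector>

    int main() {
      std::vector<int> Operands{1, 2, 3};

      // View-style reverse: back-to-front iteration, container unchanged.
      assert(*std::views::reverse(Operands).begin() == 3);
      assert(Operands.front() == 1);

      // In-place reverse: the container is now {3, 2, 1}.
      std::reverse(Operands.begin(), Operands.end());
      assert(Operands.front() == 3);
      return 0;
    }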

}

SDValue Vec;
UndefCount = 0;
for (SDValue V : Op->ops()) {
for (SDValue V : Operands) {
if (V.isUndef()) {
UndefCount++;
continue;
}

// Start our sequence with a TA splat in the hopes that hardware is able to
// recognize there's no dependency on the prior value of our temporary
// register.
// Start our sequence with either a TA splat or extract source in the
// hopes that hardware is able to recognize there's no dependency on the
// prior value of our temporary register.
if (!Vec) {
Vec = DAG.getSplatVector(VT, DL, V);
Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
if (SlideUp) {
Vec = EVec;
} else {
Vec = DAG.getSplatVector(VT, DL, V);
Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
}

UndefCount = 0;
continue;
}

if (UndefCount) {
const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
Vec, Offset, Mask, VL, Policy);
Vec = getVSlide(ContainerVT, DAG.getUNDEF(ContainerVT), Vec, Offset, Mask,
VL);
UndefCount = 0;
}
auto OpCode =
VT.isFloatingPoint() ? RISCVISD::VFSLIDE1DOWN_VL : RISCVISD::VSLIDE1DOWN_VL;

unsigned Opcode;
if (VT.isFloatingPoint())
Opcode = SlideUp ? RISCVISD::VFSLIDE1UP_VL : RISCVISD::VFSLIDE1DOWN_VL;
else
Opcode = SlideUp ? RISCVISD::VSLIDE1UP_VL : RISCVISD::VSLIDE1DOWN_VL;

if (!VT.isFloatingPoint())
V = DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getXLenVT(), V);
Vec = DAG.getNode(OpCode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
Vec = DAG.getNode(Opcode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
V, Mask, VL);
}
if (UndefCount) {
const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
Vec, Offset, Mask, VL, Policy);
Vec = getVSlide(ContainerVT, DAG.getUNDEF(ContainerVT), Vec, Offset, Mask,
VL);
}
return convertFromScalableVector(VT, Vec, DAG, Subtarget);
}
166 changes: 166 additions & 0 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -1828,3 +1828,169 @@ define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, d
%v7 = insertelement <8 x double> %v6, double %e7, i64 7
ret <8 x double> %v7
}

define <4 x float> @buildvec_vfredusum_slideup(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
; CHECK-LABEL: buildvec_vfredusum_slideup:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vfmv.s.f v16, fa0
; CHECK-NEXT: vfredusum.vs v8, v8, v16
; CHECK-NEXT: vfredusum.vs v9, v10, v16
; CHECK-NEXT: vfredusum.vs v10, v12, v16
; CHECK-NEXT: vfmv.f.s fa5, v8
; CHECK-NEXT: vfmv.f.s fa4, v9
; CHECK-NEXT: vfmv.f.s fa3, v10
; CHECK-NEXT: vfredusum.vs v8, v14, v16
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vfslide1up.vf v9, v8, fa3
; CHECK-NEXT: vfslide1up.vf v10, v9, fa4
; CHECK-NEXT: vfslide1up.vf v8, v10, fa5
; CHECK-NEXT: ret
%247 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
%248 = insertelement <4 x float> poison, float %247, i64 0
%250 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
%251 = insertelement <4 x float> %248, float %250, i64 1
%252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
%253 = insertelement <4 x float> %251, float %252, i64 2
%254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
%255 = insertelement <4 x float> %253, float %254, i64 3
ret <4 x float> %255
}

define <8 x float> @buildvec_vfredusum_slideup_leading_undef(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
; CHECK-LABEL: buildvec_vfredusum_slideup_leading_undef:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vfmv.s.f v16, fa0
; CHECK-NEXT: vfredusum.vs v9, v8, v16
; CHECK-NEXT: vfredusum.vs v10, v10, v16
; CHECK-NEXT: vfredusum.vs v11, v12, v16
; CHECK-NEXT: vfredusum.vs v8, v14, v16
; CHECK-NEXT: vfmv.f.s fa5, v9
; CHECK-NEXT: vfmv.f.s fa4, v10
; CHECK-NEXT: vfmv.f.s fa3, v11
; CHECK-NEXT: vfslide1up.vf v10, v8, fa3
; CHECK-NEXT: vfslide1up.vf v8, v10, fa4
; CHECK-NEXT: vfslide1up.vf v10, v8, fa5
; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: ret
%252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
%253 = insertelement <8 x float> poison, float %252, i64 4
%254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
%255 = insertelement <8 x float> %253, float %254, i64 5
%256 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
%257 = insertelement <8 x float> %255, float %256, i64 6
%258 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
%259 = insertelement <8 x float> %257, float %258, i64 7
ret <8 x float> %259
}

define <8 x float> @buildvec_vfredusum_slideup_trailing_undef(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
; CHECK-LABEL: buildvec_vfredusum_slideup_trailing_undef:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vfmv.s.f v16, fa0
; CHECK-NEXT: vfredusum.vs v9, v8, v16
; CHECK-NEXT: vfredusum.vs v10, v10, v16
; CHECK-NEXT: vfredusum.vs v11, v12, v16
; CHECK-NEXT: vfredusum.vs v8, v14, v16
; CHECK-NEXT: vfmv.f.s fa5, v9
; CHECK-NEXT: vfmv.f.s fa4, v10
; CHECK-NEXT: vfmv.f.s fa3, v11
; CHECK-NEXT: vfslide1up.vf v10, v8, fa3
; CHECK-NEXT: vfslide1up.vf v12, v10, fa4
; CHECK-NEXT: vfslide1up.vf v8, v12, fa5
; CHECK-NEXT: ret
%252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
%253 = insertelement <8 x float> poison, float %252, i64 0
%254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
%255 = insertelement <8 x float> %253, float %254, i64 1
%256 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
%257 = insertelement <8 x float> %255, float %256, i64 2
%258 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
%259 = insertelement <8 x float> %257, float %258, i64 3
ret <8 x float> %259
}

; Negative test: the slideup lowering should only kick in when every
; build_vector operand is an extraction from the first element of a vector.
define <8 x float> @buildvec_vfredusum_slideup_not_extract_first(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
; CHECK-LABEL: buildvec_vfredusum_slideup_not_extract_first:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vfmv.s.f v10, fa0
; CHECK-NEXT: vfredusum.vs v8, v8, v10
; CHECK-NEXT: vfredusum.vs v9, v12, v10
; CHECK-NEXT: vfredusum.vs v10, v14, v10
; CHECK-NEXT: vfmv.f.s fa5, v9
; CHECK-NEXT: vfmv.f.s fa4, v10
; CHECK-NEXT: vrgather.vi v10, v8, 0
; CHECK-NEXT: vfslide1down.vf v8, v10, fa0
; CHECK-NEXT: vfslide1down.vf v8, v8, fa5
; CHECK-NEXT: vfslide1down.vf v8, v8, fa4
; CHECK-NEXT: vslidedown.vi v8, v8, 4
; CHECK-NEXT: ret
%252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
%253 = insertelement <8 x float> poison, float %252, i64 0
%255 = insertelement <8 x float> %253, float %start, i64 1
%256 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
%257 = insertelement <8 x float> %255, float %256, i64 2
%258 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
%259 = insertelement <8 x float> %257, float %258, i64 3
ret <8 x float> %259
}

define <8 x float> @buildvec_vfredusum_slideup_mid_undef(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
; CHECK-LABEL: buildvec_vfredusum_slideup_mid_undef:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vfmv.s.f v16, fa0
; CHECK-NEXT: vfredusum.vs v9, v8, v16
; CHECK-NEXT: vfredusum.vs v10, v10, v16
; CHECK-NEXT: vfredusum.vs v11, v12, v16
; CHECK-NEXT: vfredusum.vs v8, v14, v16
; CHECK-NEXT: vfmv.f.s fa5, v9
; CHECK-NEXT: vfmv.f.s fa4, v10
; CHECK-NEXT: vfmv.f.s fa3, v11
; CHECK-NEXT: vfslide1up.vf v10, v8, fa3
; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: vfslide1up.vf v10, v8, fa4
; CHECK-NEXT: vfslide1up.vf v8, v10, fa5
; CHECK-NEXT: ret
%252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
%253 = insertelement <8 x float> poison, float %252, i64 0
%254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
%255 = insertelement <8 x float> %253, float %254, i64 1
%256 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
%257 = insertelement <8 x float> %255, float %256, i64 6
%258 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
%259 = insertelement <8 x float> %257, float %258, i64 7
ret <8 x float> %259
}

define <4 x float> @buildvec_vfredosum_slideup(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
; CHECK-LABEL: buildvec_vfredosum_slideup:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vfmv.s.f v16, fa0
; CHECK-NEXT: vfredosum.vs v8, v8, v16
; CHECK-NEXT: vfredosum.vs v9, v10, v16
; CHECK-NEXT: vfredosum.vs v10, v12, v16
; CHECK-NEXT: vfmv.f.s fa5, v8
; CHECK-NEXT: vfmv.f.s fa4, v9
; CHECK-NEXT: vfmv.f.s fa3, v10
; CHECK-NEXT: vfredosum.vs v8, v14, v16
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vfslide1up.vf v9, v8, fa3
; CHECK-NEXT: vfslide1up.vf v10, v9, fa4
; CHECK-NEXT: vfslide1up.vf v8, v10, fa5
; CHECK-NEXT: ret
%247 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
%248 = insertelement <4 x float> poison, float %247, i64 0
%250 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
%251 = insertelement <4 x float> %248, float %250, i64 1
%252 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
%253 = insertelement <4 x float> %251, float %252, i64 2
%254 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
%255 = insertelement <4 x float> %253, float %254, i64 3
ret <4 x float> %255
}