Merged
30 changes: 30 additions & 0 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21151,6 +21151,36 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return N->getOperand(0);
break;
}
case RISCVISD::VSLIDE1UP_VL:
case RISCVISD::VFSLIDE1UP_VL: {
using namespace SDPatternMatch;
SDValue SrcVec;
SDLoc DL(N);
MVT VT = N->getSimpleValueType(0);
// If the scalar we're sliding in was extracted from the first element of a
// vector, we can use that vector as the passthru in a normal slideup of 1.
// This saves us an extract_element instruction (i.e. vfmv.f.s, vmv.x.s).
if (N->getOperand(0).isUndef() &&
sd_match(N->getOperand(2),
m_AnyOf(m_ExtractElt(m_Value(SrcVec), m_Zero()),
Contributor:
Is it possible to extract from a SrcVec whose element size is smaller? IIRC extractelt also extends the result if needed.

Can we get an extending extractelt from LLVM IR via an extractelement + zext?

Member Author:
Does extractelt have an implicit zext/sext like loads do? If it's extracting from, say, an i32 vector to an i64 scalar, I think an additional zext SDNode will be applied to that i64 before any use.

Member Author:
> say, an i32 vector to an i64 scalar, I think an additional zext SDNode will be applied to that i64 before any use.

Well, I was only half right about that: for an extractelt generated from IR or any other normal path, that's the case. But for an extractelt created during legalization, no additional zext/sext would be added, because they're meant to be lowered into vmv.x.s, which sign-extends its result.
I therefore added a check to make sure the element type of SrcVec is the same as that of the vslide1up.
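For illustration only, a minimal sketch that is not part of this PR (the function name is made up): coming from IR, the extension shows up as a separate zext node on the extracted scalar rather than as an extending extractelt.

define i64 @extract_then_zext(<4 x i32> %v) {
  %e = extractelement <4 x i32> %v, i64 0   ; extract of an i32 element (extract_vector_elt in the DAG)
  %z = zext i32 %e to i64                   ; the extension is its own node, separate from the extract
  ret i64 %z
}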

m_Node(RISCVISD::VMV_X_S, m_Value(SrcVec))))) {
MVT SrcVecVT = SrcVec.getSimpleValueType();
// Adapt the value type of the source vector.
if (SrcVecVT.isFixedLengthVector()) {
SrcVecVT = getContainerForFixedLengthVector(SrcVecVT);
SrcVec = convertToScalableVector(SrcVecVT, SrcVec, DAG, Subtarget);
}
if (SrcVecVT.getVectorMinNumElements() < VT.getVectorMinNumElements())
SrcVec = DAG.getInsertSubvector(DL, DAG.getUNDEF(VT), SrcVec, 0);
else
SrcVec = DAG.getExtractSubvector(DL, VT, SrcVec, 0);

return getVSlideup(DAG, Subtarget, DL, VT, SrcVec, N->getOperand(1),
DAG.getConstant(1, DL, XLenVT), N->getOperand(3),
N->getOperand(4));
}
break;
}
}

return SDValue();
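For context, a rough hypothetical IR sketch (not taken from this PR, merely mirroring the tests below) of the shape this combine targets: each scalar being slid in is the element-0 result of a vector reduction, so the extract feeding the slide can be folded into a tail-undisturbed vslideup of 1.

declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)

define <2 x float> @two_reductions(float %start, <8 x float> %a, <8 x float> %b) {
  %r0 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %a)
  %r1 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %b)
  ; Both scalars start out in element 0 of a reduction result, which is the
  ; extract_vector_elt pattern the combine above looks for.
  %v0 = insertelement <2 x float> poison, float %r0, i64 0
  %v1 = insertelement <2 x float> %v0, float %r1, i64 1
  ret <2 x float> %v1
}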
85 changes: 38 additions & 47 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -1837,14 +1837,11 @@ define <4 x float> @buildvec_vfredusum_slideup(float %start, <8 x float> %arg1,
; CHECK-NEXT: vfredusum.vs v8, v8, v16
; CHECK-NEXT: vfredusum.vs v9, v10, v16
; CHECK-NEXT: vfredusum.vs v10, v12, v16
; CHECK-NEXT: vfmv.f.s fa5, v8
; CHECK-NEXT: vfmv.f.s fa4, v9
; CHECK-NEXT: vfmv.f.s fa3, v10
; CHECK-NEXT: vfredusum.vs v8, v14, v16
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vfslide1up.vf v9, v8, fa3
; CHECK-NEXT: vfslide1up.vf v10, v9, fa4
; CHECK-NEXT: vfslide1up.vf v8, v10, fa5
; CHECK-NEXT: vfredusum.vs v11, v14, v16
; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-NEXT: vslideup.vi v10, v11, 1
; CHECK-NEXT: vslideup.vi v9, v10, 1
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
%247 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
%248 = insertelement <4 x float> poison, float %247, i64 0
@@ -1861,18 +1858,17 @@ define <8 x float> @buildvec_vfredusum_slideup_leading_undef(float %start, <8 x
; CHECK-LABEL: buildvec_vfredusum_slideup_leading_undef:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vfmv.s.f v16, fa0
; CHECK-NEXT: vfredusum.vs v9, v8, v16
; CHECK-NEXT: vfredusum.vs v10, v10, v16
; CHECK-NEXT: vfredusum.vs v11, v12, v16
; CHECK-NEXT: vfredusum.vs v8, v14, v16
; CHECK-NEXT: vfmv.f.s fa5, v9
; CHECK-NEXT: vfmv.f.s fa4, v10
; CHECK-NEXT: vfmv.f.s fa3, v11
; CHECK-NEXT: vfslide1up.vf v10, v8, fa3
; CHECK-NEXT: vfslide1up.vf v8, v10, fa4
; CHECK-NEXT: vfslide1up.vf v10, v8, fa5
; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: vfmv.s.f v17, fa0
; CHECK-NEXT: vfredusum.vs v16, v8, v17
; CHECK-NEXT: vfredusum.vs v8, v10, v17
; CHECK-NEXT: vfredusum.vs v10, v12, v17
; CHECK-NEXT: vfredusum.vs v12, v14, v17
; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v10, v12, 1
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: vslideup.vi v16, v8, 1
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v16, 4
; CHECK-NEXT: ret
%252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
%253 = insertelement <8 x float> poison, float %252, i64 4
@@ -1890,16 +1886,14 @@ define <8 x float> @buildvec_vfredusum_slideup_trailing_undef(float %start, <8 x
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vfmv.s.f v16, fa0
; CHECK-NEXT: vfredusum.vs v9, v8, v16
; CHECK-NEXT: vfredusum.vs v8, v8, v16
; CHECK-NEXT: vfredusum.vs v10, v10, v16
; CHECK-NEXT: vfredusum.vs v11, v12, v16
; CHECK-NEXT: vfredusum.vs v8, v14, v16
; CHECK-NEXT: vfmv.f.s fa5, v9
; CHECK-NEXT: vfmv.f.s fa4, v10
; CHECK-NEXT: vfmv.f.s fa3, v11
; CHECK-NEXT: vfslide1up.vf v10, v8, fa3
; CHECK-NEXT: vfslide1up.vf v12, v10, fa4
; CHECK-NEXT: vfslide1up.vf v8, v12, fa5
; CHECK-NEXT: vfredusum.vs v12, v12, v16
; CHECK-NEXT: vfredusum.vs v14, v14, v16
; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v12, v14, 1
; CHECK-NEXT: vslideup.vi v10, v12, 1
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
%252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
%253 = insertelement <8 x float> poison, float %252, i64 0
@@ -1944,17 +1938,17 @@ define <8 x float> @buildvec_vfredusum_slideup_mid_undef(float %start, <8 x floa
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vfmv.s.f v16, fa0
; CHECK-NEXT: vfredusum.vs v9, v8, v16
; CHECK-NEXT: vfredusum.vs v8, v8, v16
; CHECK-NEXT: vfredusum.vs v10, v10, v16
; CHECK-NEXT: vfredusum.vs v11, v12, v16
; CHECK-NEXT: vfredusum.vs v8, v14, v16
; CHECK-NEXT: vfmv.f.s fa5, v9
; CHECK-NEXT: vfmv.f.s fa4, v10
; CHECK-NEXT: vfmv.f.s fa3, v11
; CHECK-NEXT: vfslide1up.vf v10, v8, fa3
; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: vfslide1up.vf v10, v8, fa4
; CHECK-NEXT: vfslide1up.vf v8, v10, fa5
; CHECK-NEXT: vfredusum.vs v12, v12, v16
; CHECK-NEXT: vfredusum.vs v14, v14, v16
; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v12, v14, 1
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vi v14, v12, 4
; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma
; CHECK-NEXT: vslideup.vi v10, v14, 1
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
%252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
%253 = insertelement <8 x float> poison, float %252, i64 0
@@ -1975,14 +1969,11 @@ define <4 x float> @buildvec_vfredosum_slideup(float %start, <8 x float> %arg1,
; CHECK-NEXT: vfredosum.vs v8, v8, v16
; CHECK-NEXT: vfredosum.vs v9, v10, v16
; CHECK-NEXT: vfredosum.vs v10, v12, v16
; CHECK-NEXT: vfmv.f.s fa5, v8
; CHECK-NEXT: vfmv.f.s fa4, v9
; CHECK-NEXT: vfmv.f.s fa3, v10
; CHECK-NEXT: vfredosum.vs v8, v14, v16
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vfslide1up.vf v9, v8, fa3
; CHECK-NEXT: vfslide1up.vf v10, v9, fa4
; CHECK-NEXT: vfslide1up.vf v8, v10, fa5
; CHECK-NEXT: vfredosum.vs v11, v14, v16
; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; CHECK-NEXT: vslideup.vi v10, v11, 1
; CHECK-NEXT: vslideup.vi v9, v10, 1
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
%247 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
%248 = insertelement <4 x float> poison, float %247, i64 0
78 changes: 30 additions & 48 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -3424,14 +3424,11 @@ define <4 x i32> @buildvec_vredsum_slideup(<8 x i32> %arg0, <8 x i32> %arg1, <8
; RV32-NEXT: vredsum.vs v8, v8, v16
; RV32-NEXT: vredsum.vs v9, v10, v16
; RV32-NEXT: vredsum.vs v10, v12, v16
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vmv.x.s a1, v9
; RV32-NEXT: vmv.x.s a2, v10
; RV32-NEXT: vredsum.vs v8, v14, v16
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vslide1up.vx v9, v8, a2
; RV32-NEXT: vslide1up.vx v10, v9, a1
; RV32-NEXT: vslide1up.vx v8, v10, a0
; RV32-NEXT: vredsum.vs v11, v14, v16
; RV32-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; RV32-NEXT: vslideup.vi v10, v11, 1
; RV32-NEXT: vslideup.vi v9, v10, 1
; RV32-NEXT: vslideup.vi v8, v9, 1
; RV32-NEXT: ret
;
; RV64V-ONLY-LABEL: buildvec_vredsum_slideup:
@@ -3441,14 +3438,11 @@ define <4 x i32> @buildvec_vredsum_slideup(<8 x i32> %arg0, <8 x i32> %arg1, <8
; RV64V-ONLY-NEXT: vredsum.vs v8, v8, v16
; RV64V-ONLY-NEXT: vredsum.vs v9, v10, v16
; RV64V-ONLY-NEXT: vredsum.vs v10, v12, v16
; RV64V-ONLY-NEXT: vmv.x.s a0, v8
; RV64V-ONLY-NEXT: vmv.x.s a1, v9
; RV64V-ONLY-NEXT: vmv.x.s a2, v10
; RV64V-ONLY-NEXT: vredsum.vs v8, v14, v16
; RV64V-ONLY-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64V-ONLY-NEXT: vslide1up.vx v9, v8, a2
; RV64V-ONLY-NEXT: vslide1up.vx v10, v9, a1
; RV64V-ONLY-NEXT: vslide1up.vx v8, v10, a0
; RV64V-ONLY-NEXT: vredsum.vs v11, v14, v16
; RV64V-ONLY-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; RV64V-ONLY-NEXT: vslideup.vi v10, v11, 1
; RV64V-ONLY-NEXT: vslideup.vi v9, v10, 1
; RV64V-ONLY-NEXT: vslideup.vi v8, v9, 1
; RV64V-ONLY-NEXT: ret
;
; RVA22U64-LABEL: buildvec_vredsum_slideup:
@@ -3498,14 +3492,11 @@ define <4 x i32> @buildvec_vredsum_slideup(<8 x i32> %arg0, <8 x i32> %arg1, <8
; RV64ZVE32-NEXT: vredsum.vs v8, v8, v16
; RV64ZVE32-NEXT: vredsum.vs v9, v10, v16
; RV64ZVE32-NEXT: vredsum.vs v10, v12, v16
; RV64ZVE32-NEXT: vmv.x.s a0, v8
; RV64ZVE32-NEXT: vmv.x.s a1, v9
; RV64ZVE32-NEXT: vmv.x.s a2, v10
; RV64ZVE32-NEXT: vredsum.vs v8, v14, v16
; RV64ZVE32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64ZVE32-NEXT: vslide1up.vx v9, v8, a2
; RV64ZVE32-NEXT: vslide1up.vx v10, v9, a1
; RV64ZVE32-NEXT: vslide1up.vx v8, v10, a0
; RV64ZVE32-NEXT: vredsum.vs v11, v14, v16
; RV64ZVE32-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; RV64ZVE32-NEXT: vslideup.vi v10, v11, 1
; RV64ZVE32-NEXT: vslideup.vi v9, v10, 1
; RV64ZVE32-NEXT: vslideup.vi v8, v9, 1
; RV64ZVE32-NEXT: ret
%247 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %arg0)
%248 = insertelement <4 x i32> poison, i32 %247, i64 0
@@ -3525,14 +3516,11 @@ define <4 x i32> @buildvec_vredmax_slideup(<8 x i32> %arg0, <8 x i32> %arg1, <8
; RV32-NEXT: vredmaxu.vs v8, v8, v8
; RV32-NEXT: vredmaxu.vs v9, v10, v10
; RV32-NEXT: vredmaxu.vs v10, v12, v12
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: vmv.x.s a1, v9
; RV32-NEXT: vmv.x.s a2, v10
; RV32-NEXT: vredmaxu.vs v8, v14, v14
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vslide1up.vx v9, v8, a2
; RV32-NEXT: vslide1up.vx v10, v9, a1
; RV32-NEXT: vslide1up.vx v8, v10, a0
; RV32-NEXT: vredmaxu.vs v11, v14, v14
; RV32-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; RV32-NEXT: vslideup.vi v10, v11, 1
; RV32-NEXT: vslideup.vi v9, v10, 1
; RV32-NEXT: vslideup.vi v8, v9, 1
; RV32-NEXT: ret
;
; RV64V-ONLY-LABEL: buildvec_vredmax_slideup:
@@ -3541,14 +3529,11 @@ define <4 x i32> @buildvec_vredmax_slideup(<8 x i32> %arg0, <8 x i32> %arg1, <8
; RV64V-ONLY-NEXT: vredmaxu.vs v8, v8, v8
; RV64V-ONLY-NEXT: vredmaxu.vs v9, v10, v10
; RV64V-ONLY-NEXT: vredmaxu.vs v10, v12, v12
; RV64V-ONLY-NEXT: vmv.x.s a0, v8
; RV64V-ONLY-NEXT: vmv.x.s a1, v9
; RV64V-ONLY-NEXT: vmv.x.s a2, v10
; RV64V-ONLY-NEXT: vredmaxu.vs v8, v14, v14
; RV64V-ONLY-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64V-ONLY-NEXT: vslide1up.vx v9, v8, a2
; RV64V-ONLY-NEXT: vslide1up.vx v10, v9, a1
; RV64V-ONLY-NEXT: vslide1up.vx v8, v10, a0
; RV64V-ONLY-NEXT: vredmaxu.vs v11, v14, v14
; RV64V-ONLY-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; RV64V-ONLY-NEXT: vslideup.vi v10, v11, 1
; RV64V-ONLY-NEXT: vslideup.vi v9, v10, 1
; RV64V-ONLY-NEXT: vslideup.vi v8, v9, 1
; RV64V-ONLY-NEXT: ret
;
; RVA22U64-LABEL: buildvec_vredmax_slideup:
@@ -3595,14 +3580,11 @@ define <4 x i32> @buildvec_vredmax_slideup(<8 x i32> %arg0, <8 x i32> %arg1, <8
; RV64ZVE32-NEXT: vredmaxu.vs v8, v8, v8
; RV64ZVE32-NEXT: vredmaxu.vs v9, v10, v10
; RV64ZVE32-NEXT: vredmaxu.vs v10, v12, v12
; RV64ZVE32-NEXT: vmv.x.s a0, v8
; RV64ZVE32-NEXT: vmv.x.s a1, v9
; RV64ZVE32-NEXT: vmv.x.s a2, v10
; RV64ZVE32-NEXT: vredmaxu.vs v8, v14, v14
; RV64ZVE32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64ZVE32-NEXT: vslide1up.vx v9, v8, a2
; RV64ZVE32-NEXT: vslide1up.vx v10, v9, a1
; RV64ZVE32-NEXT: vslide1up.vx v8, v10, a0
; RV64ZVE32-NEXT: vredmaxu.vs v11, v14, v14
; RV64ZVE32-NEXT: vsetivli zero, 4, e32, m1, tu, ma
; RV64ZVE32-NEXT: vslideup.vi v10, v11, 1
; RV64ZVE32-NEXT: vslideup.vi v9, v10, 1
; RV64ZVE32-NEXT: vslideup.vi v8, v9, 1
; RV64ZVE32-NEXT: ret
%247 = tail call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %arg0)
%248 = insertelement <4 x i32> poison, i32 %247, i64 0
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/RISCV/rvv/redundant-vfmvsf.ll
@@ -9,11 +9,11 @@ define <2 x float> @redundant_vfmv(<2 x float> %arg0, <64 x float> %arg1, <64 x
; CHECK-NEXT: vfredusum.vs v9, v12, v8
; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 1
; CHECK-NEXT: vfmv.f.s fa5, v9
; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
; CHECK-NEXT: vfredusum.vs v9, v16, v8
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vfslide1up.vf v8, v9, fa5
; CHECK-NEXT: vfredusum.vs v8, v16, v8
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, tu, ma
; CHECK-NEXT: vslideup.vi v9, v8, 1
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%s0 = extractelement <2 x float> %arg0, i64 0
%r0 = tail call reassoc float @llvm.vector.reduce.fadd.v64f32(float %s0, <64 x float> %arg1)