diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4a1db80076530..2149739443650 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4513,41 +4513,104 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   const unsigned Policy = RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC;
 
+  // General case: splat the first operand and slide the other operands down
+  // one by one to form a vector. Alternatively, if every operand is an
+  // extraction from element 0 of a vector, we use the vector from the last
+  // extraction as the start value and slide up instead of down. That way
+  // (1) we avoid the initial splat, and (2) the vslide1up instructions can
+  // later be turned into a vslideup of 1, eliminating the vector-to-scalar
+  // movement; neither is possible with vslide1down/vslidedown.
+  // Using vslide1up/vslideup might increase register pressure, however, so
+  // we conservatively limit this to cases where every operand is an
+  // extraction from the first element.
+  SmallVector<SDValue> Operands(Op->op_begin(), Op->op_end());
+  SDValue EVec;
+  bool SlideUp = false;
+  auto getVSlide = [&](EVT ContainerVT, SDValue Passthru, SDValue Vec,
+                       SDValue Offset, SDValue Mask, SDValue VL) -> SDValue {
+    if (SlideUp)
+      return getVSlideup(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset,
+                         Mask, VL, Policy);
+    return getVSlidedown(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset,
+                         Mask, VL, Policy);
+  };
+
+  // We don't use all_of here because we're also capturing EVec from the last
+  // non-undef operand. If the std::execution_policy of the underlying
+  // std::all_of were anything but std::sequenced_policy, we might capture
+  // the wrong EVec.
+  for (SDValue V : Operands) {
+    using namespace SDPatternMatch;
+    SlideUp = V.isUndef() || sd_match(V, m_ExtractElt(m_Value(EVec), m_Zero()));
+    if (!SlideUp)
+      break;
+  }
+
+  if (SlideUp) {
+    MVT EVecContainerVT = EVec.getSimpleValueType();
+    // Make sure the original vector has a scalable vector type.
+    if (EVecContainerVT.isFixedLengthVector()) {
+      EVecContainerVT =
+          getContainerForFixedLengthVector(DAG, EVecContainerVT, Subtarget);
+      EVec = convertToScalableVector(EVecContainerVT, EVec, DAG, Subtarget);
+    }
+
+    // Adapt EVec's type into ContainerVT.
+    if (EVecContainerVT.getVectorMinNumElements() <
+        ContainerVT.getVectorMinNumElements())
+      EVec = DAG.getInsertSubvector(DL, DAG.getUNDEF(ContainerVT), EVec, 0);
+    else
+      EVec = DAG.getExtractSubvector(DL, ContainerVT, EVec, 0);
+
+    // Reverse the elements as we're going to slide up from the last element.
+    std::reverse(Operands.begin(), Operands.end());
+  }
+
   SDValue Vec;
   UndefCount = 0;
-  for (SDValue V : Op->ops()) {
+  for (SDValue V : Operands) {
     if (V.isUndef()) {
       UndefCount++;
       continue;
     }
-    // Start our sequence with a TA splat in the hopes that hardware is able to
-    // recognize there's no dependency on the prior value of our temporary
-    // register.
+    // Start our sequence with either a TA splat or the extract source in the
+    // hope that hardware is able to recognize there's no dependency on the
+    // prior value of our temporary register.
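+    // A rough sketch of the SlideUp path with hypothetical source vectors
+    // %a, %b, %c:
+    //   (build_vector (extract_elt %a, 0), (extract_elt %b, 0),
+    //                 (extract_elt %c, 0))
+    // Operands is reversed to c, b, a; Vec starts out as %c's source vector
+    // (whose element 0 already holds the last result), and %b[0] and %a[0]
+    // are then inserted with vslide1up, so no initial splat is needed.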
if (!Vec) { - Vec = DAG.getSplatVector(VT, DL, V); - Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget); + if (SlideUp) { + Vec = EVec; + } else { + Vec = DAG.getSplatVector(VT, DL, V); + Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget); + } + UndefCount = 0; continue; } if (UndefCount) { const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT()); - Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT), - Vec, Offset, Mask, VL, Policy); + Vec = getVSlide(ContainerVT, DAG.getUNDEF(ContainerVT), Vec, Offset, Mask, + VL); UndefCount = 0; } - auto OpCode = - VT.isFloatingPoint() ? RISCVISD::VFSLIDE1DOWN_VL : RISCVISD::VSLIDE1DOWN_VL; + + unsigned Opcode; + if (VT.isFloatingPoint()) + Opcode = SlideUp ? RISCVISD::VFSLIDE1UP_VL : RISCVISD::VFSLIDE1DOWN_VL; + else + Opcode = SlideUp ? RISCVISD::VSLIDE1UP_VL : RISCVISD::VSLIDE1DOWN_VL; + if (!VT.isFloatingPoint()) V = DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getXLenVT(), V); - Vec = DAG.getNode(OpCode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec, + Vec = DAG.getNode(Opcode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec, V, Mask, VL); } if (UndefCount) { const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT()); - Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT), - Vec, Offset, Mask, VL, Policy); + Vec = getVSlide(ContainerVT, DAG.getUNDEF(ContainerVT), Vec, Offset, Mask, + VL); } return convertFromScalableVector(VT, Vec, DAG, Subtarget); } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index 3c3e08d387faa..aa3b9abe3a7aa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -1828,3 +1828,169 @@ define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, d %v7 = insertelement <8 x double> %v6, double %e7, i64 7 ret <8 x double> %v7 } + +define <4 x float> @buildvec_vfredusum_slideup(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind { +; CHECK-LABEL: buildvec_vfredusum_slideup: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vfmv.s.f v16, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v16 +; CHECK-NEXT: vfredusum.vs v9, v10, v16 +; CHECK-NEXT: vfredusum.vs v10, v12, v16 +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: vfmv.f.s fa4, v9 +; CHECK-NEXT: vfmv.f.s fa3, v10 +; CHECK-NEXT: vfredusum.vs v8, v14, v16 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vfslide1up.vf v9, v8, fa3 +; CHECK-NEXT: vfslide1up.vf v10, v9, fa4 +; CHECK-NEXT: vfslide1up.vf v8, v10, fa5 +; CHECK-NEXT: ret + %247 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1) + %248 = insertelement <4 x float> poison, float %247, i64 0 + %250 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2) + %251 = insertelement <4 x float> %248, float %250, i64 1 + %252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3) + %253 = insertelement <4 x float> %251, float %252, i64 2 + %254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4) + %255 = insertelement <4 x float> %253, float %254, i64 3 + ret <4 x float> %255 +} + +define <8 x float> @buildvec_vfredusum_slideup_leading_undef(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, 
<8 x float> %arg4) nounwind { +; CHECK-LABEL: buildvec_vfredusum_slideup_leading_undef: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vfmv.s.f v16, fa0 +; CHECK-NEXT: vfredusum.vs v9, v8, v16 +; CHECK-NEXT: vfredusum.vs v10, v10, v16 +; CHECK-NEXT: vfredusum.vs v11, v12, v16 +; CHECK-NEXT: vfredusum.vs v8, v14, v16 +; CHECK-NEXT: vfmv.f.s fa5, v9 +; CHECK-NEXT: vfmv.f.s fa4, v10 +; CHECK-NEXT: vfmv.f.s fa3, v11 +; CHECK-NEXT: vfslide1up.vf v10, v8, fa3 +; CHECK-NEXT: vfslide1up.vf v8, v10, fa4 +; CHECK-NEXT: vfslide1up.vf v10, v8, fa5 +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: ret + %252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1) + %253 = insertelement <8 x float> poison, float %252, i64 4 + %254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2) + %255 = insertelement <8 x float> %253, float %254, i64 5 + %256 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3) + %257 = insertelement <8 x float> %255, float %256, i64 6 + %258 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4) + %259 = insertelement <8 x float> %257, float %258, i64 7 + ret <8 x float> %259 +} + +define <8 x float> @buildvec_vfredusum_slideup_trailing_undef(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind { +; CHECK-LABEL: buildvec_vfredusum_slideup_trailing_undef: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vfmv.s.f v16, fa0 +; CHECK-NEXT: vfredusum.vs v9, v8, v16 +; CHECK-NEXT: vfredusum.vs v10, v10, v16 +; CHECK-NEXT: vfredusum.vs v11, v12, v16 +; CHECK-NEXT: vfredusum.vs v8, v14, v16 +; CHECK-NEXT: vfmv.f.s fa5, v9 +; CHECK-NEXT: vfmv.f.s fa4, v10 +; CHECK-NEXT: vfmv.f.s fa3, v11 +; CHECK-NEXT: vfslide1up.vf v10, v8, fa3 +; CHECK-NEXT: vfslide1up.vf v12, v10, fa4 +; CHECK-NEXT: vfslide1up.vf v8, v12, fa5 +; CHECK-NEXT: ret + %252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1) + %253 = insertelement <8 x float> poison, float %252, i64 0 + %254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2) + %255 = insertelement <8 x float> %253, float %254, i64 1 + %256 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3) + %257 = insertelement <8 x float> %255, float %256, i64 2 + %258 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4) + %259 = insertelement <8 x float> %257, float %258, i64 3 + ret <8 x float> %259 +} + +; Negative test case checking if we generate slideup only when all build_vec operands are extraction from the first vector element. 
+define <8 x float> @buildvec_vfredusum_slideup_not_extract_first(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind { +; CHECK-LABEL: buildvec_vfredusum_slideup_not_extract_first: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v10 +; CHECK-NEXT: vfredusum.vs v9, v12, v10 +; CHECK-NEXT: vfredusum.vs v10, v14, v10 +; CHECK-NEXT: vfmv.f.s fa5, v9 +; CHECK-NEXT: vfmv.f.s fa4, v10 +; CHECK-NEXT: vrgather.vi v10, v8, 0 +; CHECK-NEXT: vfslide1down.vf v8, v10, fa0 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa5 +; CHECK-NEXT: vfslide1down.vf v8, v8, fa4 +; CHECK-NEXT: vslidedown.vi v8, v8, 4 +; CHECK-NEXT: ret + %252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1) + %253 = insertelement <8 x float> poison, float %252, i64 0 + %255 = insertelement <8 x float> %253, float %start, i64 1 + %256 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3) + %257 = insertelement <8 x float> %255, float %256, i64 2 + %258 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4) + %259 = insertelement <8 x float> %257, float %258, i64 3 + ret <8 x float> %259 +} + +define <8 x float> @buildvec_vfredusum_slideup_mid_undef(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind { +; CHECK-LABEL: buildvec_vfredusum_slideup_mid_undef: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vfmv.s.f v16, fa0 +; CHECK-NEXT: vfredusum.vs v9, v8, v16 +; CHECK-NEXT: vfredusum.vs v10, v10, v16 +; CHECK-NEXT: vfredusum.vs v11, v12, v16 +; CHECK-NEXT: vfredusum.vs v8, v14, v16 +; CHECK-NEXT: vfmv.f.s fa5, v9 +; CHECK-NEXT: vfmv.f.s fa4, v10 +; CHECK-NEXT: vfmv.f.s fa3, v11 +; CHECK-NEXT: vfslide1up.vf v10, v8, fa3 +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: vfslide1up.vf v10, v8, fa4 +; CHECK-NEXT: vfslide1up.vf v8, v10, fa5 +; CHECK-NEXT: ret + %252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1) + %253 = insertelement <8 x float> poison, float %252, i64 0 + %254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2) + %255 = insertelement <8 x float> %253, float %254, i64 1 + %256 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3) + %257 = insertelement <8 x float> %255, float %256, i64 6 + %258 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4) + %259 = insertelement <8 x float> %257, float %258, i64 7 + ret <8 x float> %259 +} + +define <4 x float> @buildvec_vfredosum_slideup(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind { +; CHECK-LABEL: buildvec_vfredosum_slideup: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vfmv.s.f v16, fa0 +; CHECK-NEXT: vfredosum.vs v8, v8, v16 +; CHECK-NEXT: vfredosum.vs v9, v10, v16 +; CHECK-NEXT: vfredosum.vs v10, v12, v16 +; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: vfmv.f.s fa4, v9 +; CHECK-NEXT: vfmv.f.s fa3, v10 +; CHECK-NEXT: vfredosum.vs v8, v14, v16 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vfslide1up.vf v9, v8, fa3 +; CHECK-NEXT: vfslide1up.vf v10, v9, fa4 +; CHECK-NEXT: vfslide1up.vf v8, v10, fa5 +; CHECK-NEXT: ret + %247 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1) 
+ %248 = insertelement <4 x float> poison, float %247, i64 0 + %250 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2) + %251 = insertelement <4 x float> %248, float %250, i64 1 + %252 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3) + %253 = insertelement <4 x float> %251, float %252, i64 2 + %254 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4) + %255 = insertelement <4 x float> %253, float %254, i64 3 + ret <4 x float> %255 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index d9bb007a10f71..eedf19c38766b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -3416,5 +3416,204 @@ define <4 x i1> @buildvec_i1_splat(i1 %e1) { ret <4 x i1> %v4 } +define <4 x i32> @buildvec_vredsum_slideup(<8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> %arg2, <8 x i32> %arg3) nounwind { +; RV32-LABEL: buildvec_vredsum_slideup: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.s.x v16, zero +; RV32-NEXT: vredsum.vs v8, v8, v16 +; RV32-NEXT: vredsum.vs v9, v10, v16 +; RV32-NEXT: vredsum.vs v10, v12, v16 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a1, v9 +; RV32-NEXT: vmv.x.s a2, v10 +; RV32-NEXT: vredsum.vs v8, v14, v16 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vslide1up.vx v9, v8, a2 +; RV32-NEXT: vslide1up.vx v10, v9, a1 +; RV32-NEXT: vslide1up.vx v8, v10, a0 +; RV32-NEXT: ret +; +; RV64V-ONLY-LABEL: buildvec_vredsum_slideup: +; RV64V-ONLY: # %bb.0: +; RV64V-ONLY-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64V-ONLY-NEXT: vmv.s.x v16, zero +; RV64V-ONLY-NEXT: vredsum.vs v8, v8, v16 +; RV64V-ONLY-NEXT: vredsum.vs v9, v10, v16 +; RV64V-ONLY-NEXT: vredsum.vs v10, v12, v16 +; RV64V-ONLY-NEXT: vmv.x.s a0, v8 +; RV64V-ONLY-NEXT: vmv.x.s a1, v9 +; RV64V-ONLY-NEXT: vmv.x.s a2, v10 +; RV64V-ONLY-NEXT: vredsum.vs v8, v14, v16 +; RV64V-ONLY-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64V-ONLY-NEXT: vslide1up.vx v9, v8, a2 +; RV64V-ONLY-NEXT: vslide1up.vx v10, v9, a1 +; RV64V-ONLY-NEXT: vslide1up.vx v8, v10, a0 +; RV64V-ONLY-NEXT: ret +; +; RVA22U64-LABEL: buildvec_vredsum_slideup: +; RVA22U64: # %bb.0: +; RVA22U64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RVA22U64-NEXT: vmv.s.x v16, zero +; RVA22U64-NEXT: vredsum.vs v8, v8, v16 +; RVA22U64-NEXT: vredsum.vs v9, v10, v16 +; RVA22U64-NEXT: vredsum.vs v10, v12, v16 +; RVA22U64-NEXT: vredsum.vs v11, v14, v16 +; RVA22U64-NEXT: vmv.x.s a0, v8 +; RVA22U64-NEXT: vmv.x.s a1, v9 +; RVA22U64-NEXT: vmv.x.s a2, v10 +; RVA22U64-NEXT: slli a1, a1, 32 +; RVA22U64-NEXT: add.uw a0, a0, a1 +; RVA22U64-NEXT: vmv.x.s a1, v11 +; RVA22U64-NEXT: slli a1, a1, 32 +; RVA22U64-NEXT: add.uw a1, a2, a1 +; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RVA22U64-NEXT: vmv.v.x v8, a0 +; RVA22U64-NEXT: vslide1down.vx v8, v8, a1 +; RVA22U64-NEXT: ret +; +; RVA22U64-PACK-LABEL: buildvec_vredsum_slideup: +; RVA22U64-PACK: # %bb.0: +; RVA22U64-PACK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RVA22U64-PACK-NEXT: vmv.s.x v16, zero +; RVA22U64-PACK-NEXT: vredsum.vs v8, v8, v16 +; RVA22U64-PACK-NEXT: vredsum.vs v9, v10, v16 +; RVA22U64-PACK-NEXT: vredsum.vs v10, v12, v16 +; RVA22U64-PACK-NEXT: vredsum.vs v11, v14, v16 +; RVA22U64-PACK-NEXT: vmv.x.s a0, v8 +; RVA22U64-PACK-NEXT: vmv.x.s a1, v9 +; RVA22U64-PACK-NEXT: vmv.x.s a2, v10 +; RVA22U64-PACK-NEXT: pack a0, a0, a1 +; 
RVA22U64-PACK-NEXT: vmv.x.s a1, v11 +; RVA22U64-PACK-NEXT: pack a1, a2, a1 +; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RVA22U64-PACK-NEXT: vmv.v.x v8, a0 +; RVA22U64-PACK-NEXT: vslide1down.vx v8, v8, a1 +; RVA22U64-PACK-NEXT: ret +; +; RV64ZVE32-LABEL: buildvec_vredsum_slideup: +; RV64ZVE32: # %bb.0: +; RV64ZVE32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32-NEXT: vmv.s.x v16, zero +; RV64ZVE32-NEXT: vredsum.vs v8, v8, v16 +; RV64ZVE32-NEXT: vredsum.vs v9, v10, v16 +; RV64ZVE32-NEXT: vredsum.vs v10, v12, v16 +; RV64ZVE32-NEXT: vmv.x.s a0, v8 +; RV64ZVE32-NEXT: vmv.x.s a1, v9 +; RV64ZVE32-NEXT: vmv.x.s a2, v10 +; RV64ZVE32-NEXT: vredsum.vs v8, v14, v16 +; RV64ZVE32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64ZVE32-NEXT: vslide1up.vx v9, v8, a2 +; RV64ZVE32-NEXT: vslide1up.vx v10, v9, a1 +; RV64ZVE32-NEXT: vslide1up.vx v8, v10, a0 +; RV64ZVE32-NEXT: ret + %247 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %arg0) + %248 = insertelement <4 x i32> poison, i32 %247, i64 0 + %250 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %arg1) + %251 = insertelement <4 x i32> %248, i32 %250, i64 1 + %252 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %arg2) + %253 = insertelement <4 x i32> %251, i32 %252, i64 2 + %254 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %arg3) + %255 = insertelement <4 x i32> %253, i32 %254, i64 3 + ret <4 x i32> %255 +} + +define <4 x i32> @buildvec_vredmax_slideup(<8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> %arg2, <8 x i32> %arg3) nounwind { +; RV32-LABEL: buildvec_vredmax_slideup: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vredmaxu.vs v8, v8, v8 +; RV32-NEXT: vredmaxu.vs v9, v10, v10 +; RV32-NEXT: vredmaxu.vs v10, v12, v12 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a1, v9 +; RV32-NEXT: vmv.x.s a2, v10 +; RV32-NEXT: vredmaxu.vs v8, v14, v14 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vslide1up.vx v9, v8, a2 +; RV32-NEXT: vslide1up.vx v10, v9, a1 +; RV32-NEXT: vslide1up.vx v8, v10, a0 +; RV32-NEXT: ret +; +; RV64V-ONLY-LABEL: buildvec_vredmax_slideup: +; RV64V-ONLY: # %bb.0: +; RV64V-ONLY-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64V-ONLY-NEXT: vredmaxu.vs v8, v8, v8 +; RV64V-ONLY-NEXT: vredmaxu.vs v9, v10, v10 +; RV64V-ONLY-NEXT: vredmaxu.vs v10, v12, v12 +; RV64V-ONLY-NEXT: vmv.x.s a0, v8 +; RV64V-ONLY-NEXT: vmv.x.s a1, v9 +; RV64V-ONLY-NEXT: vmv.x.s a2, v10 +; RV64V-ONLY-NEXT: vredmaxu.vs v8, v14, v14 +; RV64V-ONLY-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64V-ONLY-NEXT: vslide1up.vx v9, v8, a2 +; RV64V-ONLY-NEXT: vslide1up.vx v10, v9, a1 +; RV64V-ONLY-NEXT: vslide1up.vx v8, v10, a0 +; RV64V-ONLY-NEXT: ret +; +; RVA22U64-LABEL: buildvec_vredmax_slideup: +; RVA22U64: # %bb.0: +; RVA22U64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RVA22U64-NEXT: vredmaxu.vs v8, v8, v8 +; RVA22U64-NEXT: vredmaxu.vs v9, v10, v10 +; RVA22U64-NEXT: vredmaxu.vs v10, v12, v12 +; RVA22U64-NEXT: vredmaxu.vs v11, v14, v14 +; RVA22U64-NEXT: vmv.x.s a0, v8 +; RVA22U64-NEXT: vmv.x.s a1, v9 +; RVA22U64-NEXT: vmv.x.s a2, v10 +; RVA22U64-NEXT: slli a1, a1, 32 +; RVA22U64-NEXT: add.uw a0, a0, a1 +; RVA22U64-NEXT: vmv.x.s a1, v11 +; RVA22U64-NEXT: slli a1, a1, 32 +; RVA22U64-NEXT: add.uw a1, a2, a1 +; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RVA22U64-NEXT: vmv.v.x v8, a0 +; RVA22U64-NEXT: vslide1down.vx v8, v8, a1 +; RVA22U64-NEXT: ret +; +; RVA22U64-PACK-LABEL: buildvec_vredmax_slideup: +; RVA22U64-PACK: # %bb.0: +; RVA22U64-PACK-NEXT: vsetivli zero, 8, e32, m2, ta, ma 
+; RVA22U64-PACK-NEXT: vredmaxu.vs v8, v8, v8 +; RVA22U64-PACK-NEXT: vredmaxu.vs v9, v10, v10 +; RVA22U64-PACK-NEXT: vredmaxu.vs v10, v12, v12 +; RVA22U64-PACK-NEXT: vredmaxu.vs v11, v14, v14 +; RVA22U64-PACK-NEXT: vmv.x.s a0, v8 +; RVA22U64-PACK-NEXT: vmv.x.s a1, v9 +; RVA22U64-PACK-NEXT: vmv.x.s a2, v10 +; RVA22U64-PACK-NEXT: pack a0, a0, a1 +; RVA22U64-PACK-NEXT: vmv.x.s a1, v11 +; RVA22U64-PACK-NEXT: pack a1, a2, a1 +; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RVA22U64-PACK-NEXT: vmv.v.x v8, a0 +; RVA22U64-PACK-NEXT: vslide1down.vx v8, v8, a1 +; RVA22U64-PACK-NEXT: ret +; +; RV64ZVE32-LABEL: buildvec_vredmax_slideup: +; RV64ZVE32: # %bb.0: +; RV64ZVE32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64ZVE32-NEXT: vredmaxu.vs v8, v8, v8 +; RV64ZVE32-NEXT: vredmaxu.vs v9, v10, v10 +; RV64ZVE32-NEXT: vredmaxu.vs v10, v12, v12 +; RV64ZVE32-NEXT: vmv.x.s a0, v8 +; RV64ZVE32-NEXT: vmv.x.s a1, v9 +; RV64ZVE32-NEXT: vmv.x.s a2, v10 +; RV64ZVE32-NEXT: vredmaxu.vs v8, v14, v14 +; RV64ZVE32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64ZVE32-NEXT: vslide1up.vx v9, v8, a2 +; RV64ZVE32-NEXT: vslide1up.vx v10, v9, a1 +; RV64ZVE32-NEXT: vslide1up.vx v8, v10, a0 +; RV64ZVE32-NEXT: ret + %247 = tail call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %arg0) + %248 = insertelement <4 x i32> poison, i32 %247, i64 0 + %250 = tail call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %arg1) + %251 = insertelement <4 x i32> %248, i32 %250, i64 1 + %252 = tail call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %arg2) + %253 = insertelement <4 x i32> %251, i32 %252, i64 2 + %254 = tail call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %arg3) + %255 = insertelement <4 x i32> %253, i32 %254, i64 3 + ret <4 x i32> %255 +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/redundant-vfmvsf.ll b/llvm/test/CodeGen/RISCV/rvv/redundant-vfmvsf.ll index da912bf401ec0..821d4240827fb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/redundant-vfmvsf.ll +++ b/llvm/test/CodeGen/RISCV/rvv/redundant-vfmvsf.ll @@ -9,12 +9,11 @@ define <2 x float> @redundant_vfmv(<2 x float> %arg0, <64 x float> %arg1, <64 x ; CHECK-NEXT: vfredusum.vs v9, v12, v8 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vfmv.f.s fa5, v9 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vfredusum.vs v8, v16, v8 -; CHECK-NEXT: vfmv.f.s fa5, v8 +; CHECK-NEXT: vfredusum.vs v9, v16, v8 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vrgather.vi v8, v9, 0 -; CHECK-NEXT: vfslide1down.vf v8, v8, fa5 +; CHECK-NEXT: vfslide1up.vf v8, v9, fa5 ; CHECK-NEXT: ret %s0 = extractelement <2 x float> %arg0, i64 0 %r0 = tail call reassoc float @llvm.vector.reduce.fadd.v64f32(float %s0, <64 x float> %arg1)