Commit c5b56c2

[RISCV] Use slideup when the last build_vector operand is a reduction
1 parent f74a607 commit c5b56c2

4 files changed: 114 additions, 68 deletions
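
For context, the pattern this commit targets is a build_vector whose operands are scalars extracted from vector reductions, e.g. when several horizontal reductions are packed into one short vector. The hypothetical C++ snippet below is an editorial sketch, not taken from the commit or its tests (the function name and array shapes are invented); it shows the kind of source that, assuming the loop vectorizer turns each std::accumulate into a vector.reduce.add feeding an insertelement chain, produces exactly such a build_vector:

#include <array>
#include <cstddef>
#include <numeric>

// Each lane of the result is an independent horizontal sum over one
// 8-element row.  After vectorization, every accumulate becomes a
// reduction whose scalar result is inserted into a <4 x i32> -- a
// build_vector whose operands (including the last one) come from
// reductions, which is the case the new lowering handles.
std::array<int, 4> pack_sums(const std::array<std::array<int, 8>, 4> &rows) {
  std::array<int, 4> out{};
  for (std::size_t i = 0; i < 4; ++i)
    out[i] = std::accumulate(rows[i].begin(), rows[i].end(), 0);
  return out;
}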

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 73 additions & 12 deletions
@@ -4512,42 +4512,103 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
          "Illegal type which will result in reserved encoding");
 
   const unsigned Policy = RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC;
+  auto getVSlide = [&](bool SlideUp, EVT ContainerVT, SDValue Passthru,
+                       SDValue Vec, SDValue Offset, SDValue Mask,
+                       SDValue VL) -> SDValue {
+    if (SlideUp)
+      return getVSlideup(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset,
+                         Mask, VL, Policy);
+    return getVSlidedown(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset,
+                         Mask, VL, Policy);
+  };
+
+  // General case: splat the first operand and slide the other operands down
+  // one by one to form a vector. Alternatively, if the last operand is an
+  // extraction from a reduction result, we can use the original vector
+  // reduction result as the start value and slide up instead of down,
+  // which lets us avoid the splat.
+  SmallVector<SDValue> Operands(Op->op_begin(), Op->op_end());
+  SDValue Reduce;
+  bool SlideUp = false;
+  // Find the first non-undef operand from the tail.
+  auto ItLastNonUndef = find_if(Operands.rbegin(), Operands.rend(),
+                                [](SDValue V) { return !V.isUndef(); });
+  if (ItLastNonUndef != Operands.rend()) {
+    using namespace SDPatternMatch;
+    // Check if the last non-undef operand was extracted from a reduction.
+    for (unsigned Opc :
+         {RISCVISD::VECREDUCE_ADD_VL, RISCVISD::VECREDUCE_UMAX_VL,
+          RISCVISD::VECREDUCE_SMAX_VL, RISCVISD::VECREDUCE_UMIN_VL,
+          RISCVISD::VECREDUCE_SMIN_VL, RISCVISD::VECREDUCE_AND_VL,
+          RISCVISD::VECREDUCE_OR_VL, RISCVISD::VECREDUCE_XOR_VL,
+          RISCVISD::VECREDUCE_FADD_VL, RISCVISD::VECREDUCE_SEQ_FADD_VL,
+          RISCVISD::VECREDUCE_FMAX_VL, RISCVISD::VECREDUCE_FMIN_VL}) {
+      SlideUp = sd_match(
+          *ItLastNonUndef,
+          m_ExtractElt(m_AllOf(m_Opc(Opc), m_Value(Reduce)), m_Zero()));
+      if (SlideUp)
+        break;
+    }
+  }
+
+  if (SlideUp) {
+    // Adapt Reduce's type into ContainerVT.
+    if (Reduce.getValueType().getVectorMinNumElements() <
+        ContainerVT.getVectorMinNumElements())
+      Reduce = DAG.getInsertSubvector(DL, DAG.getUNDEF(ContainerVT), Reduce, 0);
+    else
+      Reduce = DAG.getExtractSubvector(DL, ContainerVT, Reduce, 0);
+
+    // Reverse the elements as we're going to slide up from the last element.
+    for (unsigned i = 0U, N = Operands.size(), H = divideCeil(N, 2); i < H; ++i)
+      std::swap(Operands[i], Operands[N - 1 - i]);
+  }
 
   SDValue Vec;
   UndefCount = 0;
-  for (SDValue V : Op->ops()) {
+  for (SDValue V : Operands) {
     if (V.isUndef()) {
       UndefCount++;
       continue;
     }
 
-    // Start our sequence with a TA splat in the hopes that hardware is able to
-    // recognize there's no dependency on the prior value of our temporary
-    // register.
+    // Start our sequence with either a TA splat or a reduction result in the
+    // hopes that hardware is able to recognize there's no dependency on the
+    // prior value of our temporary register.
     if (!Vec) {
-      Vec = DAG.getSplatVector(VT, DL, V);
-      Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+      if (SlideUp) {
+        Vec = Reduce;
+      } else {
+        Vec = DAG.getSplatVector(VT, DL, V);
+        Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+      }
+
       UndefCount = 0;
       continue;
     }
 
     if (UndefCount) {
       const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
-      Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
-                          Vec, Offset, Mask, VL, Policy);
+      Vec = getVSlide(SlideUp, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
+                      Offset, Mask, VL);
       UndefCount = 0;
     }
-    auto OpCode =
-      VT.isFloatingPoint() ? RISCVISD::VFSLIDE1DOWN_VL : RISCVISD::VSLIDE1DOWN_VL;
+
+    unsigned OpCode;
+    if (VT.isFloatingPoint())
+      OpCode = SlideUp ? RISCVISD::VFSLIDE1UP_VL : RISCVISD::VFSLIDE1DOWN_VL;
+    else
+      OpCode = SlideUp ? RISCVISD::VSLIDE1UP_VL : RISCVISD::VSLIDE1DOWN_VL;
+
     if (!VT.isFloatingPoint())
       V = DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getXLenVT(), V);
     Vec = DAG.getNode(OpCode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
                       V, Mask, VL);
   }
   if (UndefCount) {
     const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
-    Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
-                        Vec, Offset, Mask, VL, Policy);
+    Vec = getVSlide(SlideUp, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
+                    Offset, Mask, VL);
   }
   return convertFromScalableVector(VT, Vec, DAG, Subtarget);
 }
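
To make the element-order argument in the comment above concrete, here is a small standalone sketch (an editorial illustration, not LLVM code) that models vslide1down.vx and vslide1up.vx on a fixed group of four elements. Sliding the remaining scalars down from a splat of the first reduction result, and sliding them up in reverse order starting from a vector that already holds the last reduction result in element 0, build the same vector, which is why the new path can skip the splat. The values r0..r3 stand in for the four reduction results.

#include <cassert>
#include <vector>

// Models vslide1down.vx: shift every element one slot toward index 0 and
// insert the scalar into the last position.
static std::vector<int> slide1down(std::vector<int> v, int x) {
  v.erase(v.begin());
  v.push_back(x);
  return v;
}

// Models vslide1up.vx: shift every element one slot toward the tail and
// insert the scalar into position 0.
static std::vector<int> slide1up(std::vector<int> v, int x) {
  v.pop_back();
  v.insert(v.begin(), x);
  return v;
}

int main() {
  int r0 = 10, r1 = 20, r2 = 30, r3 = 40; // four reduction results

  // Old lowering: splat r0, then slide the remaining scalars down one by one.
  std::vector<int> down(4, r0);
  down = slide1down(down, r1);
  down = slide1down(down, r2);
  down = slide1down(down, r3);

  // New lowering: start from the vector that already holds the last
  // reduction result in element 0 (no splat needed), then slide the other
  // scalars up in reverse order.
  std::vector<int> up = {r3, 0, 0, 0}; // element 0 comes straight from the reduction
  up = slide1up(up, r2);
  up = slide1up(up, r1);
  up = slide1up(up, r0);

  assert(down == up && down == std::vector<int>({10, 20, 30, 40}));
  return 0;
}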

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll

Lines changed: 14 additions & 16 deletions
@@ -1890,15 +1890,14 @@ define <4 x float> @buildvec_vfredusum(float %start, <8 x float> %arg1, <8 x flo
 ; CHECK-NEXT:    vfredusum.vs v8, v8, v16
 ; CHECK-NEXT:    vfredusum.vs v9, v10, v16
 ; CHECK-NEXT:    vfredusum.vs v10, v12, v16
-; CHECK-NEXT:    vfredusum.vs v11, v14, v16
-; CHECK-NEXT:    vfmv.f.s fa5, v9
-; CHECK-NEXT:    vfmv.f.s fa4, v10
-; CHECK-NEXT:    vfmv.f.s fa3, v11
+; CHECK-NEXT:    vfmv.f.s fa5, v8
+; CHECK-NEXT:    vfmv.f.s fa4, v9
+; CHECK-NEXT:    vfmv.f.s fa3, v10
+; CHECK-NEXT:    vfredusum.vs v8, v14, v16
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 0
-; CHECK-NEXT:    vfslide1down.vf v8, v9, fa5
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa4
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa3
+; CHECK-NEXT:    vfslide1up.vf v9, v8, fa3
+; CHECK-NEXT:    vfslide1up.vf v10, v9, fa4
+; CHECK-NEXT:    vfslide1up.vf v8, v10, fa5
 ; CHECK-NEXT:    ret
   %247 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
   %248 = insertelement <4 x float> poison, float %247, i64 0
@@ -1919,15 +1918,14 @@ define <4 x float> @buildvec_vfredosum(float %start, <8 x float> %arg1, <8 x flo
 ; CHECK-NEXT:    vfredosum.vs v8, v8, v16
 ; CHECK-NEXT:    vfredosum.vs v9, v10, v16
 ; CHECK-NEXT:    vfredosum.vs v10, v12, v16
-; CHECK-NEXT:    vfredosum.vs v11, v14, v16
-; CHECK-NEXT:    vfmv.f.s fa5, v9
-; CHECK-NEXT:    vfmv.f.s fa4, v10
-; CHECK-NEXT:    vfmv.f.s fa3, v11
+; CHECK-NEXT:    vfmv.f.s fa5, v8
+; CHECK-NEXT:    vfmv.f.s fa4, v9
+; CHECK-NEXT:    vfmv.f.s fa3, v10
+; CHECK-NEXT:    vfredosum.vs v8, v14, v16
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vrgather.vi v9, v8, 0
-; CHECK-NEXT:    vfslide1down.vf v8, v9, fa5
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa4
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa3
+; CHECK-NEXT:    vfslide1up.vf v9, v8, fa3
+; CHECK-NEXT:    vfslide1up.vf v10, v9, fa4
+; CHECK-NEXT:    vfslide1up.vf v8, v10, fa5
 ; CHECK-NEXT:    ret
   %247 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
   %248 = insertelement <4 x float> poison, float %247, i64 0

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll

Lines changed: 24 additions & 36 deletions
@@ -3424,16 +3424,14 @@ define <4 x i32> @buildvec_vredsum(<8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> %
 ; RV32-NEXT:    vredsum.vs v8, v8, v16
 ; RV32-NEXT:    vredsum.vs v9, v10, v16
 ; RV32-NEXT:    vredsum.vs v10, v12, v16
-; RV32-NEXT:    vredsum.vs v11, v14, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vmv.x.s a1, v9
 ; RV32-NEXT:    vmv.x.s a2, v10
-; RV32-NEXT:    vmv.x.s a3, v11
+; RV32-NEXT:    vredsum.vs v8, v14, v16
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-NEXT:    vslide1up.vx v9, v8, a2
+; RV32-NEXT:    vslide1up.vx v10, v9, a1
+; RV32-NEXT:    vslide1up.vx v8, v10, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64V-ONLY-LABEL: buildvec_vredsum:
@@ -3443,16 +3441,14 @@ define <4 x i32> @buildvec_vredsum(<8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> %
 ; RV64V-ONLY-NEXT:    vredsum.vs v8, v8, v16
 ; RV64V-ONLY-NEXT:    vredsum.vs v9, v10, v16
 ; RV64V-ONLY-NEXT:    vredsum.vs v10, v12, v16
-; RV64V-ONLY-NEXT:    vredsum.vs v11, v14, v16
 ; RV64V-ONLY-NEXT:    vmv.x.s a0, v8
 ; RV64V-ONLY-NEXT:    vmv.x.s a1, v9
 ; RV64V-ONLY-NEXT:    vmv.x.s a2, v10
-; RV64V-ONLY-NEXT:    vmv.x.s a3, v11
+; RV64V-ONLY-NEXT:    vredsum.vs v8, v14, v16
 ; RV64V-ONLY-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64V-ONLY-NEXT:    vmv.v.x v8, a0
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a1
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a2
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV64V-ONLY-NEXT:    vslide1up.vx v9, v8, a2
+; RV64V-ONLY-NEXT:    vslide1up.vx v10, v9, a1
+; RV64V-ONLY-NEXT:    vslide1up.vx v8, v10, a0
 ; RV64V-ONLY-NEXT:    ret
 ;
 ; RVA22U64-LABEL: buildvec_vredsum:
@@ -3502,16 +3498,14 @@ define <4 x i32> @buildvec_vredsum(<8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> %
 ; RV64ZVE32-NEXT:    vredsum.vs v8, v8, v16
 ; RV64ZVE32-NEXT:    vredsum.vs v9, v10, v16
 ; RV64ZVE32-NEXT:    vredsum.vs v10, v12, v16
-; RV64ZVE32-NEXT:    vredsum.vs v11, v14, v16
 ; RV64ZVE32-NEXT:    vmv.x.s a0, v8
 ; RV64ZVE32-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32-NEXT:    vredsum.vs v8, v14, v16
 ; RV64ZVE32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64ZVE32-NEXT:    vmv.v.x v8, a0
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a1
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a2
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a3
+; RV64ZVE32-NEXT:    vslide1up.vx v9, v8, a2
+; RV64ZVE32-NEXT:    vslide1up.vx v10, v9, a1
+; RV64ZVE32-NEXT:    vslide1up.vx v8, v10, a0
 ; RV64ZVE32-NEXT:    ret
   %247 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %arg0)
   %248 = insertelement <4 x i32> poison, i32 %247, i64 0
@@ -3531,16 +3525,14 @@ define <4 x i32> @buildvec_vredmax(<8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> %
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vredmaxu.vs v9, v10, v10
 ; RV32-NEXT:    vredmaxu.vs v10, v12, v12
-; RV32-NEXT:    vredmaxu.vs v11, v14, v14
 ; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vmv.x.s a1, v9
 ; RV32-NEXT:    vmv.x.s a2, v10
-; RV32-NEXT:    vmv.x.s a3, v11
+; RV32-NEXT:    vredmaxu.vs v8, v14, v14
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-NEXT:    vslide1up.vx v9, v8, a2
+; RV32-NEXT:    vslide1up.vx v10, v9, a1
+; RV32-NEXT:    vslide1up.vx v8, v10, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64V-ONLY-LABEL: buildvec_vredmax:
@@ -3549,16 +3541,14 @@ define <4 x i32> @buildvec_vredmax(<8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> %
 ; RV64V-ONLY-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV64V-ONLY-NEXT:    vredmaxu.vs v9, v10, v10
 ; RV64V-ONLY-NEXT:    vredmaxu.vs v10, v12, v12
-; RV64V-ONLY-NEXT:    vredmaxu.vs v11, v14, v14
 ; RV64V-ONLY-NEXT:    vmv.x.s a0, v8
 ; RV64V-ONLY-NEXT:    vmv.x.s a1, v9
 ; RV64V-ONLY-NEXT:    vmv.x.s a2, v10
-; RV64V-ONLY-NEXT:    vmv.x.s a3, v11
+; RV64V-ONLY-NEXT:    vredmaxu.vs v8, v14, v14
 ; RV64V-ONLY-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64V-ONLY-NEXT:    vmv.v.x v8, a0
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a1
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a2
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV64V-ONLY-NEXT:    vslide1up.vx v9, v8, a2
+; RV64V-ONLY-NEXT:    vslide1up.vx v10, v9, a1
+; RV64V-ONLY-NEXT:    vslide1up.vx v8, v10, a0
 ; RV64V-ONLY-NEXT:    ret
 ;
 ; RVA22U64-LABEL: buildvec_vredmax:
@@ -3605,16 +3595,14 @@ define <4 x i32> @buildvec_vredmax(<8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> %
 ; RV64ZVE32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV64ZVE32-NEXT:    vredmaxu.vs v9, v10, v10
 ; RV64ZVE32-NEXT:    vredmaxu.vs v10, v12, v12
-; RV64ZVE32-NEXT:    vredmaxu.vs v11, v14, v14
 ; RV64ZVE32-NEXT:    vmv.x.s a0, v8
 ; RV64ZVE32-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32-NEXT:    vredmaxu.vs v8, v14, v14
 ; RV64ZVE32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64ZVE32-NEXT:    vmv.v.x v8, a0
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a1
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a2
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a3
+; RV64ZVE32-NEXT:    vslide1up.vx v9, v8, a2
+; RV64ZVE32-NEXT:    vslide1up.vx v10, v9, a1
+; RV64ZVE32-NEXT:    vslide1up.vx v8, v10, a0
 ; RV64ZVE32-NEXT:    ret
   %247 = tail call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %arg0)
   %248 = insertelement <4 x i32> poison, i32 %247, i64 0

llvm/test/CodeGen/RISCV/rvv/redundant-vfmvsf.ll

Lines changed: 3 additions & 4 deletions
@@ -9,12 +9,11 @@ define <2 x float> @redundant_vfmv(<2 x float> %arg0, <64 x float> %arg1, <64 x
 ; CHECK-NEXT:    vfredusum.vs v9, v12, v8
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vfmv.f.s fa5, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfredusum.vs v8, v16, v8
-; CHECK-NEXT:    vfmv.f.s fa5, v8
+; CHECK-NEXT:    vfredusum.vs v9, v16, v8
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vrgather.vi v8, v9, 0
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa5
+; CHECK-NEXT:    vfslide1up.vf v8, v9, fa5
 ; CHECK-NEXT:    ret
   %s0 = extractelement <2 x float> %arg0, i64 0
   %r0 = tail call reassoc float @llvm.vector.reduce.fadd.v64f32(float %s0, <64 x float> %arg1)
