
Commit 6c3db64

mshockwave, topperc, and lukel97 authored
[RISCV] Use slideup to lower build_vector when all operands are (extract_element X, 0) (#154450)

The general lowering of build_vector starts by splatting the first operand and then sliding the other operands down one by one. However, if every operand is an extract_element from element 0 of some vector, we can instead use the original _vector_ (the source of the extraction) of the last build_vector operand as the start value and slide the other operands up, in reverse order, one by one. This avoids the initial splat and eliminates the later vector-to-scalar movement, which is something we cannot do with vslidedown/vslide1down.

---------

Co-authored-by: Craig Topper <[email protected]>
Co-authored-by: Luke Lau <[email protected]>
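For intuition, here is a minimal C++ sketch (illustrative only, not part of the commit) contrasting the two orders for a four-element result whose values come from element 0 of source vectors. slide1down and slide1up below are simplified scalar models of vslide1down.vf/vslide1up.vf with VL equal to the vector length, and all names and values are made up:

#include <cassert>
#include <cstddef>
#include <vector>

// Simplified model of vslide1down: every element moves one slot toward index
// 0 and the scalar is inserted at the highest index.
static std::vector<int> slide1down(std::vector<int> v, int scalar) {
  for (std::size_t i = 0; i + 1 < v.size(); ++i)
    v[i] = v[i + 1];
  v.back() = scalar;
  return v;
}

// Simplified model of vslide1up: every element moves one slot toward the
// highest index and the scalar is inserted at index 0.
static std::vector<int> slide1up(std::vector<int> v, int scalar) {
  for (std::size_t i = v.size() - 1; i > 0; --i)
    v[i] = v[i - 1];
  v[0] = scalar;
  return v;
}

int main() {
  // Illustrative values: si is element 0 of source vector Vi (e.g. the
  // result vector of a reduction).
  int s0 = 10, s1 = 11, s2 = 12, s3 = 13;
  std::vector<int> V3 = {s3, 7, 7, 7}; // source vector of the last extract

  // Slide-down lowering: splat the first operand, then slide the rest down.
  std::vector<int> down = {s0, s0, s0, s0};
  down = slide1down(down, s1);
  down = slide1down(down, s2);
  down = slide1down(down, s3);

  // Slide-up lowering: start from V3 itself (no splat, no extract of s3)
  // and slide the remaining operands up in reverse order.
  std::vector<int> up = V3;
  up = slide1up(up, s2);
  up = slide1up(up, s1);
  up = slide1up(up, s0);

  assert(down == std::vector<int>({s0, s1, s2, s3}));
  assert(up == std::vector<int>({s0, s1, s2, s3}));
  return 0;
}

Both orders build the same vector; the slide-up order simply gets its start value from the last extraction's source vector, which is what lets the lowering drop the splat and the scalar move of the last operand.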
1 parent 3e5f49a commit 6c3db64

File tree

4 files changed: +444 -17 lines changed


llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 76 additions & 13 deletions
@@ -4518,41 +4518,104 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
 
   const unsigned Policy = RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC;
 
+  // General case: splat the first operand and slide other operands down one
+  // by one to form a vector. Alternatively, if every operand is an
+  // extraction from element 0 of a vector, we use that vector from the last
+  // extraction as the start value and slide up instead of slide down. Such that
+  // (1) we can avoid the initial splat (2) we can turn those vslide1up into
+  // vslideup of 1 later and eliminate the vector to scalar movement, which is
+  // something we cannot do with vslide1down/vslidedown.
+  // Of course, using vslide1up/vslideup might increase the register pressure,
+  // and that's why we conservatively limit to cases where every operand is an
+  // extraction from the first element.
+  SmallVector<SDValue> Operands(Op->op_begin(), Op->op_end());
+  SDValue EVec;
+  bool SlideUp = false;
+  auto getVSlide = [&](EVT ContainerVT, SDValue Passthru, SDValue Vec,
+                       SDValue Offset, SDValue Mask, SDValue VL) -> SDValue {
+    if (SlideUp)
+      return getVSlideup(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset,
+                         Mask, VL, Policy);
+    return getVSlidedown(DAG, Subtarget, DL, ContainerVT, Passthru, Vec, Offset,
+                         Mask, VL, Policy);
+  };
+
+  // The reason we don't use all_of here is because we're also capturing EVec
+  // from the last non-undef operand. If the std::execution_policy of the
+  // underlying std::all_of is anything but std::sequenced_policy we might
+  // capture the wrong EVec.
+  for (SDValue V : Operands) {
+    using namespace SDPatternMatch;
+    SlideUp = V.isUndef() || sd_match(V, m_ExtractElt(m_Value(EVec), m_Zero()));
+    if (!SlideUp)
+      break;
+  }
+
+  if (SlideUp) {
+    MVT EVecContainerVT = EVec.getSimpleValueType();
+    // Make sure the original vector has scalable vector type.
+    if (EVecContainerVT.isFixedLengthVector()) {
+      EVecContainerVT =
+          getContainerForFixedLengthVector(DAG, EVecContainerVT, Subtarget);
+      EVec = convertToScalableVector(EVecContainerVT, EVec, DAG, Subtarget);
+    }
+
+    // Adapt EVec's type into ContainerVT.
+    if (EVecContainerVT.getVectorMinNumElements() <
+        ContainerVT.getVectorMinNumElements())
+      EVec = DAG.getInsertSubvector(DL, DAG.getUNDEF(ContainerVT), EVec, 0);
+    else
+      EVec = DAG.getExtractSubvector(DL, ContainerVT, EVec, 0);
+
+    // Reverse the elements as we're going to slide up from the last element.
+    std::reverse(Operands.begin(), Operands.end());
+  }
+
   SDValue Vec;
   UndefCount = 0;
-  for (SDValue V : Op->ops()) {
+  for (SDValue V : Operands) {
     if (V.isUndef()) {
       UndefCount++;
       continue;
     }
 
-    // Start our sequence with a TA splat in the hopes that hardware is able to
-    // recognize there's no dependency on the prior value of our temporary
-    // register.
+    // Start our sequence with either a TA splat or extract source in the
+    // hopes that hardware is able to recognize there's no dependency on the
+    // prior value of our temporary register.
     if (!Vec) {
-      Vec = DAG.getSplatVector(VT, DL, V);
-      Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+      if (SlideUp) {
+        Vec = EVec;
+      } else {
+        Vec = DAG.getSplatVector(VT, DL, V);
+        Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+      }
+
       UndefCount = 0;
       continue;
     }
 
     if (UndefCount) {
       const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
-      Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
-                          Vec, Offset, Mask, VL, Policy);
+      Vec = getVSlide(ContainerVT, DAG.getUNDEF(ContainerVT), Vec, Offset, Mask,
+                      VL);
       UndefCount = 0;
     }
-    auto OpCode =
-      VT.isFloatingPoint() ? RISCVISD::VFSLIDE1DOWN_VL : RISCVISD::VSLIDE1DOWN_VL;
+
+    unsigned Opcode;
+    if (VT.isFloatingPoint())
+      Opcode = SlideUp ? RISCVISD::VFSLIDE1UP_VL : RISCVISD::VFSLIDE1DOWN_VL;
+    else
+      Opcode = SlideUp ? RISCVISD::VSLIDE1UP_VL : RISCVISD::VSLIDE1DOWN_VL;
+
     if (!VT.isFloatingPoint())
       V = DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getXLenVT(), V);
-    Vec = DAG.getNode(OpCode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
+    Vec = DAG.getNode(Opcode, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Vec,
                       V, Mask, VL);
   }
   if (UndefCount) {
     const SDValue Offset = DAG.getConstant(UndefCount, DL, Subtarget.getXLenVT());
-    Vec = getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
-                        Vec, Offset, Mask, VL, Policy);
+    Vec = getVSlide(ContainerVT, DAG.getUNDEF(ContainerVT), Vec, Offset, Mask,
+                    VL);
   }
   return convertFromScalableVector(VT, Vec, DAG, Subtarget);
 }
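Note how runs of undef operands are still collapsed into a single slide by UndefCount, now routed through the getVSlide helper so it becomes a vslideup when SlideUp is set; that is what produces the final vslideup.vi by 4 in the leading-undef test below. A small scalar model of that plain slide, under the same illustrative assumptions as the earlier sketch (the slideup helper and the values are hypothetical, not the actual getVSlideup implementation):

#include <cassert>
#include <cstddef>
#include <vector>

// Simplified model of an unmasked vslideup.vi by n with an undef passthru:
// element i of the source lands at index i + n; the low n elements keep
// whatever the (undef) passthru held, shown here as the placeholder -1.
static std::vector<int> slideup(const std::vector<int> &v, std::size_t n) {
  std::vector<int> out(v.size(), -1);
  for (std::size_t i = 0; i + n < v.size(); ++i)
    out[i + n] = v[i];
  return out;
}

int main() {
  // After the vslide1up steps, the four defined values sit in the low half;
  // one slide by the undef count (4) moves them into elements 4..7.
  std::vector<int> low = {14, 15, 16, 17, -1, -1, -1, -1};
  assert(slideup(low, 4) == std::vector<int>({-1, -1, -1, -1, 14, 15, 16, 17}));
  return 0;
}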

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll

Lines changed: 166 additions & 0 deletions
@@ -1828,3 +1828,169 @@ define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, d
   %v7 = insertelement <8 x double> %v6, double %e7, i64 7
   ret <8 x double> %v7
 }
+
+define <4 x float> @buildvec_vfredusum_slideup(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
+; CHECK-LABEL: buildvec_vfredusum_slideup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfmv.s.f v16, fa0
+; CHECK-NEXT:    vfredusum.vs v8, v8, v16
+; CHECK-NEXT:    vfredusum.vs v9, v10, v16
+; CHECK-NEXT:    vfredusum.vs v10, v12, v16
+; CHECK-NEXT:    vfmv.f.s fa5, v8
+; CHECK-NEXT:    vfmv.f.s fa4, v9
+; CHECK-NEXT:    vfmv.f.s fa3, v10
+; CHECK-NEXT:    vfredusum.vs v8, v14, v16
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vfslide1up.vf v9, v8, fa3
+; CHECK-NEXT:    vfslide1up.vf v10, v9, fa4
+; CHECK-NEXT:    vfslide1up.vf v8, v10, fa5
+; CHECK-NEXT:    ret
+  %247 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
+  %248 = insertelement <4 x float> poison, float %247, i64 0
+  %250 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
+  %251 = insertelement <4 x float> %248, float %250, i64 1
+  %252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
+  %253 = insertelement <4 x float> %251, float %252, i64 2
+  %254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
+  %255 = insertelement <4 x float> %253, float %254, i64 3
+  ret <4 x float> %255
+}
+
+define <8 x float> @buildvec_vfredusum_slideup_leading_undef(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
+; CHECK-LABEL: buildvec_vfredusum_slideup_leading_undef:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfmv.s.f v16, fa0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v16
+; CHECK-NEXT:    vfredusum.vs v10, v10, v16
+; CHECK-NEXT:    vfredusum.vs v11, v12, v16
+; CHECK-NEXT:    vfredusum.vs v8, v14, v16
+; CHECK-NEXT:    vfmv.f.s fa5, v9
+; CHECK-NEXT:    vfmv.f.s fa4, v10
+; CHECK-NEXT:    vfmv.f.s fa3, v11
+; CHECK-NEXT:    vfslide1up.vf v10, v8, fa3
+; CHECK-NEXT:    vfslide1up.vf v8, v10, fa4
+; CHECK-NEXT:    vfslide1up.vf v10, v8, fa5
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    ret
+  %252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
+  %253 = insertelement <8 x float> poison, float %252, i64 4
+  %254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
+  %255 = insertelement <8 x float> %253, float %254, i64 5
+  %256 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
+  %257 = insertelement <8 x float> %255, float %256, i64 6
+  %258 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
+  %259 = insertelement <8 x float> %257, float %258, i64 7
+  ret <8 x float> %259
+}
+
+define <8 x float> @buildvec_vfredusum_slideup_trailing_undef(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
+; CHECK-LABEL: buildvec_vfredusum_slideup_trailing_undef:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfmv.s.f v16, fa0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v16
+; CHECK-NEXT:    vfredusum.vs v10, v10, v16
+; CHECK-NEXT:    vfredusum.vs v11, v12, v16
+; CHECK-NEXT:    vfredusum.vs v8, v14, v16
+; CHECK-NEXT:    vfmv.f.s fa5, v9
+; CHECK-NEXT:    vfmv.f.s fa4, v10
+; CHECK-NEXT:    vfmv.f.s fa3, v11
+; CHECK-NEXT:    vfslide1up.vf v10, v8, fa3
+; CHECK-NEXT:    vfslide1up.vf v12, v10, fa4
+; CHECK-NEXT:    vfslide1up.vf v8, v12, fa5
+; CHECK-NEXT:    ret
+  %252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
+  %253 = insertelement <8 x float> poison, float %252, i64 0
+  %254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
+  %255 = insertelement <8 x float> %253, float %254, i64 1
+  %256 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
+  %257 = insertelement <8 x float> %255, float %256, i64 2
+  %258 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
+  %259 = insertelement <8 x float> %257, float %258, i64 3
+  ret <8 x float> %259
+}
+
+; Negative test case checking if we generate slideup only when all build_vec operands are extraction from the first vector element.
+define <8 x float> @buildvec_vfredusum_slideup_not_extract_first(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
+; CHECK-LABEL: buildvec_vfredusum_slideup_not_extract_first:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfmv.s.f v10, fa0
+; CHECK-NEXT:    vfredusum.vs v8, v8, v10
+; CHECK-NEXT:    vfredusum.vs v9, v12, v10
+; CHECK-NEXT:    vfredusum.vs v10, v14, v10
+; CHECK-NEXT:    vfmv.f.s fa5, v9
+; CHECK-NEXT:    vfmv.f.s fa4, v10
+; CHECK-NEXT:    vrgather.vi v10, v8, 0
+; CHECK-NEXT:    vfslide1down.vf v8, v10, fa0
+; CHECK-NEXT:    vfslide1down.vf v8, v8, fa5
+; CHECK-NEXT:    vfslide1down.vf v8, v8, fa4
+; CHECK-NEXT:    vslidedown.vi v8, v8, 4
+; CHECK-NEXT:    ret
+  %252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
+  %253 = insertelement <8 x float> poison, float %252, i64 0
+  %255 = insertelement <8 x float> %253, float %start, i64 1
+  %256 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
+  %257 = insertelement <8 x float> %255, float %256, i64 2
+  %258 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
+  %259 = insertelement <8 x float> %257, float %258, i64 3
+  ret <8 x float> %259
+}
+
+define <8 x float> @buildvec_vfredusum_slideup_mid_undef(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
+; CHECK-LABEL: buildvec_vfredusum_slideup_mid_undef:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfmv.s.f v16, fa0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v16
+; CHECK-NEXT:    vfredusum.vs v10, v10, v16
+; CHECK-NEXT:    vfredusum.vs v11, v12, v16
+; CHECK-NEXT:    vfredusum.vs v8, v14, v16
+; CHECK-NEXT:    vfmv.f.s fa5, v9
+; CHECK-NEXT:    vfmv.f.s fa4, v10
+; CHECK-NEXT:    vfmv.f.s fa3, v11
+; CHECK-NEXT:    vfslide1up.vf v10, v8, fa3
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vfslide1up.vf v10, v8, fa4
+; CHECK-NEXT:    vfslide1up.vf v8, v10, fa5
+; CHECK-NEXT:    ret
+  %252 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
+  %253 = insertelement <8 x float> poison, float %252, i64 0
+  %254 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
+  %255 = insertelement <8 x float> %253, float %254, i64 1
+  %256 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
+  %257 = insertelement <8 x float> %255, float %256, i64 6
+  %258 = tail call reassoc float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
+  %259 = insertelement <8 x float> %257, float %258, i64 7
+  ret <8 x float> %259
+}
+
+define <4 x float> @buildvec_vfredosum_slideup(float %start, <8 x float> %arg1, <8 x float> %arg2, <8 x float> %arg3, <8 x float> %arg4) nounwind {
+; CHECK-LABEL: buildvec_vfredosum_slideup:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vfmv.s.f v16, fa0
+; CHECK-NEXT:    vfredosum.vs v8, v8, v16
+; CHECK-NEXT:    vfredosum.vs v9, v10, v16
+; CHECK-NEXT:    vfredosum.vs v10, v12, v16
+; CHECK-NEXT:    vfmv.f.s fa5, v8
+; CHECK-NEXT:    vfmv.f.s fa4, v9
+; CHECK-NEXT:    vfmv.f.s fa3, v10
+; CHECK-NEXT:    vfredosum.vs v8, v14, v16
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vfslide1up.vf v9, v8, fa3
+; CHECK-NEXT:    vfslide1up.vf v10, v9, fa4
+; CHECK-NEXT:    vfslide1up.vf v8, v10, fa5
+; CHECK-NEXT:    ret
+  %247 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg1)
+  %248 = insertelement <4 x float> poison, float %247, i64 0
+  %250 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg2)
+  %251 = insertelement <4 x float> %248, float %250, i64 1
+  %252 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg3)
+  %253 = insertelement <4 x float> %251, float %252, i64 2
+  %254 = tail call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %arg4)
+  %255 = insertelement <4 x float> %253, float %254, i64 3
+  ret <4 x float> %255
+}
