Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions llvm/lib/Analysis/VectorUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -504,25 +504,26 @@ void llvm::processShuffleMasks(
unsigned SzSrc = Sz / NumOfSrcRegs;
for (unsigned I = 0; I < NumOfDestRegs; ++I) {
auto &RegMasks = Res[I];
RegMasks.assign(NumOfSrcRegs, {});
RegMasks.assign(2 * NumOfSrcRegs, {});
// Check that the values in dest registers are in the one src
// register.
for (unsigned K = 0; K < SzDest; ++K) {
int Idx = I * SzDest + K;
if (Idx == Sz)
break;
if (Mask[Idx] >= Sz || Mask[Idx] == PoisonMaskElem)
if (Mask[Idx] >= 2 * Sz || Mask[Idx] == PoisonMaskElem)
continue;
int SrcRegIdx = Mask[Idx] / SzSrc;
int MaskIdx = Mask[Idx] % Sz;
int SrcRegIdx = MaskIdx / SzSrc + (Mask[Idx] >= Sz ? NumOfSrcRegs : 0);
// Add a cost of PermuteTwoSrc for each new source register permute,
// if we have more than one source register.
if (RegMasks[SrcRegIdx].empty())
RegMasks[SrcRegIdx].assign(SzDest, PoisonMaskElem);
RegMasks[SrcRegIdx][K] = Mask[Idx] % SzSrc;
RegMasks[SrcRegIdx][K] = MaskIdx % SzSrc;
}
}
// Process split mask.
for (unsigned I = 0; I < NumOfUsedRegs; ++I) {
for (unsigned I : seq<unsigned>(NumOfUsedRegs)) {
auto &Dest = Res[I];
int NumSrcRegs =
count_if(Dest, [](ArrayRef<int> Mask) { return !Mask.empty(); });
Expand Down Expand Up @@ -567,7 +568,7 @@ void llvm::processShuffleMasks(
int FirstIdx = -1;
SecondIdx = -1;
MutableArrayRef<int> FirstMask, SecondMask;
for (unsigned I = 0; I < NumOfDestRegs; ++I) {
for (unsigned I : seq<unsigned>(2 * NumOfSrcRegs)) {
SmallVectorImpl<int> &RegMask = Dest[I];
if (RegMask.empty())
continue;
Expand Down
93 changes: 52 additions & 41 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5081,7 +5081,6 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
SDValue V1 = SVN->getOperand(0);
SDValue V2 = SVN->getOperand(1);
ArrayRef<int> Mask = SVN->getMask();
unsigned NumElts = VT.getVectorNumElements();

// If we don't know exact data layout, not much we can do. If this
// is already m1 or smaller, no point in splitting further.
Expand All @@ -5098,58 +5097,70 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,

MVT ElemVT = VT.getVectorElementType();
unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
unsigned VRegsPerSrc = NumElts / ElemsPerVReg;

SmallVector<std::pair<int, SmallVector<int>>>
OutMasks(VRegsPerSrc, {-1, {}});

// Check if our mask can be done as a 1-to-1 mapping from source
// to destination registers in the group without needing to
// write each destination more than once.
for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx++) {
int DstVecIdx = DstIdx / ElemsPerVReg;
int DstSubIdx = DstIdx % ElemsPerVReg;
int SrcIdx = Mask[DstIdx];
if (SrcIdx < 0 || (unsigned)SrcIdx >= 2 * NumElts)
continue;
int SrcVecIdx = SrcIdx / ElemsPerVReg;
int SrcSubIdx = SrcIdx % ElemsPerVReg;
if (OutMasks[DstVecIdx].first == -1)
OutMasks[DstVecIdx].first = SrcVecIdx;
if (OutMasks[DstVecIdx].first != SrcVecIdx)
// Note: This case could easily be handled by keeping track of a chain
// of source values and generating two element shuffles below. This is
// less an implementation question, and more a profitability one.
return SDValue();

OutMasks[DstVecIdx].second.resize(ElemsPerVReg, -1);
OutMasks[DstVecIdx].second[DstSubIdx] = SrcSubIdx;
}

EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
assert(M1VT == getLMUL1VT(M1VT));
unsigned NumOpElts = M1VT.getVectorMinNumElements();
SDValue Vec = DAG.getUNDEF(ContainerVT);
unsigned NormalizedVF = ContainerVT.getVectorMinNumElements();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think "VF" is a term used much outside of the vectorizers. Maybe use something like `ContainerNumOpElts` instead?

unsigned NumOfSrcRegs = NormalizedVF / NumOpElts;
unsigned NumOfDestRegs = NormalizedVF / NumOpElts;
// The following semantically builds up a fixed length concat_vector
// of the component shuffle_vectors. We eagerly lower to scalable here
// to avoid DAG combining it back to a large shuffle_vector again.
V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
for (unsigned DstVecIdx = 0 ; DstVecIdx < OutMasks.size(); DstVecIdx++) {
auto &[SrcVecIdx, SrcSubMask] = OutMasks[DstVecIdx];
if (SrcVecIdx == -1)
SmallVector<SDValue> SubRegs(NumOfDestRegs);
unsigned RegCnt = 0;
unsigned PrevCnt = 0;
processShuffleMasks(
Mask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs,
[&]() {
PrevCnt = RegCnt;
++RegCnt;
},
[&, &DAG = DAG](ArrayRef<int> SrcSubMask, unsigned SrcVecIdx,
unsigned DstVecIdx) {
SDValue SrcVec = SrcVecIdx >= NumOfSrcRegs ? V2 : V1;
unsigned ExtractIdx = (SrcVecIdx % NumOfSrcRegs) * NumOpElts;
SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
DAG.getVectorIdxConstant(ExtractIdx, DL));
SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
SubRegs[RegCnt] = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
PrevCnt = RegCnt;
++RegCnt;
},
[&, &DAG = DAG](ArrayRef<int> SrcSubMask, unsigned Idx1, unsigned Idx2) {
if (PrevCnt + 1 == RegCnt)
++RegCnt;
SDValue SubVec1 = SubRegs[PrevCnt + 1];
if (!SubVec1) {
SDValue SrcVec = Idx1 >= NumOfSrcRegs ? V2 : V1;
unsigned ExtractIdx = (Idx1 % NumOfSrcRegs) * NumOpElts;
SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
DAG.getVectorIdxConstant(ExtractIdx, DL));
}
SubVec1 = convertFromScalableVector(OneRegVT, SubVec1, DAG, Subtarget);
SDValue SrcVec = Idx2 >= NumOfSrcRegs ? V2 : V1;
unsigned ExtractIdx = (Idx2 % NumOfSrcRegs) * NumOpElts;
SDValue SubVec2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
DAG.getVectorIdxConstant(ExtractIdx, DL));
SubVec2 = convertFromScalableVector(OneRegVT, SubVec2, DAG, Subtarget);
SubVec1 =
DAG.getVectorShuffle(OneRegVT, DL, SubVec1, SubVec2, SrcSubMask);
SubVec1 = convertToScalableVector(M1VT, SubVec1, DAG, Subtarget);
SubRegs[PrevCnt + 1] = SubVec1;
});
assert(RegCnt == NumOfDestRegs && "Whole vector must be processed");
SDValue Vec = DAG.getUNDEF(ContainerVT);
for (auto [I, V] : enumerate(SubRegs)) {
if (!V)
continue;
unsigned ExtractIdx = (SrcVecIdx % VRegsPerSrc) * NumOpElts;
SDValue SrcVec = (unsigned)SrcVecIdx >= VRegsPerSrc ? V2 : V1;
SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
DAG.getVectorIdxConstant(ExtractIdx, DL));
SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
SubVec = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
unsigned InsertIdx = DstVecIdx * NumOpElts;
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, SubVec,
unsigned InsertIdx = I * NumOpElts;

Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, V,
DAG.getVectorIdxConstant(InsertIdx, DL));
}
return convertFromScalableVector(VT, Vec, DAG, Subtarget);
Expand Down
99 changes: 99 additions & 0 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,105 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// give a more accurate cost than falling back to generic scalable codegen.
// TODO: Each of these cases hints at a modeling gap around scalable vectors.
if (isa<FixedVectorType>(Tp)) {
MVT LegalVT = LT.second;
InstructionCost NumOfDests = LT.first;
if (ST->hasVInstructions() &&
LT.second.getSizeInBits().getFixedValue() >
ST->getRealVLen().value_or(UINT_MAX) &&
!Mask.empty() && NumOfDests.isValid() && NumOfDests > 1 &&
LegalVT.isFixedLengthVector() &&
LegalVT.getVectorElementType().getSizeInBits() ==
Tp->getElementType()->getPrimitiveSizeInBits() &&
LegalVT.getVectorNumElements() <
Tp->getElementCount().getFixedValue()) {
unsigned VecTySize = DL.getTypeStoreSize(Tp);
unsigned LegalVTSize = LegalVT.getStoreSize();
// Number of source vectors after legalization:
unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
// Number of destination vectors after legalization:

auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
LegalVT.getVectorNumElements());

// Try to perform better estimation of the permutation.
// 1. Split the source/destination vectors into real registers.
// 2. Do the mask analysis to identify which real registers are
// permuted. If more than one source register is used to build the
// destination register, the cost for this destination register
// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
// source register is used, build the mask and calculate the cost as the
// cost of PermuteSingleSrc.
// Also, for the single register permute we try to identify if the
// destination register is just a copy of the source register or the
// copy of the previous destination register (the cost is
// TTI::TCC_Basic). If the source register is just reused, the cost for
// this operation is 0.
NumOfDests = getTypeLegalizationCost(
FixedVectorType::get(Tp->getElementType(), Mask.size()))
.first;
unsigned E = *NumOfDests.getValue();
unsigned NormalizedVF =
LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
copy(Mask, NormalizedMask.begin());
unsigned PrevSrcReg = 0;
ArrayRef<int> PrevRegMask;
InstructionCost Cost = 0;
SmallBitVector ExtractedRegs(2 * NumOfSrcRegs);
processShuffleMasks(
NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
[&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
if (ExtractedRegs.test(SrcReg)) {
Cost += getShuffleCost(TTI::SK_ExtractSubvector, Tp, {}, CostKind,
(SrcReg % NumOfSrcRegs) *
SingleOpTy->getNumElements(),
SingleOpTy);
ExtractedRegs.set(SrcReg);
}
if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
// Check if the previous register can be just copied to the next
// one.
if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
PrevRegMask != RegMask) {
Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
RegMask, CostKind, 0, nullptr);
} else {
// Just a copy of previous destination register.
Cost += TTI::TCC_Basic;
}
return;
}
if (SrcReg != DestReg &&
any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
// Just a copy of the source register.
Cost += TTI::TCC_Basic;
}
PrevSrcReg = SrcReg;
PrevRegMask = RegMask;
ExtractedRegs.set(DestReg);
},
[&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2) {
if (ExtractedRegs.test(Idx1)) {
Cost += getShuffleCost(TTI::SK_ExtractSubvector, Tp, {}, CostKind,
(Idx1 % NumOfSrcRegs) *
SingleOpTy->getNumElements(),
SingleOpTy);
ExtractedRegs.set(Idx1);
}
if (ExtractedRegs.test(Idx2)) {
Cost += getShuffleCost(TTI::SK_ExtractSubvector, Tp, {}, CostKind,
(Idx2 % NumOfSrcRegs) *
SingleOpTy->getNumElements(),
SingleOpTy);
ExtractedRegs.set(Idx2);
}
Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
CostKind, 0, nullptr);
});
return Cost;
}
switch (Kind) {
default:
break;
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Analysis/CostModel/X86/shuffle-splat-codesize.ll
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
; SSE-LABEL: 'test_upper_vXf32'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Analysis/CostModel/X86/shuffle-splat-latency.ll
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
; SSE-LABEL: 'test_upper_vXf32'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
; SSE-LABEL: 'test_upper_vXf32'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Analysis/CostModel/X86/shuffle-splat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ define void @test_upper_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a
; SSE-LABEL: 'test_upper_vXf32'
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,11 @@ define <4 x i64> @m2_splat_into_slide_two_source_v2_lo(<4 x i64> %v1, <4 x i64>
define <4 x i64> @m2_splat_into_slide_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) {
; CHECK-LABEL: m2_splat_into_slide_two_source:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.i v0, 12
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v13, v10, 1
; CHECK-NEXT: vslideup.vi v13, v11, 1
; CHECK-NEXT: vrgather.vi v12, v8, 0
; CHECK-NEXT: vslideup.vi v12, v10, 1, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: ret
%res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> <i32 0, i32 0, i32 5, i32 6>
ret <4 x i64> %res
Expand Down