Skip to content

Commit 102b917

Browse files
committed
[InterleavedAccess] Construct interleaved access store with shuffles
- [AArch64]: With additional shuffles, an interleaved access store can handle interleave factors larger than the target's maximum supported interleave factor.
1 parent a9633aa commit 102b917

File tree

9 files changed

+600
-60
lines changed

9 files changed

+600
-60
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3232,6 +3232,9 @@ class LLVM_ABI TargetLoweringBase {
32323232
/// Default to be the minimum interleave factor: 2.
32333233
virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }
32343234

3235+
/// Return true if the target can lower interleaved stores whose factor
/// exceeds the maximum supported interleave factor by adding shuffles.
3236+
virtual bool hasInterleaveWithGatherScatter() const { return false; }
3237+
32353238
/// Lower an interleaved load to target specific intrinsics. Return
32363239
/// true on success.
32373240
///

llvm/lib/CodeGen/InterleavedAccessPass.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,8 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
239239
/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
240240
/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
241241
static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
242-
unsigned MaxFactor) {
242+
unsigned MaxFactor,
243+
bool InterleaveWithShuffles) {
243244
unsigned NumElts = SVI->getShuffleMask().size();
244245
if (NumElts < 4)
245246
return false;
@@ -250,6 +251,13 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
250251
return true;
251252
}
252253

254+
if (InterleaveWithShuffles) {
255+
for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) {
256+
Factor = i * MaxFactor;
257+
if (SVI->isInterleave(Factor))
258+
return true;
259+
}
260+
}
253261
return false;
254262
}
255263

@@ -528,7 +536,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
528536
cast<FixedVectorType>(SVI->getType())->getNumElements();
529537
// Check if the shufflevector is RE-interleave shuffle.
530538
unsigned Factor;
531-
if (!isReInterleaveMask(SVI, Factor, MaxFactor))
539+
if (!isReInterleaveMask(SVI, Factor, MaxFactor,
540+
TLI->hasInterleaveWithGatherScatter()))
532541
return false;
533542
assert(NumStoredElements % Factor == 0 &&
534543
"number of stored element should be a multiple of Factor");

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 151 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@
9797
#include <cctype>
9898
#include <cstdint>
9999
#include <cstdlib>
100+
#include <deque>
100101
#include <iterator>
101102
#include <limits>
102103
#include <optional>
@@ -18041,12 +18042,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
1804118042
ShuffleVectorInst *SVI,
1804218043
unsigned Factor,
1804318044
const APInt &GapMask) const {
18044-
18045-
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18046-
"Invalid interleave factor");
18045+
assert(Factor >= 2 && "Invalid interleave factor");
1804718046
auto *SI = dyn_cast<StoreInst>(Store);
1804818047
if (!SI)
1804918048
return false;
18049+
18050+
if (Factor > getMaxSupportedInterleaveFactor())
18051+
return lowerInterleavedStoreWithShuffle(SI, SVI, Factor);
18052+
1805018053
assert(!LaneMask && GapMask.popcount() == Factor &&
1805118054
"Unexpected mask on store");
1805218055

@@ -18192,6 +18195,151 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
1819218195
return true;
1819318196
}
1819418197

18198+
/// If the interleave factor is greater than the supported MaxFactor, the
18199+
/// data can first be interleaved with additional shuffles and then stored
18200+
/// with several stN instructions of a supported factor.
18201+
///
18202+
/// Consider the following data with 8 interleaved vectors, which are shuffled
18203+
/// so that stN instructions can store them. Data must be stored in this order:
18204+
/// [v0, v1, v2, v3, v4, v5, v6, v7]
18205+
///
18206+
/// v0 v4 v2 v6 v1 v5 v3 v7
18207+
/// | | | | | | | |
18208+
/// \ / \ / \ / \ /
18209+
/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7] ==> stN = 4
18210+
/// | | | |
18211+
/// \ / \ /
18212+
/// \ / \ /
18213+
/// \ / \ /
18214+
/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2
18215+
///
18216+
/// For stN = 4, upper half of interleaved data V0, V1, V2, V3 is stored
18217+
/// with one st4 instruction. Lower half, i.e., V4, V5, V6, V7 is stored with
18218+
/// another st4.
18219+
///
18220+
/// For stN = 2, upper half of interleaved data V0, V1 is stored
18221+
/// with one st2 instruction. Second set V2, V3 is stored with another st2.
18222+
/// Total of 4 st2's are required here.
18223+
/// Lower an interleaved store of Factor sub-vectors by emitting extra
/// shufflevectors followed by Factor / getMaxSupportedInterleaveFactor()
/// structured-store calls.
///
/// SI supplies the base address, the pointer type and the insertion point.
/// SVI is the re-interleave shuffle whose operand tree is walked to collect
/// the sub-vectors. Factor must divide SVI's element count (asserted).
///
/// Returns true once the calls have been emitted. Returns false, before any
/// instruction is emitted, when NEON is unavailable, the sub-vector type is
/// not a legal interleaved access type, a scalable lowering would be needed,
/// or the operand tree does not have the expected shape.
bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
18224+
StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
18225+
unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();
18226+
18227+
auto *VecTy = cast<FixedVectorType>(SVI->getType());
18228+
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
18229+
18230+
// LaneLen is the element count of each of the Factor sub-vectors.
unsigned LaneLen = VecTy->getNumElements() / Factor;
18231+
Type *EltTy = VecTy->getElementType();
18232+
auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
18233+
18234+
const DataLayout &DL = SI->getModule()->getDataLayout();
18235+
// Written by isLegalInterleavedAccessType below; only read after that call.
bool UseScalable;
18236+
18237+
// Skip if we do not have NEON and skip illegal vector types. We can
18238+
// "legalize" wide vector types into multiple interleaved accesses as long as
18239+
// the vector types are divisible by 128.
18240+
if (!Subtarget->hasNEON() ||
18241+
!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
18242+
return false;
18243+
18244+
// Only the fixed-width lowering is implemented here.
if (UseScalable)
18245+
return false;
18246+
18247+
// Work queue for a level-by-level walk of SVI's operand tree. Each level
// pops every queued value and pushes two replacements at the back.
std::deque<Value *> Shuffles;
18248+
Shuffles.push_back(SVI);
18249+
// Both counters are halved after each level; the walk stops when
// ConcatLevel reaches 1.
unsigned ConcatLevel = Factor;
18250+
unsigned ConcatElt = Factor * LaneLen;
18251+
// Getting all the interleaved operands.
18252+
while (ConcatLevel > 1) {
18253+
// Snapshot the size so that only values queued before this level are
// processed in this pass.
unsigned InterleavedOperands = Shuffles.size();
18254+
for (unsigned Ops = 0; Ops < InterleavedOperands; Ops++) {
18255+
auto *V = Shuffles.front();
18256+
Shuffles.pop_front();
18257+
// A zeroinitializer/poison value has no operands to descend into, so it
// is replaced by two constants of the same kind with half the elements.
if (isa<ConstantAggregateZero, PoisonValue>(V)) {
18258+
VectorType *Ty = cast<VectorType>(V->getType());
18259+
auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
18260+
Value *SplitValue = nullptr;
18261+
if (isa<ConstantAggregateZero>(V))
18262+
SplitValue = ConstantAggregateZero::get(HalfTy);
18263+
else
18264+
SplitValue = PoisonValue::get(HalfTy);
18265+
18266+
Shuffles.push_back(SplitValue);
18267+
Shuffles.push_back(SplitValue);
18268+
continue;
18269+
}
18270+
18271+
// Anything else must be a shufflevector; otherwise give up.
ShuffleVectorInst *SFL = dyn_cast<ShuffleVectorInst>(V);
18272+
if (!SFL)
18273+
return false;
18274+
// Every shuffle other than SVI itself must pass isIdentityMask with
// ConcatElt as the element count. Presumably this recognises the
// concatenation shuffles that build SVI's operands — TODO confirm.
if (SVI != SFL && !SFL->isIdentityMask(SFL->getShuffleMask(), ConcatElt))
18275+
return false;
18276+
18277+
Value *Op0 = SFL->getOperand(0);
18278+
Value *Op1 = SFL->getOperand(1);
18279+
18280+
// NOTE(review): Op0/Op1 are already Value*, so these dyn_casts look
// redundant — confirm.
Shuffles.push_back(dyn_cast<Value>(Op0));
18281+
Shuffles.push_back(dyn_cast<Value>(Op1));
18282+
}
18283+
ConcatLevel >>= 1;
18284+
ConcatElt >>= 1;
18285+
}
18286+
18287+
// All early exits are above this point; new instructions are inserted
// before SI from here on.
IRBuilder<> Builder(SI);
18288+
// Split the factor-2 interleave mask into its first LaneLen entries
// (LowerHalfMask) and its next LaneLen entries (UpperHalfMask).
auto Mask = createInterleaveMask(LaneLen, 2);
18289+
SmallVector<int, 16> UpperHalfMask(LaneLen), LowerHalfMask(LaneLen);
18290+
for (unsigned Idx = 0; Idx < LaneLen; Idx++) {
18291+
LowerHalfMask[Idx] = Mask[Idx];
18292+
UpperHalfMask[Idx] = Mask[Idx + LaneLen];
18293+
}
18294+
18295+
// Each round pairs Shuffles[i] with Shuffles[i + InterleaveFactor] inside
// groups of 2 * InterleaveFactor entries and emits two shuffles per pair,
// one per half mask. InterleaveFactor halves every round and the loop ends
// once it drops below MaxSupportedFactor.
unsigned InterleaveFactor = Factor >> 1;
18296+
while (InterleaveFactor >= MaxSupportedFactor) {
18297+
std::deque<Value *> ShufflesIntermediate;
18298+
// Sized up front so results can be stored by index.
ShufflesIntermediate.resize(Factor);
18299+
for (unsigned Idx = 0; Idx < Factor; Idx += (InterleaveFactor * 2)) {
18300+
for (unsigned GroupIdx = 0; GroupIdx < InterleaveFactor; GroupIdx++) {
18301+
assert(Shuffles[Idx + GroupIdx]->getType() == SubVecTy &&
18302+
Shuffles[Idx + GroupIdx + InterleaveFactor]->getType() ==
18303+
SubVecTy &&
18304+
"Type of interleaving candidates are not matching\n");
18305+
// The lower-half result takes the slot of the first input of the pair.
auto *Shuffle = Builder.CreateShuffleVector(
18306+
Shuffles[Idx + GroupIdx],
18307+
Shuffles[Idx + GroupIdx + InterleaveFactor], LowerHalfMask);
18308+
ShufflesIntermediate[Idx + GroupIdx] = Shuffle;
18309+
// The upper-half result takes the slot of the second input of the pair.
Shuffle = Builder.CreateShuffleVector(
18310+
Shuffles[Idx + GroupIdx],
18311+
Shuffles[Idx + GroupIdx + InterleaveFactor], UpperHalfMask);
18312+
ShufflesIntermediate[Idx + GroupIdx + InterleaveFactor] = Shuffle;
18313+
}
18314+
}
18315+
Shuffles = ShufflesIntermediate;
18316+
InterleaveFactor >>= 1;
18317+
}
18318+
18319+
Type *PtrTy = SI->getPointerOperandType();
18320+
// Built from the same element type and LaneLen as SubVecTy.
auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
18321+
18322+
Value *BaseAddr = SI->getPointerOperand();
18323+
Function *StNFunc = getStructuredStoreFunction(
18324+
SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy);
18325+
// Emit Factor / MaxSupportedFactor calls; each takes MaxSupportedFactor
// consecutive entries of Shuffles followed by the address.
for (unsigned N = 0; N < (Factor / MaxSupportedFactor); N++) {
18326+
SmallVector<Value *, 5> Ops;
18327+
for (unsigned OpIdx = 0; OpIdx < MaxSupportedFactor; OpIdx++)
18328+
Ops.push_back(Shuffles[N * MaxSupportedFactor + OpIdx]);
18329+
18330+
if (N > 0) {
18331+
// Advance BaseAddr by LaneLen * MaxSupportedFactor elements of the
18332+
// scalar element type. BaseAddr is overwritten, so each GEP is relative
18333+
// to the address used by the previous call, not the original base.
18334+
BaseAddr = Builder.CreateConstGEP1_32(
18335+
SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor);
18336+
}
18337+
// The address is cast to the original store's pointer operand type.
Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
18338+
Builder.CreateCall(StNFunc, Ops);
18339+
}
18340+
return true;
18341+
}
18342+
1819518343
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
1819618344
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
1819718345
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,8 @@ class AArch64TargetLowering : public TargetLowering {
229229

230230
bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override;
231231

232+
/// Always returns true for AArch64.
/// NOTE(review): callers in this change use the result to enable the
/// shuffle-based lowering of wide interleaved stores, while the name refers
/// to gather/scatter — confirm the intended name.
bool hasInterleaveWithGatherScatter() const override { return true; }
233+
232234
/// AArch64 reports a maximum supported interleave factor of 4.
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
233235

234236
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
@@ -239,6 +241,9 @@ class AArch64TargetLowering : public TargetLowering {
239241
ShuffleVectorInst *SVI, unsigned Factor,
240242
const APInt &GapMask) const override;
241243

244+
/// Lower the interleaved store \p SI fed by the shuffle \p SVI when
/// \p Factor exceeds getMaxSupportedInterleaveFactor(); lowerInterleavedStore
/// forwards to this in that case. Returns true on success.
bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI,
245+
unsigned Factor) const;
246+
242247
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
243248
IntrinsicInst *DI) const override;
244249

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4932,19 +4932,39 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
49324932
if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
49334933
return InstructionCost::getInvalid();
49344934

4935-
if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4935+
unsigned NumLoadStores = 1;
4936+
InstructionCost ShuffleCost = 0;
4937+
bool isInterleaveWithShuffle = false;
4938+
unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor();
4939+
4940+
auto *SubVecTy =
4941+
VectorType::get(VecVTy->getElementType(),
4942+
VecVTy->getElementCount().divideCoefficientBy(Factor));
4943+
4944+
if (TLI->hasInterleaveWithGatherScatter() && Opcode == Instruction::Store &&
4945+
(0 == Factor % MaxSupportedFactor) && Factor > MaxSupportedFactor) {
4946+
isInterleaveWithShuffle = true;
4947+
4948+
NumLoadStores = Factor / MaxSupportedFactor;
4949+
ShuffleCost =
4950+
(Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy,
4951+
{}, CostKind, 0, SubVecTy));
4952+
}
4953+
4954+
if (!UseMaskForGaps &&
4955+
(Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) {
49364956
unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4937-
auto *SubVecTy =
4938-
VectorType::get(VecVTy->getElementType(),
4939-
VecVTy->getElementCount().divideCoefficientBy(Factor));
49404957

49414958
// ldN/stN only support legal vector types of size 64 or 128 in bits.
49424959
// Accesses having vector types that are a multiple of 128 bits can be
49434960
// matched to more than one ldN/stN instruction.
49444961
bool UseScalable;
49454962
if (MinElts % Factor == 0 &&
49464963
TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4947-
return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
4964+
return (Factor *
4965+
TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) *
4966+
NumLoadStores) +
4967+
ShuffleCost;
49484968
}
49494969

49504970
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,

0 commit comments

Comments
 (0)