Skip to content

Commit 0efaebf

Browse files
committed
[InterleavedAccess] Construct interleaved access store with shuffles
- [AArch64]: Interleaved stores can now handle interleave factors larger than the target's maximum supported interleave factor by using additional shuffles.
1 parent a22834a commit 0efaebf

File tree

8 files changed

+560
-18
lines changed

8 files changed

+560
-18
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3232,6 +3232,9 @@ class LLVM_ABI TargetLoweringBase {
32323232
/// Default to be the minimum interleave factor: 2.
32333233
virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }
32343234

3235+
/// Return true if the target can lower interleaved stores whose factor
/// exceeds getMaxSupportedInterleaveFactor() by adding extra shuffles.
/// Defaults to false.
/// NOTE(review): the name says "GatherScatter", but the visible callers use
/// this hook only to enable shuffle-based lowering of interleaved stores;
/// consider renaming so the name matches the behaviour.
3236+
virtual bool hasInterleaveWithGatherScatter() const { return false; }
3237+
32353238
/// Lower an interleaved load to target specific intrinsics. Return
32363239
/// true on success.
32373240
///

llvm/lib/CodeGen/InterleavedAccessPass.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,8 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
239239
/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
240240
/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
241241
static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
242-
unsigned MaxFactor) {
242+
unsigned MaxFactor,
243+
bool InterleaveWithShuffles) {
243244
unsigned NumElts = SVI->getShuffleMask().size();
244245
if (NumElts < 4)
245246
return false;
@@ -250,6 +251,13 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
250251
return true;
251252
}
252253

254+
if (InterleaveWithShuffles) {
255+
for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) {
256+
Factor = i * MaxFactor;
257+
if (SVI->isInterleave(Factor))
258+
return true;
259+
}
260+
}
253261
return false;
254262
}
255263

@@ -528,7 +536,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
528536
cast<FixedVectorType>(SVI->getType())->getNumElements();
529537
// Check if the shufflevector is RE-interleave shuffle.
530538
unsigned Factor;
531-
if (!isReInterleaveMask(SVI, Factor, MaxFactor))
539+
if (!isReInterleaveMask(SVI, Factor, MaxFactor,
540+
TLI->hasInterleaveWithGatherScatter()))
532541
return false;
533542
assert(NumStoredElements % Factor == 0 &&
534543
"number of stored element should be a multiple of Factor");

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 146 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@
9797
#include <cctype>
9898
#include <cstdint>
9999
#include <cstdlib>
100+
#include <deque>
100101
#include <iterator>
101102
#include <limits>
102103
#include <optional>
@@ -18010,12 +18011,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
1801018011
ShuffleVectorInst *SVI,
1801118012
unsigned Factor,
1801218013
const APInt &GapMask) const {
18013-
18014-
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18015-
"Invalid interleave factor");
18014+
assert(Factor >= 2 && "Invalid interleave factor");
1801618015
auto *SI = dyn_cast<StoreInst>(Store);
1801718016
if (!SI)
1801818017
return false;
18018+
18019+
if (Factor > getMaxSupportedInterleaveFactor())
18020+
return lowerInterleavedStoreWithShuffle(SI, SVI, Factor);
18021+
1801918022
assert(!LaneMask && GapMask.popcount() == Factor &&
1802018023
"Unexpected mask on store");
1802118024

@@ -18161,6 +18164,146 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
1816118164
return true;
1816218165
}
1816318166

18167+
/// If the interleaved vector elements are greater than supported MaxFactor,
18168+
/// interleaving the data with additional shuffles can be used to
18169+
/// achieve the same.
18170+
///
18171+
/// Consider the following data with 8 interleaves which are shuffled to store
18172+
/// stN instructions. Data needs to be stored in this order:
18173+
/// [v0, v1, v2, v3, v4, v5, v6, v7]
18174+
///
18175+
/// v0 v4 v2 v6 v1 v5 v3 v7
18176+
/// | | | | | | | |
18177+
/// \ / \ / \ / \ /
18178+
/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7] ==> stN = 4
18179+
/// | | | |
18180+
/// \ / \ /
18181+
/// \ / \ /
18182+
/// \ / \ /
18183+
/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2
18184+
///
18185+
/// For stN = 4, upper half of interleaved data V0, V1, V2, V3 is stored
18186+
/// with one st4 instruction. Lower half, i.e, V4, V5, V6, V7 is stored with
18187+
/// another st4.
18188+
///
18189+
/// For stN = 2, upper half of interleaved data V0, V1 is stored
18190+
/// with one st2 instruction. Second set V2, V3 is stored with another st2.
18191+
/// Total of 4 st2's are required here.
18192+
bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
18193+
StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
18194+
unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();
18195+
18196+
auto *VecTy = cast<FixedVectorType>(SVI->getType());
18197+
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
18198+
18199+
unsigned LaneLen = VecTy->getNumElements() / Factor;
18200+
Type *EltTy = VecTy->getElementType();
18201+
auto *SubVecTy = FixedVectorType::get(EltTy, Factor);
18202+
18203+
const DataLayout &DL = SI->getModule()->getDataLayout();
18204+
bool UseScalable;
18205+
18206+
// Skip if we do not have NEON and skip illegal vector types. We can
18207+
// "legalize" wide vector types into multiple interleaved accesses as long as
18208+
// the vector types are divisible by 128.
18209+
if (!Subtarget->hasNEON() ||
18210+
!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
18211+
return false;
18212+
18213+
if (UseScalable)
18214+
return false;
18215+
18216+
std::deque<Value *> Shuffles;
18217+
Shuffles.push_back(SVI);
18218+
unsigned ConcatLevel = Factor;
18219+
// Getting all the interleaved operands.
18220+
while (ConcatLevel > 1) {
18221+
unsigned InterleavedOperands = Shuffles.size();
18222+
for (unsigned Ops = 0; Ops < InterleavedOperands; Ops++) {
18223+
auto *V = Shuffles.front();
18224+
Shuffles.pop_front();
18225+
if (isa<ConstantAggregateZero, UndefValue>(V)) {
18226+
VectorType *Ty = cast<VectorType>(V->getType());
18227+
auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
18228+
Value *SplitValue = nullptr;
18229+
if (isa<ConstantAggregateZero>(V))
18230+
SplitValue = ConstantAggregateZero::get(HalfTy);
18231+
else if (isa<PoisonValue>(V))
18232+
SplitValue = PoisonValue::get(HalfTy);
18233+
else if (isa<UndefValue>(V))
18234+
SplitValue = UndefValue::get(HalfTy);
18235+
Shuffles.push_back(SplitValue);
18236+
Shuffles.push_back(SplitValue);
18237+
continue;
18238+
}
18239+
18240+
ShuffleVectorInst *SFL = dyn_cast<ShuffleVectorInst>(V);
18241+
if (!SFL)
18242+
return false;
18243+
if (SVI != SFL && !SFL->isConcat())
18244+
return false;
18245+
18246+
Value *Op0 = SFL->getOperand(0);
18247+
Value *Op1 = SFL->getOperand(1);
18248+
18249+
Shuffles.push_back(dyn_cast<Value>(Op0));
18250+
Shuffles.push_back(dyn_cast<Value>(Op1));
18251+
}
18252+
ConcatLevel >>= 1;
18253+
}
18254+
18255+
IRBuilder<> Builder(SI);
18256+
auto Mask = createInterleaveMask(LaneLen, 2);
18257+
SmallVector<int, 16> UpperHalfMask(LaneLen), LowerHalfMask(LaneLen);
18258+
for (unsigned Idx = 0; Idx < LaneLen; Idx++) {
18259+
LowerHalfMask[Idx] = Mask[Idx];
18260+
UpperHalfMask[Idx] = Mask[Idx + LaneLen];
18261+
}
18262+
18263+
unsigned InterleaveFactor = Factor >> 1;
18264+
while (InterleaveFactor >= MaxSupportedFactor) {
18265+
std::deque<Value *> ShufflesIntermediate;
18266+
ShufflesIntermediate.resize(Factor);
18267+
for (unsigned Idx = 0; Idx < Factor; Idx += (InterleaveFactor * 2)) {
18268+
for (unsigned GroupIdx = 0; GroupIdx < InterleaveFactor; GroupIdx++) {
18269+
auto *Shuffle = Builder.CreateShuffleVector(
18270+
Shuffles[Idx + GroupIdx],
18271+
Shuffles[Idx + GroupIdx + InterleaveFactor], LowerHalfMask);
18272+
ShufflesIntermediate[Idx + GroupIdx] = Shuffle;
18273+
Shuffle = Builder.CreateShuffleVector(
18274+
Shuffles[Idx + GroupIdx],
18275+
Shuffles[Idx + GroupIdx + InterleaveFactor], UpperHalfMask);
18276+
ShufflesIntermediate[Idx + GroupIdx + InterleaveFactor] = Shuffle;
18277+
}
18278+
}
18279+
Shuffles = ShufflesIntermediate;
18280+
InterleaveFactor >>= 1;
18281+
}
18282+
18283+
Type *PtrTy = SI->getPointerOperandType();
18284+
auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
18285+
18286+
Value *BaseAddr = SI->getPointerOperand();
18287+
Function *StNFunc = getStructuredStoreFunction(
18288+
SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy);
18289+
for (unsigned N = 0; N < (Factor / MaxSupportedFactor); N++) {
18290+
SmallVector<Value *, 5> Ops;
18291+
for (unsigned OpIdx = 0; OpIdx < MaxSupportedFactor; OpIdx++)
18292+
Ops.push_back(Shuffles[N * MaxSupportedFactor + OpIdx]);
18293+
18294+
if (N > 0) {
18295+
// We will compute the pointer operand of each store from the original
18296+
// base address using GEPs. Cast the base address to a pointer to the
18297+
// scalar element type.
18298+
BaseAddr = Builder.CreateConstGEP1_32(
18299+
SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor);
18300+
}
18301+
Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
18302+
Builder.CreateCall(StNFunc, Ops);
18303+
}
18304+
return true;
18305+
}
18306+
1816418307
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
1816518308
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
1816618309
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,8 @@ class AArch64TargetLowering : public TargetLowering {
229229

230230
bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override;
231231

232+
/// AArch64 opts in to lowering interleaved stores whose factor exceeds
/// getMaxSupportedInterleaveFactor() by means of additional shuffles.
bool hasInterleaveWithGatherScatter() const override { return true; }
233+
232234
/// Largest interleave factor lowered to a single structured access. Stores
/// with a larger factor are routed to lowerInterleavedStoreWithShuffle().
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
233235

234236
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
@@ -239,6 +241,9 @@ class AArch64TargetLowering : public TargetLowering {
239241
ShuffleVectorInst *SVI, unsigned Factor,
240242
const APInt &GapMask) const override;
241243

244+
/// Lower an interleaved store whose \p Factor is greater than
/// getMaxSupportedInterleaveFactor(): the interleaved operands of \p SVI are
/// zipped together with extra shuffles and then written with several stN
/// calls inserted before \p SI. Returns false if the pattern is not handled.
bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI,
245+
unsigned Factor) const;
246+
242247
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
243248
IntrinsicInst *DI) const override;
244249

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4922,19 +4922,46 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
49224922
if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
49234923
return InstructionCost::getInvalid();
49244924

4925-
if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4925+
unsigned NumLoadStores = 1;
4926+
InstructionCost ShuffleCost = 0;
4927+
bool isInterleaveWithShuffle = false;
4928+
unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor();
4929+
4930+
auto *SubVecTy =
4931+
VectorType::get(VecVTy->getElementType(),
4932+
VecVTy->getElementCount().divideCoefficientBy(Factor));
4933+
4934+
if (TLI->hasInterleaveWithGatherScatter() && Opcode == Instruction::Store &&
4935+
(0 == Factor % MaxSupportedFactor) && Factor > MaxSupportedFactor) {
4936+
isInterleaveWithShuffle = true;
4937+
SmallVector<int, 16> Mask;
4938+
// preparing interleave Mask.
4939+
for (unsigned i = 0; i < VecVTy->getElementCount().getKnownMinValue() / 2;
4940+
i++) {
4941+
for (unsigned j = 0; j < 2; j++)
4942+
Mask.push_back(j * Factor + i);
4943+
}
4944+
4945+
NumLoadStores = Factor / MaxSupportedFactor;
4946+
ShuffleCost =
4947+
(Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy,
4948+
Mask, CostKind, 0, SubVecTy));
4949+
}
4950+
4951+
if (!UseMaskForGaps &&
4952+
(Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) {
49264953
unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4927-
auto *SubVecTy =
4928-
VectorType::get(VecVTy->getElementType(),
4929-
VecVTy->getElementCount().divideCoefficientBy(Factor));
49304954

49314955
// ldN/stN only support legal vector types of size 64 or 128 in bits.
49324956
// Accesses having vector types that are a multiple of 128 bits can be
49334957
// matched to more than one ldN/stN instruction.
49344958
bool UseScalable;
49354959
if (MinElts % Factor == 0 &&
49364960
TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4937-
return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
4961+
return (Factor *
4962+
TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) *
4963+
NumLoadStores) +
4964+
ShuffleCost;
49384965
}
49394966

49404967
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,

0 commit comments

Comments
 (0)