-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[InterleavedAccess] Construct interleaved access store with shuffles #164000
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -96,6 +96,7 @@ | |
#include <cctype> | ||
#include <cstdint> | ||
#include <cstdlib> | ||
#include <deque> | ||
#include <iterator> | ||
#include <limits> | ||
#include <optional> | ||
|
@@ -18023,11 +18024,17 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, | |
unsigned Factor, | ||
const APInt &GapMask) const { | ||
|
||
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && | ||
"Invalid interleave factor"); | ||
auto *SI = dyn_cast<StoreInst>(Store); | ||
if (!SI) | ||
return false; | ||
|
||
if (isProfitableToInterleaveWithGatherScatter() && | ||
Factor > getMaxSupportedInterleaveFactor()) | ||
return lowerInterleavedStoreWithShuffle(SI, SVI, Factor); | ||
|
||
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && | ||
"Invalid interleave factor"); | ||
|
||
assert(!LaneMask && GapMask.popcount() == Factor && | ||
"Unexpected mask on store"); | ||
|
||
|
@@ -18173,6 +18180,139 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, | |
return true; | ||
} | ||
|
||
/// Lower an interleaved store whose interleave factor is greater than the | ||
/// maximum factor supported by a single stN instruction. The data is first | ||
/// interleaved with additional shuffles so that the remaining interleaving | ||
/// can be done by stN instructions. | ||
/// | ||
/// Consider data with an interleave factor of 8 that is shuffled so it can be | ||
/// stored with stN instructions. Data needs to be stored in this order: | ||
/// [v0, v1, v2, v3, v4, v5, v6, v7] | ||
/// | ||
/// v0 v4 v2 v6 v1 v5 v3 v7 | ||
/// | | | | | | | | | ||
/// \ / \ / \ / \ / | ||
/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7] ==> stN = 4 | ||
/// | | | | | ||
/// \ / \ / | ||
/// \ / \ / | ||
/// \ / \ / | ||
/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2 | ||
/// | ||
/// For stN = 4, the upper half of the interleaved data, V0, V1, V2, V3, is | ||
/// stored with one st4 instruction. The lower half, i.e., V4, V5, V6, V7, is | ||
/// stored with another st4. | ||
/// | ||
/// For stN = 2, the upper half of the interleaved data, V0, V1, is stored | ||
/// with one st2 instruction. The second set, V2, V3, is stored with another | ||
/// st2. A total of 4 st2's are required here. | ||
/// | ||
/// Returns true after emitting the shuffles and stN calls. Returns false, | ||
/// before creating any instruction, when NEON is unavailable, the type is not | ||
/// a legal interleaved access type, scalable vectors would be used, or the | ||
/// shuffle tree rooted at SVI does not expand to exactly Factor values. | ||
bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle( | ||
StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { | ||
unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor(); | ||
|
||
auto *VecTy = cast<FixedVectorType>(SVI->getType()); | ||
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In InterleavedAccessPass, type of shuffle is checking. So I didn't verify it again. |
||
|
||
// LaneLen is the number of elements each of the Factor interleaved vectors | ||
// contributes to the wide shuffle result. | ||
unsigned LaneLen = VecTy->getNumElements() / Factor; | ||
Type *EltTy = VecTy->getElementType(); | ||
// NOTE(review): SubVecTy has Factor elements, while the vector type passed | ||
// to the stN intrinsic below (STVTy) has LaneLen elements - confirm that the | ||
// legality check is meant to run on this type. | ||
auto *SubVecTy = FixedVectorType::get(EltTy, Factor); | ||
|
||
const DataLayout &DL = SI->getModule()->getDataLayout(); | ||
// Written by isLegalInterleavedAccessType() below. | ||
bool UseScalable; | ||
|
||
// Skip if we do not have NEON and skip illegal vector types. We can | ||
// "legalize" wide vector types into multiple interleaved accesses as long as | ||
// the vector types are divisible by 128. | ||
if (!Subtarget->hasNEON() || | ||
!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) | ||
return false; | ||
|
||
// This lowering does not handle the scalable-vector case. | ||
if (UseScalable) | ||
return false; | ||
|
||
// Expand the shufflevector tree rooted at SVI one level per outer | ||
// iteration: every shuffle of the current level is replaced by its two | ||
// operands, in order. If a non-shuffle value is found at the front of the | ||
// current level, expansion stops and the not-yet-expanded values of that | ||
// level become the result. | ||
std::deque<Value *> Shuffles; | ||
Shuffles.push_back(SVI); | ||
unsigned ConcatLevel = Factor; | ||
while (ConcatLevel > 1) { | ||
std::deque<Value *> ShufflesIntermediate; | ||
ShufflesIntermediate = Shuffles; | ||
Shuffles.clear(); | ||
while (!ShufflesIntermediate.empty()) { | ||
ShuffleVectorInst *SFL = | ||
dyn_cast<ShuffleVectorInst>(ShufflesIntermediate.front()); | ||
if (!SFL) | ||
break; | ||
ShufflesIntermediate.pop_front(); | ||
|
||
Value *Op0 = SFL->getOperand(0); | ||
Value *Op1 = SFL->getOperand(1); | ||
|
||
Shuffles.push_back(dyn_cast<Value>(Op0)); | ||
Shuffles.push_back(dyn_cast<Value>(Op1)); | ||
} | ||
// A non-shuffle value stopped the inner loop: keep what is left of this | ||
// level and stop expanding. | ||
if (!ShufflesIntermediate.empty()) { | ||
Shuffles = ShufflesIntermediate; | ||
break; | ||
} | ||
ConcatLevel = ConcatLevel >> 1; | ||
} | ||
Comment on lines
+18232
to
+18257
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we can do this with just one deque here and the last row would be in hand at the end? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It can be done with single deque, but the logic become complex. I need this code to be very simple. |
||
|
||
// The expansion must have produced exactly Factor values; otherwise give up | ||
// before any instruction is created. | ||
if (Shuffles.size() != Factor) | ||
return false; | ||
|
||
// Split the factor-2 interleave mask for LaneLen-element vectors in two: | ||
// LowerHalfMask takes its first LaneLen entries and UpperHalfMask the | ||
// remaining LaneLen entries. | ||
IRBuilder<> Builder(SI); | ||
auto Mask = createInterleaveMask(LaneLen, 2); | ||
SmallVector<int, 16> UpperHalfMask, LowerHalfMask; | ||
for (unsigned i = 0; i < (2 * LaneLen); i++) { | ||
if (i < LaneLen) | ||
LowerHalfMask.push_back(Mask[i]); | ||
else | ||
UpperHalfMask.push_back(Mask[i]); | ||
} | ||
|
||
// Each round pairs Shuffles[k] with Shuffles[k + InterleaveFactor] inside | ||
// groups of 2 * InterleaveFactor values. For every group, all lower-half | ||
// shuffles are emitted before the upper-half ones so that the result can be | ||
// indexed the same way in the next round and when building stN operands. | ||
// InterleaveFactor is halved until it drops below MaxSupportedFactor. | ||
unsigned InterleaveFactor = Factor >> 1; | ||
while (InterleaveFactor >= MaxSupportedFactor) { | ||
std::deque<Value *> ShufflesIntermediate; | ||
for (unsigned j = 0; j < Factor; j += (InterleaveFactor * 2)) { | ||
for (unsigned i = 0; i < InterleaveFactor; i++) { | ||
auto *Shuffle = Builder.CreateShuffleVector( | ||
Shuffles[i + j], Shuffles[i + j + InterleaveFactor], LowerHalfMask); | ||
ShufflesIntermediate.push_back(Shuffle); | ||
} | ||
for (unsigned i = 0; i < InterleaveFactor; i++) { | ||
auto *Shuffle = Builder.CreateShuffleVector( | ||
Shuffles[i + j], Shuffles[i + j + InterleaveFactor], UpperHalfMask); | ||
ShufflesIntermediate.push_back(Shuffle); | ||
} | ||
} | ||
Comment on lines
+18275
to
+18286
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Inner two loops could be done with one like you did above. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is not possible to fuse two inner loops. All the lower half shuffles need to be together and after that all upper half of shuffles. Then only I can access the same shuffles list again with index to interleave again or prepare operands for stN call. |
||
|
||
Shuffles = ShufflesIntermediate; | ||
InterleaveFactor = InterleaveFactor >> 1; | ||
} | ||
|
||
Type *PtrTy = SI->getPointerOperandType(); | ||
auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); | ||
|
||
// Emit Factor / MaxSupportedFactor stN calls; call i consumes the | ||
// MaxSupportedFactor consecutive values starting at | ||
// Shuffles[i * MaxSupportedFactor]. | ||
Value *BaseAddr = SI->getPointerOperand(); | ||
Function *StNFunc = getStructuredStoreFunction( | ||
SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy); | ||
for (unsigned i = 0; i < (Factor / MaxSupportedFactor); i++) { | ||
SmallVector<Value *, 5> Ops; | ||
for (unsigned j = 0; j < MaxSupportedFactor; j++) | ||
Ops.push_back(Shuffles[i * MaxSupportedFactor + j]); | ||
|
||
if (i > 0) { | ||
// Advance the address by LaneLen * MaxSupportedFactor elements of the | ||
// scalar element type. BaseAddr is updated in place, so the offset | ||
// accumulates from one store to the next. | ||
BaseAddr = Builder.CreateConstGEP1_32( | ||
SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor); | ||
} | ||
Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); | ||
Builder.CreateCall(StNFunc, Ops); | ||
} | ||
return true; | ||
} | ||
|
||
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( | ||
Instruction *Load, Value *Mask, IntrinsicInst *DI) const { | ||
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we can get rid of this condition since we already check
Factor > getMaxSupportedInterleaveFactor()
above. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In some cases isProfitableToInterleaveWithGatherScatter() may return false. For those cases, I kept the assertion.