|
97 | 97 | #include <cctype> |
98 | 98 | #include <cstdint> |
99 | 99 | #include <cstdlib> |
| 100 | +#include <deque> |
100 | 101 | #include <iterator> |
101 | 102 | #include <limits> |
102 | 103 | #include <optional> |
@@ -18041,12 +18042,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, |
18041 | 18042 | ShuffleVectorInst *SVI, |
18042 | 18043 | unsigned Factor, |
18043 | 18044 | const APInt &GapMask) const { |
18044 | | - |
18045 | | - assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && |
18046 | | - "Invalid interleave factor"); |
| 18045 | + assert(Factor >= 2 && "Invalid interleave factor"); |
18047 | 18046 | auto *SI = dyn_cast<StoreInst>(Store); |
18048 | 18047 | if (!SI) |
18049 | 18048 | return false; |
| 18049 | + |
| 18050 | + if (Factor > getMaxSupportedInterleaveFactor()) |
| 18051 | + return lowerInterleavedStoreWithShuffle(SI, SVI, Factor); |
| 18052 | + |
18050 | 18053 | assert(!LaneMask && GapMask.popcount() == Factor && |
18051 | 18054 | "Unexpected mask on store"); |
18052 | 18055 |
|
@@ -18192,6 +18195,151 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, |
18192 | 18195 | return true; |
18193 | 18196 | } |
18194 | 18197 |
|
| 18198 | +/// If the interleaved vector elements are greater than supported MaxFactor, |
| 18199 | +/// interleaving the data with additional shuffles can be used to |
| 18200 | +/// achieve the same. |
| 18201 | +/// |
| 18202 | +/// Consider the following data with 8 interleaves which are shuffled to store |
| 18203 | +/// stN instructions. Data needs to be stored in this order: |
| 18204 | +/// [v0, v1, v2, v3, v4, v5, v6, v7] |
| 18205 | +/// |
| 18206 | +/// v0 v4 v2 v6 v1 v5 v3 v7 |
| 18207 | +/// | | | | | | | | |
| 18208 | +/// \ / \ / \ / \ / |
| 18209 | +/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7] ==> stN = 4 |
| 18210 | +/// | | | | |
| 18211 | +/// \ / \ / |
| 18212 | +/// \ / \ / |
| 18213 | +/// \ / \ / |
| 18214 | +/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2 |
| 18215 | +/// |
| 18216 | +/// For stN = 4, upper half of interleaved data V0, V1, V2, V3 is stored |
| 18217 | +/// with one st4 instruction. Lower half, i.e, V4, V5, V6, V7 is stored with |
| 18218 | +/// another st4. |
| 18219 | +/// |
| 18220 | +/// For stN = 2, upper half of interleaved data V0, V1 is stored |
| 18221 | +/// with one st2 instruction. Second set V2, V3 is stored with another st2. |
| 18222 | +/// Total of 4 st2's are required here. |
| 18223 | +bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle( |
| 18224 | + StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { |
| 18225 | + unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor(); |
| 18226 | + |
| 18227 | + auto *VecTy = cast<FixedVectorType>(SVI->getType()); |
| 18228 | + assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); |
| 18229 | + |
| 18230 | + unsigned LaneLen = VecTy->getNumElements() / Factor; |
| 18231 | + Type *EltTy = VecTy->getElementType(); |
| 18232 | + auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen); |
| 18233 | + |
| 18234 | + const DataLayout &DL = SI->getModule()->getDataLayout(); |
| 18235 | + bool UseScalable; |
| 18236 | + |
| 18237 | + // Skip if we do not have NEON and skip illegal vector types. We can |
| 18238 | + // "legalize" wide vector types into multiple interleaved accesses as long as |
| 18239 | + // the vector types are divisible by 128. |
| 18240 | + if (!Subtarget->hasNEON() || |
| 18241 | + !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) |
| 18242 | + return false; |
| 18243 | + |
| 18244 | + if (UseScalable) |
| 18245 | + return false; |
| 18246 | + |
| 18247 | + std::deque<Value *> Shuffles; |
| 18248 | + Shuffles.push_back(SVI); |
| 18249 | + unsigned ConcatLevel = Factor; |
| 18250 | + unsigned ConcatElt = Factor * LaneLen; |
| 18251 | + // Getting all the interleaved operands. |
| 18252 | + while (ConcatLevel > 1) { |
| 18253 | + unsigned InterleavedOperands = Shuffles.size(); |
| 18254 | + for (unsigned Ops = 0; Ops < InterleavedOperands; Ops++) { |
| 18255 | + auto *V = Shuffles.front(); |
| 18256 | + Shuffles.pop_front(); |
| 18257 | + if (isa<ConstantAggregateZero, PoisonValue>(V)) { |
| 18258 | + VectorType *Ty = cast<VectorType>(V->getType()); |
| 18259 | + auto *HalfTy = VectorType::getHalfElementsVectorType(Ty); |
| 18260 | + Value *SplitValue = nullptr; |
| 18261 | + if (isa<ConstantAggregateZero>(V)) |
| 18262 | + SplitValue = ConstantAggregateZero::get(HalfTy); |
| 18263 | + else |
| 18264 | + SplitValue = PoisonValue::get(HalfTy); |
| 18265 | + |
| 18266 | + Shuffles.push_back(SplitValue); |
| 18267 | + Shuffles.push_back(SplitValue); |
| 18268 | + continue; |
| 18269 | + } |
| 18270 | + |
| 18271 | + ShuffleVectorInst *SFL = dyn_cast<ShuffleVectorInst>(V); |
| 18272 | + if (!SFL) |
| 18273 | + return false; |
| 18274 | + if (SVI != SFL && !SFL->isIdentityMask(SFL->getShuffleMask(), ConcatElt)) |
| 18275 | + return false; |
| 18276 | + |
| 18277 | + Value *Op0 = SFL->getOperand(0); |
| 18278 | + Value *Op1 = SFL->getOperand(1); |
| 18279 | + |
| 18280 | + Shuffles.push_back(dyn_cast<Value>(Op0)); |
| 18281 | + Shuffles.push_back(dyn_cast<Value>(Op1)); |
| 18282 | + } |
| 18283 | + ConcatLevel >>= 1; |
| 18284 | + ConcatElt >>= 1; |
| 18285 | + } |
| 18286 | + |
| 18287 | + IRBuilder<> Builder(SI); |
| 18288 | + auto Mask = createInterleaveMask(LaneLen, 2); |
| 18289 | + SmallVector<int, 16> UpperHalfMask(LaneLen), LowerHalfMask(LaneLen); |
| 18290 | + for (unsigned Idx = 0; Idx < LaneLen; Idx++) { |
| 18291 | + LowerHalfMask[Idx] = Mask[Idx]; |
| 18292 | + UpperHalfMask[Idx] = Mask[Idx + LaneLen]; |
| 18293 | + } |
| 18294 | + |
| 18295 | + unsigned InterleaveFactor = Factor >> 1; |
| 18296 | + while (InterleaveFactor >= MaxSupportedFactor) { |
| 18297 | + std::deque<Value *> ShufflesIntermediate; |
| 18298 | + ShufflesIntermediate.resize(Factor); |
| 18299 | + for (unsigned Idx = 0; Idx < Factor; Idx += (InterleaveFactor * 2)) { |
| 18300 | + for (unsigned GroupIdx = 0; GroupIdx < InterleaveFactor; GroupIdx++) { |
| 18301 | + assert(Shuffles[Idx + GroupIdx]->getType() == SubVecTy && |
| 18302 | + Shuffles[Idx + GroupIdx + InterleaveFactor]->getType() == |
| 18303 | + SubVecTy && |
| 18304 | + "Type of interleaving candidates are not matching\n"); |
| 18305 | + auto *Shuffle = Builder.CreateShuffleVector( |
| 18306 | + Shuffles[Idx + GroupIdx], |
| 18307 | + Shuffles[Idx + GroupIdx + InterleaveFactor], LowerHalfMask); |
| 18308 | + ShufflesIntermediate[Idx + GroupIdx] = Shuffle; |
| 18309 | + Shuffle = Builder.CreateShuffleVector( |
| 18310 | + Shuffles[Idx + GroupIdx], |
| 18311 | + Shuffles[Idx + GroupIdx + InterleaveFactor], UpperHalfMask); |
| 18312 | + ShufflesIntermediate[Idx + GroupIdx + InterleaveFactor] = Shuffle; |
| 18313 | + } |
| 18314 | + } |
| 18315 | + Shuffles = ShufflesIntermediate; |
| 18316 | + InterleaveFactor >>= 1; |
| 18317 | + } |
| 18318 | + |
| 18319 | + Type *PtrTy = SI->getPointerOperandType(); |
| 18320 | + auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); |
| 18321 | + |
| 18322 | + Value *BaseAddr = SI->getPointerOperand(); |
| 18323 | + Function *StNFunc = getStructuredStoreFunction( |
| 18324 | + SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy); |
| 18325 | + for (unsigned N = 0; N < (Factor / MaxSupportedFactor); N++) { |
| 18326 | + SmallVector<Value *, 5> Ops; |
| 18327 | + for (unsigned OpIdx = 0; OpIdx < MaxSupportedFactor; OpIdx++) |
| 18328 | + Ops.push_back(Shuffles[N * MaxSupportedFactor + OpIdx]); |
| 18329 | + |
| 18330 | + if (N > 0) { |
| 18331 | + // We will compute the pointer operand of each store from the original |
| 18332 | + // base address using GEPs. Cast the base address to a pointer to the |
| 18333 | + // scalar element type. |
| 18334 | + BaseAddr = Builder.CreateConstGEP1_32( |
| 18335 | + SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor); |
| 18336 | + } |
| 18337 | + Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); |
| 18338 | + Builder.CreateCall(StNFunc, Ops); |
| 18339 | + } |
| 18340 | + return true; |
| 18341 | +} |
| 18342 | + |
18195 | 18343 | bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( |
18196 | 18344 | Instruction *Load, Value *Mask, IntrinsicInst *DI) const { |
18197 | 18345 | const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); |
|
0 commit comments