@@ -250,6 +250,31 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
   return false;
 }
 
+/// Return true if the mask is a non-all-zeros, interleaving mask. For
+/// instance, 111000111000 is interleaved from three 1010 masks.
+/// \p LaneMask returns the mask of each individual lane.
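+/// For example (an illustrative sketch, not from a test): with Factor = 2,
+/// the mask <1,1,0,0,1,1,0,0> de-interleaves into the per-lane mask
+/// <1,0,1,0>, which is written to \p LaneMask.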
+static bool isInterleavedConstantMask(unsigned Factor, ConstantVector *Mask,
+                                      SmallVectorImpl<Constant *> &LaneMask) {
+  unsigned LaneMaskLen = LaneMask.size();
+  if (auto *Splat = Mask->getSplatValue()) {
+    // All-zeros mask.
+    if (Splat->isZeroValue())
+      return false;
+    // All-ones mask.
+    std::fill(LaneMask.begin(), LaneMask.end(),
+              ConstantInt::getTrue(Mask->getContext()));
+  } else {
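+    // Each group of Factor consecutive mask elements must be uniform; the
+    // group's common value becomes that lane's mask element.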
+    for (unsigned Idx = 0U, N = LaneMaskLen * Factor; Idx < N; ++Idx) {
+      Constant *Ref = Mask->getAggregateElement((Idx / Factor) * Factor);
+      if (Ref != Mask->getAggregateElement(Idx))
+        return false;
+      LaneMask[Idx / Factor] = Ref;
+    }
+  }
+
+  return true;
+}
+
 bool InterleavedAccessImpl::lowerInterleavedLoad(
     Instruction *LoadOp, SmallSetVector<Instruction *, 32> &DeadInsts) {
   if (isa<ScalableVectorType>(LoadOp->getType()))
@@ -261,8 +286,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   } else if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
     assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load);
-    // Require a constant mask and evl.
-    if (!isa<ConstantVector>(VPLoad->getArgOperand(1)) ||
-        !isa<ConstantInt>(VPLoad->getArgOperand(2)))
+    // Require a constant mask.
+    if (!isa<ConstantVector>(VPLoad->getArgOperand(1)))
       return false;
   } else {
     llvm_unreachable("unsupported load operation");
@@ -315,24 +339,6 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
                           NumLoadElements))
     return false;
 
-  // If this is a vp.load, record its mask (NOT shuffle mask).
-  BitVector MaskedIndices(NumLoadElements);
-  if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
-    auto *Mask = cast<ConstantVector>(VPLoad->getArgOperand(1));
-    assert(cast<FixedVectorType>(Mask->getType())->getNumElements() ==
-           NumLoadElements);
-    if (auto *Splat = Mask->getSplatValue()) {
-      // All-zeros mask, bail out early.
-      if (Splat->isZeroValue())
-        return false;
-    } else {
-      for (unsigned i = 0U; i < NumLoadElements; ++i) {
-        if (Mask->getAggregateElement(i)->isZeroValue())
-          MaskedIndices.set(i);
-      }
-    }
-  }
-
   // Holds the corresponding index for each DE-interleave shuffle.
   SmallVector<unsigned, 4> Indices;
 
@@ -373,48 +379,35 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   bool BinOpShuffleChanged =
       replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LoadOp);
 
-  // Check if we extract only the unmasked elements.
-  if (MaskedIndices.any()) {
-    if (any_of(Shuffles, [&](const auto *Shuffle) {
-          ArrayRef<int> ShuffleMask = Shuffle->getShuffleMask();
-          for (int Idx : ShuffleMask) {
-            if (Idx < 0)
-              continue;
-            if (MaskedIndices.test(unsigned(Idx)))
-              return true;
-          }
-          return false;
-        })) {
-      LLVM_DEBUG(dbgs() << "IA: trying to extract a masked element through "
-                        << "shufflevector\n");
-      return false;
-    }
-  }
-  // Check if we extract only the elements within evl.
+  // Check that the vp.load mask de-interleaves into Factor identical lane
+  // masks.
+  unsigned ShuffleMaskLen = Shuffles[0]->getShuffleMask().size();
+  SmallVector<Constant *, 8> LaneMask(ShuffleMaskLen, nullptr);
   if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
-    uint64_t EVL = cast<ConstantInt>(VPLoad->getArgOperand(2))->getZExtValue();
-    if (any_of(Shuffles, [&](const auto *Shuffle) {
-          ArrayRef<int> ShuffleMask = Shuffle->getShuffleMask();
-          for (int Idx : ShuffleMask) {
-            if (Idx < 0)
-              continue;
-            if (unsigned(Idx) >= EVL)
-              return true;
-          }
-          return false;
-        })) {
-      LLVM_DEBUG(
-          dbgs() << "IA: trying to extract an element out of EVL range\n");
+    if (!isInterleavedConstantMask(
+            Factor, cast<ConstantVector>(VPLoad->getArgOperand(1)), LaneMask))
       return false;
-    }
   }
 
   LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n");
 
-  // Try to create target specific intrinsics to replace the load and shuffles.
-  if (!TLI->lowerInterleavedLoad(LoadOp, Shuffles, Indices, Factor)) {
-    // If Extracts is not empty, tryReplaceExtracts made changes earlier.
-    return !Extracts.empty() || BinOpShuffleChanged;
+  if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
+    auto *MaskVec = ConstantVector::get(LaneMask);
+    // The number of Shuffles might be less than Factor, so fill the gaps
+    // with nullptr. lowerDeinterleavedVPLoad also expects them to be ordered
+    // by their de-interleave index.
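+    // For example (hypothetical values): with Factor = 4 and de-interleave
+    // shuffles only for indices 0 and 2, ShuffleValues becomes
+    // {Shuffle0, nullptr, Shuffle2, nullptr}.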
+    SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
+    for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
+      ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
+    if (!TLI->lowerDeinterleavedVPLoad(VPLoad, MaskVec, ShuffleValues))
+      // If Extracts is not empty, tryReplaceExtracts made changes earlier.
+      return !Extracts.empty() || BinOpShuffleChanged;
+  } else {
+    // Try to create target specific intrinsics to replace the load and
+    // shuffles.
+    if (!TLI->lowerInterleavedLoad(cast<LoadInst>(LoadOp), Shuffles, Indices,
+                                   Factor))
+      // If Extracts is not empty, tryReplaceExtracts made changes earlier.
+      return !Extracts.empty() || BinOpShuffleChanged;
   }
 
   DeadInsts.insert_range(Shuffles);
@@ -530,9 +523,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
     StoredValue = SI->getValueOperand();
   } else if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
     assert(VPStore->getIntrinsicID() == Intrinsic::vp_store);
-    // Require a constant mask and evl.
-    if (!isa<ConstantVector>(VPStore->getArgOperand(2)) ||
-        !isa<ConstantInt>(VPStore->getArgOperand(3)))
+    // Require a constant mask.
+    if (!isa<ConstantVector>(VPStore->getArgOperand(2)))
       return false;
     StoredValue = VPStore->getArgOperand(0);
   } else {
@@ -545,53 +537,53 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
 
   unsigned NumStoredElements =
       cast<FixedVectorType>(SVI->getType())->getNumElements();
-  // If this is a vp.store, record its mask (NOT shuffle mask).
-  BitVector MaskedIndices(NumStoredElements);
-  if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
-    auto *Mask = cast<ConstantVector>(VPStore->getArgOperand(2));
-    assert(cast<FixedVectorType>(Mask->getType())->getNumElements() ==
-           NumStoredElements);
-    if (auto *Splat = Mask->getSplatValue()) {
-      // All-zeros mask, bail out early.
-      if (Splat->isZeroValue())
-        return false;
-    } else {
-      for (unsigned i = 0U; i < NumStoredElements; ++i) {
-        if (Mask->getAggregateElement(i)->isZeroValue())
-          MaskedIndices.set(i);
-      }
-    }
-  }
-
   // Check if the shufflevector is RE-interleave shuffle.
   unsigned Factor;
   if (!isReInterleaveMask(SVI, Factor, MaxFactor))
     return false;
+  assert(NumStoredElements % Factor == 0 &&
+         "number of stored elements should be a multiple of Factor");
 
-  // Check if we store only the unmasked elements.
-  if (MaskedIndices.any()) {
-    if (any_of(SVI->getShuffleMask(), [&](int Idx) {
-          return Idx >= 0 && MaskedIndices.test(unsigned(Idx));
-        })) {
-      LLVM_DEBUG(dbgs() << "IA: trying to store a masked element\n");
-      return false;
-    }
-  }
-  // Check if we store only the elements within evl.
+  // Check that the vp.store mask de-interleaves into Factor identical lane
+  // masks.
+  unsigned LaneMaskLen = NumStoredElements / Factor;
+  SmallVector<Constant *, 8> LaneMask(LaneMaskLen, nullptr);
   if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
-    uint64_t EVL = cast<ConstantInt>(VPStore->getArgOperand(3))->getZExtValue();
-    if (any_of(SVI->getShuffleMask(),
-               [&](int Idx) { return Idx >= 0 && unsigned(Idx) >= EVL; })) {
-      LLVM_DEBUG(dbgs() << "IA: trying to store an element out of EVL range\n");
+    if (!isInterleavedConstantMask(
+            Factor, cast<ConstantVector>(VPStore->getArgOperand(2)), LaneMask))
       return false;
-    }
   }
 
   LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp << "\n");
 
-  // Try to create target specific intrinsics to replace the store and shuffle.
-  if (!TLI->lowerInterleavedStore(StoreOp, SVI, Factor))
-    return false;
+  if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
+    IRBuilder<> Builder(VPStore);
+    // We need to effectively de-interleave the shuffle mask, because
+    // lowerInterleavedVPStore expects the individual de-interleaved values.
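+    // For instance (illustrative values): with Factor = 2, the re-interleave
+    // mask <0,4,1,5,2,6,3,7> splits into the lane masks <0,1,2,3> and
+    // <4,5,6,7>.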
+    SmallVector<Value *, 10> NewShuffles;
+    SmallVector<int, 16> NewShuffleMask(LaneMaskLen);
+    auto ShuffleMask = SVI->getShuffleMask();
+
+    for (unsigned i = 0; i < Factor; i++) {
+      for (unsigned j = 0; j < LaneMaskLen; j++)
+        NewShuffleMask[j] = ShuffleMask[i + Factor * j];
+
+      NewShuffles.push_back(Builder.CreateShuffleVector(
+          SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask));
+    }
+
+    // Try to create target specific intrinsics to replace the vp.store and
+    // shuffle.
+    if (!TLI->lowerInterleavedVPStore(VPStore, ConstantVector::get(LaneMask),
+                                      NewShuffles))
+      // We already created new shuffles.
+      return true;
+  } else {
+    // Try to create target specific intrinsics to replace the store and
+    // shuffle.
+    if (!TLI->lowerInterleavedStore(cast<StoreInst>(StoreOp), SVI, Factor))
+      return false;
+  }
 
   // Already have a new target specific interleaved store. Erase the old store.
   DeadInsts.insert(StoreOp);
@@ -806,8 +798,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
 
     // Since lowerInterleavedLoad expects Shuffles and LoadInst, use special
     // TLI function to emit target-specific interleaved instruction.
-    if (!TLI->lowerDeinterleavedIntrinsicToVPLoad(VPLoad, Mask,
-                                                  DeinterleaveValues))
+    if (!TLI->lowerDeinterleavedVPLoad(VPLoad, Mask, DeinterleaveValues))
       return false;
 
   } else {
@@ -859,8 +850,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
 
     // Since lowerInterleavedStore expects Shuffle and StoreInst, use special
     // TLI function to emit target-specific interleaved instruction.
-    if (!TLI->lowerInterleavedIntrinsicToVPStore(VPStore, Mask,
-                                                 InterleaveValues))
+    if (!TLI->lowerInterleavedVPStore(VPStore, Mask, InterleaveValues))
       return false;
   } else {
     auto *SI = cast<StoreInst>(StoredBy);