-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[IA][RISCV] Recognize deinterleaved loads that could lower to strided segmented loads #151612
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
c722014
9d6ef18
95f772e
7bb4ec3
f5507fb
8e4b79e
3f992d6
1f5ab33
705b7a6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -268,13 +268,16 @@ static Value *getMaskOperand(IntrinsicInst *II) { | |
| } | ||
| } | ||
|
|
||
| // Return the corresponded deinterleaved mask, or nullptr if there is no valid | ||
| // mask. | ||
| static Value *getMask(Value *WideMask, unsigned Factor, | ||
| ElementCount LeafValueEC); | ||
|
|
||
| static Value *getMask(Value *WideMask, unsigned Factor, | ||
| VectorType *LeafValueTy) { | ||
| // Return a pair of | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We talked about this offline, but I'm more and more coming to the view we should have made these a set of utility routines (usable by each target), and simply passed the mask operand through (or maybe not even that.) More of an aside for longer term consideration than a comment on this review. |
||
| // (1) The corresponded deinterleaved mask, or nullptr if there is no valid | ||
| // mask. | ||
| // (2) Some mask effectively skips a certain field, and this element is a mask | ||
| // in which inactive lanes represent fields that are skipped (i.e. "gaps"). | ||
| static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor, | ||
| ElementCount LeafValueEC); | ||
|
|
||
| static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor, | ||
| VectorType *LeafValueTy) { | ||
| return getMask(WideMask, Factor, LeafValueTy->getElementCount()); | ||
| } | ||
|
|
||
|
|
@@ -379,22 +382,26 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( | |
| replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load); | ||
|
|
||
| Value *Mask = nullptr; | ||
| APInt GapMask(Factor, 0); | ||
|
||
| if (LI) { | ||
| GapMask.setAllBits(); | ||
| LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); | ||
| } else { | ||
| // Check mask operand. Handle both all-true/false and interleaved mask. | ||
| Mask = getMask(getMaskOperand(II), Factor, VecTy); | ||
| std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, VecTy); | ||
| if (!Mask) | ||
| return false; | ||
|
|
||
| LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: " | ||
| << *Load << "\n"); | ||
| LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor | ||
| << " and actual factor " << GapMask.popcount() << "\n"); | ||
| } | ||
|
|
||
| // Try to create target specific intrinsics to replace the load and | ||
| // shuffles. | ||
| if (!TLI->lowerInterleavedLoad(cast<Instruction>(Load), Mask, Shuffles, | ||
| Indices, Factor)) | ||
| Indices, Factor, GapMask)) | ||
| // If Extracts is not empty, tryReplaceExtracts made changes earlier. | ||
| return !Extracts.empty() || BinOpShuffleChanged; | ||
|
|
||
|
|
@@ -536,10 +543,15 @@ bool InterleavedAccessImpl::lowerInterleavedStore( | |
| } else { | ||
| // Check mask operand. Handle both all-true/false and interleaved mask. | ||
| unsigned LaneMaskLen = NumStoredElements / Factor; | ||
| Mask = getMask(getMaskOperand(II), Factor, | ||
| ElementCount::getFixed(LaneMaskLen)); | ||
| APInt GapMask(Factor, 0); | ||
| std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, | ||
| ElementCount::getFixed(LaneMaskLen)); | ||
| if (!Mask) | ||
| return false; | ||
| // We haven't supported gap mask for stores. Yet it is possible that we | ||
| // already changed the IR, hence returning true here. | ||
| if (GapMask.popcount() != Factor) | ||
| return true; | ||
|
|
||
| LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: " | ||
| << *Store << "\n"); | ||
|
|
@@ -556,34 +568,65 @@ bool InterleavedAccessImpl::lowerInterleavedStore( | |
| return true; | ||
| } | ||
|
|
||
| static Value *getMask(Value *WideMask, unsigned Factor, | ||
| ElementCount LeafValueEC) { | ||
| // A wide mask <1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0> could be used to skip the | ||
| // last field in a factor-of-three interleaved store or deinterleaved load (in | ||
| // which case LeafMaskLen is 4). Such (wide) mask is also known as gap mask. | ||
| // This helper function tries to detect this pattern and return the actual | ||
| // factor we're accessing, which is 2 in this example. | ||
| static void getGapMask(const Constant &MaskConst, unsigned Factor, | ||
| unsigned LeafMaskLen, APInt &GapMask) { | ||
| assert(GapMask.getBitWidth() == Factor); | ||
| for (unsigned F = 0U; F < Factor; ++F) { | ||
| bool AllZero = true; | ||
| for (unsigned Idx = 0U; Idx < LeafMaskLen; ++Idx) { | ||
| Constant *C = MaskConst.getAggregateElement(F + Idx * Factor); | ||
| if (!C->isZeroValue()) { | ||
| AllZero = false; | ||
| break; | ||
| } | ||
| } | ||
| // All mask bits on this field are zero, skipping it. | ||
| if (AllZero) | ||
| GapMask.clearBit(F); | ||
| } | ||
| } | ||
|
|
||
| static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor, | ||
| ElementCount LeafValueEC) { | ||
| APInt GapMask(Factor, 0); | ||
|
||
| GapMask.setAllBits(); | ||
|
|
||
| if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) { | ||
| if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID()); | ||
| F && F == Factor && llvm::all_equal(IMI->args())) { | ||
preames marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return IMI->getArgOperand(0); | ||
| return {IMI->getArgOperand(0), GapMask}; | ||
| } | ||
| } | ||
|
|
||
| if (auto *ConstMask = dyn_cast<Constant>(WideMask)) { | ||
| if (auto *Splat = ConstMask->getSplatValue()) | ||
| // All-ones or all-zeros mask. | ||
| return ConstantVector::getSplat(LeafValueEC, Splat); | ||
| return {ConstantVector::getSplat(LeafValueEC, Splat), GapMask}; | ||
|
|
||
| if (LeafValueEC.isFixed()) { | ||
| unsigned LeafMaskLen = LeafValueEC.getFixedValue(); | ||
| // First, check if we use a gap mask to skip some of the factors / fields. | ||
| getGapMask(*ConstMask, Factor, LeafMaskLen, GapMask); | ||
|
|
||
| SmallVector<Constant *, 8> LeafMask(LeafMaskLen, nullptr); | ||
| // If this is a fixed-length constant mask, each lane / leaf has to | ||
| // use the same mask. This is done by checking if every group with Factor | ||
| // number of elements in the interleaved mask has homogeneous values. | ||
| for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) { | ||
| if (!GapMask[Idx % Factor]) | ||
| continue; | ||
| Constant *C = ConstMask->getAggregateElement(Idx); | ||
| if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C) | ||
| return nullptr; | ||
| return {nullptr, GapMask}; | ||
| LeafMask[Idx / Factor] = C; | ||
| } | ||
|
|
||
| return ConstantVector::get(LeafMask); | ||
| return {ConstantVector::get(LeafMask), GapMask}; | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -603,12 +646,13 @@ static Value *getMask(Value *WideMask, unsigned Factor, | |
| auto *LeafMaskTy = | ||
| VectorType::get(Type::getInt1Ty(SVI->getContext()), LeafValueEC); | ||
| IRBuilder<> Builder(SVI); | ||
| return Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0), | ||
| uint64_t(0)); | ||
| return {Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0), | ||
| uint64_t(0)), | ||
| GapMask}; | ||
| } | ||
| } | ||
|
|
||
| return nullptr; | ||
| return {nullptr, GapMask}; | ||
| } | ||
|
|
||
| bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( | ||
|
|
@@ -639,9 +683,16 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( | |
| return false; | ||
|
|
||
| // Check mask operand. Handle both all-true/false and interleaved mask. | ||
| Mask = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); | ||
| APInt GapMask(Factor, 0); | ||
| std::tie(Mask, GapMask) = | ||
| getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI)); | ||
| if (!Mask) | ||
| return false; | ||
| // We haven't supported gap mask if it's deinterleaving using intrinsics. | ||
| // Yet it is possible that we already changed the IR, hence returning true | ||
| // here. | ||
| if (GapMask.popcount() != Factor) | ||
| return true; | ||
|
|
||
| LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave" | ||
| << " intrinsic " << *DI << " and factor = " | ||
|
|
@@ -680,10 +731,16 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( | |
| II->getIntrinsicID() != Intrinsic::vp_store) | ||
| return false; | ||
| // Check mask operand. Handle both all-true/false and interleaved mask. | ||
| Mask = getMask(getMaskOperand(II), Factor, | ||
| cast<VectorType>(InterleaveValues[0]->getType())); | ||
| APInt GapMask(Factor, 0); | ||
| std::tie(Mask, GapMask) = | ||
| getMask(getMaskOperand(II), Factor, | ||
| cast<VectorType>(InterleaveValues[0]->getType())); | ||
| if (!Mask) | ||
| return false; | ||
| // We haven't supported gap mask if it's interleaving using intrinsics. Yet | ||
| // it is possible that we already changed the IR, hence returning true here. | ||
| if (GapMask.popcount() != Factor) | ||
| return true; | ||
|
|
||
| LLVM_DEBUG(dbgs() << "IA: Found a vp.store or masked.store with interleave" | ||
| << " intrinsic " << *IntII << " and factor = " | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17254,7 +17254,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, | |
| /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 | ||
| bool AArch64TargetLowering::lowerInterleavedLoad( | ||
| Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, | ||
| ArrayRef<unsigned> Indices, unsigned Factor) const { | ||
| ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const { | ||
| assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && | ||
| "Invalid interleave factor"); | ||
| assert(!Shuffles.empty() && "Empty shufflevector input"); | ||
|
|
@@ -17265,6 +17265,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad( | |
| if (!LI) | ||
| return false; | ||
| assert(!Mask && "Unexpected mask on a load"); | ||
| assert(GapMask.popcount() == Factor && "Unexpected factor reduction"); | ||
|
|
||
|
||
| const DataLayout &DL = LI->getDataLayout(); | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21599,7 +21599,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { | |
| /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 | ||
| bool ARMTargetLowering::lowerInterleavedLoad( | ||
| Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, | ||
| ArrayRef<unsigned> Indices, unsigned Factor) const { | ||
| ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const { | ||
| assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && | ||
| "Invalid interleave factor"); | ||
| assert(!Shuffles.empty() && "Empty shufflevector input"); | ||
|
|
@@ -21610,6 +21610,7 @@ bool ARMTargetLowering::lowerInterleavedLoad( | |
| if (!LI) | ||
| return false; | ||
| assert(!Mask && "Unexpected mask on a load"); | ||
| assert(GapMask.popcount() == Factor && "Unexpected factor reduction"); | ||
|
||
|
|
||
| auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType()); | ||
| Type *EltTy = VecTy->getElementType(); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -802,7 +802,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { | |
| // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. | ||
| bool X86TargetLowering::lowerInterleavedLoad( | ||
| Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, | ||
| ArrayRef<unsigned> Indices, unsigned Factor) const { | ||
| ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const { | ||
| assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && | ||
| "Invalid interleave factor"); | ||
| assert(!Shuffles.empty() && "Empty shufflevector input"); | ||
|
|
@@ -813,6 +813,7 @@ bool X86TargetLowering::lowerInterleavedLoad( | |
| if (!LI) | ||
| return false; | ||
| assert(!Mask && "Unexpected mask on a load"); | ||
| assert(GapMask.popcount() == Factor && "Unexpected factor reduction"); | ||
|
||
|
|
||
| // Create an interleaved access group. | ||
| IRBuilder<> Builder(LI); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"Are always skipped" makes it sound like 1 means not used. Possibly "may be accessed".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated