Merged
Changes from 5 commits
11 changes: 5 additions & 6 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3179,15 +3179,15 @@ class TargetLoweringBase {
return false;
}

/// Lower an interleaved load to target specific intrinsics. Return
/// Lower a deinterleaved load to target specific intrinsics. Return
Collaborator:

I think this should still be "interleaved load" to match the terminology used in the header description at the top of InterleavedAccessPass.cpp and the loop vectorizer.

Member Author:

Agree, it's fixed now.

/// true on success.
///
/// \p Load is a vp.load instruction.
/// \p Mask is a mask value
/// \p DeinterleaveRes is a list of deinterleaved results.
virtual bool
lowerDeinterleavedIntrinsicToVPLoad(VPIntrinsic *Load, Value *Mask,
ArrayRef<Value *> DeinterleaveRes) const {
lowerDeinterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
Collaborator:

I think this should be lowerInterleavedVPLoad to match lowerInterleavedLoad

Member Author:

Done

ArrayRef<Value *> DeinterleaveRes) const {
return false;
}

Expand All @@ -3197,9 +3197,8 @@ class TargetLoweringBase {
/// \p Store is the vp.store instruction.
/// \p Mask is a mask value
/// \p InterleaveOps is a list of values being interleaved.
virtual bool
lowerInterleavedIntrinsicToVPStore(VPIntrinsic *Store, Value *Mask,
ArrayRef<Value *> InterleaveOps) const {
virtual bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
ArrayRef<Value *> InterleaveOps) const {
return false;
}

20 changes: 20 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsRISCV.td
@@ -1705,19 +1705,39 @@ let TargetPrefix = "riscv" in {

// Segment loads/stores for fixed vectors.
foreach nf = [2, 3, 4, 5, 6, 7, 8] in {
// Input: (pointer, vl)
def int_riscv_seg # nf # _load
: DefaultAttrsIntrinsic<!listconcat([llvm_anyvector_ty],
!listsplat(LLVMMatchType<0>,
!add(nf, -1))),
[llvm_anyptr_ty, llvm_anyint_ty],
[NoCapture<ArgIndex<0>>, IntrReadMem]>;
// Input: (pointer, mask, vl)
def int_riscv_seg # nf # _load_mask
: DefaultAttrsIntrinsic<!listconcat([llvm_anyvector_ty],
!listsplat(LLVMMatchType<0>,
!add(nf, -1))),
[llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
llvm_anyint_ty],
[NoCapture<ArgIndex<0>>, IntrReadMem]>;

// Input: (<stored values>, pointer, vl)
def int_riscv_seg # nf # _store
: DefaultAttrsIntrinsic<[],
!listconcat([llvm_anyvector_ty],
!listsplat(LLVMMatchType<0>,
!add(nf, -1)),
[llvm_anyptr_ty, llvm_anyint_ty]),
[NoCapture<ArgIndex<nf>>, IntrWriteMem]>;
// Input: (<stored values>, pointer, mask, vl)
def int_riscv_seg # nf # _store_mask
: DefaultAttrsIntrinsic<[],
!listconcat([llvm_anyvector_ty],
!listsplat(LLVMMatchType<0>,
!add(nf, -1)),
[llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
llvm_anyint_ty]),
[NoCapture<ArgIndex<nf>>, IntrWriteMem]>;
}

} // TargetPrefix = "riscv"
181 changes: 144 additions & 37 deletions llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -100,11 +100,11 @@ class InterleavedAccessImpl {
unsigned MaxFactor = 0u;

/// Transform an interleaved load into target specific intrinsics.
bool lowerInterleavedLoad(LoadInst *LI,
bool lowerInterleavedLoad(Instruction *LoadOp,
SmallSetVector<Instruction *, 32> &DeadInsts);

/// Transform an interleaved store into target specific intrinsics.
bool lowerInterleavedStore(StoreInst *SI,
bool lowerInterleavedStore(Instruction *StoreOp,
SmallSetVector<Instruction *, 32> &DeadInsts);

/// Transform a load and a deinterleave intrinsic into target specific
@@ -131,7 +131,7 @@ class InterleavedAccessImpl {
/// made.
bool replaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles,
SmallVectorImpl<ShuffleVectorInst *> &Shuffles,
LoadInst *LI);
Instruction *LI);
};

class InterleavedAccess : public FunctionPass {
@@ -249,11 +249,43 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
return false;
}

/// Return true if it's an interleaving mask. For instance, 111000111000 is
/// interleaved from three 1010 masks. \p SubMask returns the mask of individual
/// lane.
static bool isInterleavedConstantMask(unsigned Factor, ConstantVector *Mask,
SmallVectorImpl<Constant *> &LaneMask) {
unsigned LaneMaskLen = LaneMask.size();
if (auto *Splat = Mask->getSplatValue()) {
std::fill(LaneMask.begin(), LaneMask.end(), Splat);
} else {
for (unsigned Idx = 0U, N = LaneMaskLen * Factor; Idx < N; ++Idx) {
Constant *Ref = Mask->getAggregateElement(alignDown(Idx, Factor));
if (Ref != Mask->getAggregateElement(Idx))
return false;
LaneMask[Idx / Factor] = Ref;
}
}

return true;
}

bool InterleavedAccessImpl::lowerInterleavedLoad(
LoadInst *LI, SmallSetVector<Instruction *, 32> &DeadInsts) {
if (!LI->isSimple() || isa<ScalableVectorType>(LI->getType()))
Instruction *LoadOp, SmallSetVector<Instruction *, 32> &DeadInsts) {
Collaborator:

Can this just be Load? I'm not sure the Op is providing any value.

Member Author:

Fixed.

if (isa<ScalableVectorType>(LoadOp->getType()))
return false;

if (auto *LI = dyn_cast<LoadInst>(LoadOp)) {
if (!LI->isSimple())
return false;
} else if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load);
// Require a constant mask and evl.
if (!isa<ConstantVector>(VPLoad->getArgOperand(1)))
Contributor:

Suggested change
if (!isa<ConstantVector>(VPLoad->getArgOperand(1)))
if (!isa<ConstantVector>(VPLoad->getMaskParam()))

Member Author:

Fixed.

return false;
} else {
llvm_unreachable("unsupported load operation");
}

// Check if all users of this load are shufflevectors. If we encounter any
// users that are extractelement instructions or binary operators, we save
// them to later check if they can be modified to extract from one of the
@@ -265,7 +297,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
// binop are the same load.
SmallSetVector<ShuffleVectorInst *, 4> BinOpShuffles;

for (auto *User : LI->users()) {
for (auto *User : LoadOp->users()) {
auto *Extract = dyn_cast<ExtractElementInst>(User);
if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
Extracts.push_back(Extract);
@@ -294,7 +326,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
unsigned Factor, Index;

unsigned NumLoadElements =
cast<FixedVectorType>(LI->getType())->getNumElements();
cast<FixedVectorType>(LoadOp->getType())->getNumElements();
auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0];
// Check if the first shufflevector is DE-interleave shuffle.
if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor,
@@ -327,9 +359,9 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(

assert(Shuffle->getShuffleMask().size() <= NumLoadElements);

if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == LI)
if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == LoadOp)
Indices.push_back(Index);
if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == LI)
if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == LoadOp)
Indices.push_back(Index);
}

@@ -339,25 +371,48 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
return false;

bool BinOpShuffleChanged =
replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LI);

LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LoadOp);

// Check if the de-interleaved vp.load masks are the same.
unsigned ShuffleMaskLen = Shuffles[0]->getShuffleMask().size();
SmallVector<Constant *, 8> LaneMask(ShuffleMaskLen, nullptr);
if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
if (!isInterleavedConstantMask(
Factor, cast<ConstantVector>(VPLoad->getArgOperand(1)), LaneMask))
return false;
}

// Try to create target specific intrinsics to replace the load and shuffles.
if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor)) {
// If Extracts is not empty, tryReplaceExtracts made changes earlier.
return !Extracts.empty() || BinOpShuffleChanged;
LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n");

if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
auto *MaskVec = ConstantVector::get(LaneMask);
// Sometimes the number of Shuffles might be less than Factor, we have to
// fill the gaps with null. Also, lowerDeinterleavedVPLoad
// expects them to be sorted.
SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
Contributor:

Is this an issue that only affects vp loads? Does the non-vp lowerInterleavedLoad need to worry about this too?

Member Author:

Yes. TLI::lowerDeinterleavedVPLoad uses the number of shuffle / leaf values as the factor, which is why we have to pad ShuffleValues with null until it has a size of Factor. By contrast, TLI::lowerInterleavedLoad has Factor passed in explicitly, and it also has the Indices argument to specify the order of the shufflevector instructions.

Contributor (@lukel97, May 1, 2025):

Ah I see, thanks for explaining it. Would it be simpler to add a Factor argument to lowerDeinterleavedVPLoad to bring it in line with lowerInterleavedLoad? Non-blocking, just a suggestion.

EDIT: Actually I think this is probably best left to a separate PR where we clean up the TTI hooks, like you originally suggested.
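To illustrate the padding described in this thread, here is a standalone sketch (assumed Factor, Indices, and placeholder shuffle names; not part of the patch). Because lowerDeinterleavedVPLoad recovers the factor from the size of the leaf container, missing lanes must remain as null entries rather than being dropped.

#include <cstdio>
#include <vector>

int main() {
  const unsigned Factor = 4;
  std::vector<unsigned> Indices = {0, 2};            // lanes that actually have de-interleave shuffles
  std::vector<const char *> Shuffles = {"S0", "S2"}; // stand-ins for the shufflevector instructions

  // Pad to Factor entries so the callee can infer the factor from the size.
  std::vector<const char *> ShuffleValues(Factor, nullptr);
  for (unsigned I = 0; I < Indices.size(); ++I)
    ShuffleValues[Indices[I]] = Shuffles[I];

  for (const char *V : ShuffleValues)
    std::printf("%s ", V ? V : "null"); // prints: S0 null S2 null
  std::printf("\n");
  return 0;
}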

if (!TLI->lowerDeinterleavedVPLoad(VPLoad, MaskVec, ShuffleValues))
// If Extracts is not empty, tryReplaceExtracts made changes earlier.
return !Extracts.empty() || BinOpShuffleChanged;
} else {
// Try to create target specific intrinsics to replace the load and
// shuffles.
if (!TLI->lowerInterleavedLoad(cast<LoadInst>(LoadOp), Shuffles, Indices,
Factor))
// If Extracts is not empty, tryReplaceExtracts made changes earlier.
return !Extracts.empty() || BinOpShuffleChanged;
}

DeadInsts.insert_range(Shuffles);

DeadInsts.insert(LI);
DeadInsts.insert(LoadOp);
return true;
}

bool InterleavedAccessImpl::replaceBinOpShuffles(
ArrayRef<ShuffleVectorInst *> BinOpShuffles,
SmallVectorImpl<ShuffleVectorInst *> &Shuffles, LoadInst *LI) {
SmallVectorImpl<ShuffleVectorInst *> &Shuffles, Instruction *LoadOp) {
for (auto *SVI : BinOpShuffles) {
BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
Type *BIOp0Ty = BI->getOperand(0)->getType();
Expand All @@ -380,9 +435,9 @@ bool InterleavedAccessImpl::replaceBinOpShuffles(
<< "\n With : " << *NewSVI1 << "\n And : "
<< *NewSVI2 << "\n And : " << *NewBI << "\n");
RecursivelyDeleteTriviallyDeadInstructions(SVI);
if (NewSVI1->getOperand(0) == LI)
if (NewSVI1->getOperand(0) == LoadOp)
Shuffles.push_back(NewSVI1);
if (NewSVI2->getOperand(0) == LI)
if (NewSVI2->getOperand(0) == LoadOp)
Shuffles.push_back(NewSVI2);
}

@@ -454,27 +509,78 @@ bool InterleavedAccessImpl::tryReplaceExtracts(
}

bool InterleavedAccessImpl::lowerInterleavedStore(
StoreInst *SI, SmallSetVector<Instruction *, 32> &DeadInsts) {
if (!SI->isSimple())
return false;
Instruction *StoreOp, SmallSetVector<Instruction *, 32> &DeadInsts) {
Value *StoredValue;
if (auto *SI = dyn_cast<StoreInst>(StoreOp)) {
if (!SI->isSimple())
return false;
StoredValue = SI->getValueOperand();
} else if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
assert(VPStore->getIntrinsicID() == Intrinsic::vp_store);
// Require a constant mask.
if (!isa<ConstantVector>(VPStore->getArgOperand(2)))
Contributor:

Suggested change
if (!isa<ConstantVector>(VPStore->getArgOperand(2)))
if (!isa<ConstantVector>(VPStore->getMaskParam()))

Member Author:

Fixed.

return false;
StoredValue = VPStore->getArgOperand(0);
} else {
llvm_unreachable("unsupported store operation");
}

auto *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
auto *SVI = dyn_cast<ShuffleVectorInst>(StoredValue);
if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
return false;

unsigned NumStoredElements =
cast<FixedVectorType>(SVI->getType())->getNumElements();
// Check if the shufflevector is RE-interleave shuffle.
unsigned Factor;
if (!isReInterleaveMask(SVI, Factor, MaxFactor))
return false;
Comment on lines 522 to 524
Contributor:

Not for this PR, but just noting that the VP and non-VP paths in this function don't really share much code at all. It's probably justified if you want to split these out into two separate functions; lowerInterleavedLoad might need more refactoring, though.

assert(NumStoredElements % Factor == 0 &&
"number of stored element should be a multiple of Factor");

// Check if the de-interleaved vp.store masks are the same.
unsigned LaneMaskLen = NumStoredElements / Factor;
SmallVector<Constant *, 8> LaneMask(LaneMaskLen, nullptr);
if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
Collaborator:

empty block?

Member Author:

This is likely caused by the git merge (which makes me worry there are other places like this). I'll remove it in the next update.

Member Author:

It's fixed now.

if (!isInterleavedConstantMask(
Factor, cast<ConstantVector>(VPStore->getArgOperand(2)), LaneMask))
return false;
}

LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp << "\n");

// Try to create target specific intrinsics to replace the store and shuffle.
if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
return false;
if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
IRBuilder<> Builder(VPStore);
// We need to effectively de-interleave the shufflemask
// because lowerInterleavedVPStore expected individual de-interleaved
Collaborator:

expected -> expects?

Member Author:

Fixed.

// values.
SmallVector<Value *, 10> NewShuffles;
SmallVector<int, 16> NewShuffleMask(LaneMaskLen);
auto ShuffleMask = SVI->getShuffleMask();

for (unsigned i = 0; i < Factor; i++) {
for (unsigned j = 0; j < LaneMaskLen; j++)
NewShuffleMask[j] = ShuffleMask[i + Factor * j];

NewShuffles.push_back(Builder.CreateShuffleVector(
SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask));
Comment on lines +550 to +551
Contributor:

Just noting it would be nice not to have to create the shuffles in case TLI->lowerInterleavedVPStore bails and we end up leaving them around. But I don't have any good suggestions for now.

Member Author:

Yeah, I believe a similar thing happens in InterleavedAccessImpl::lowerInterleavedLoad: we might create new shufflevectors when canonicalizing the extractions, even if we bail out later in TLI::lowerInterleavedLoad.

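As a standalone sketch of the per-lane mask extraction in the loop above (assumed Factor, LaneMaskLen, and mask values; not part of the patch), the index expression ShuffleMask[i + Factor * j] peels one field out of a re-interleave mask on each iteration of the outer loop:

#include <cstdio>
#include <vector>

int main() {
  const unsigned Factor = 3, LaneMaskLen = 4;
  // Re-interleave mask over three concatenated 4-element vectors.
  std::vector<int> ShuffleMask = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};

  for (unsigned i = 0; i < Factor; i++) {
    std::vector<int> NewShuffleMask(LaneMaskLen);
    for (unsigned j = 0; j < LaneMaskLen; j++)
      NewShuffleMask[j] = ShuffleMask[i + Factor * j];
    for (int M : NewShuffleMask)
      std::printf("%d ", M); // prints 0 1 2 3, then 4 5 6 7, then 8 9 10 11
    std::printf("\n");
  }
  return 0;
}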
}

// Try to create target specific intrinsics to replace the vp.store and
// shuffle.
if (!TLI->lowerInterleavedVPStore(VPStore, ConstantVector::get(LaneMask),
NewShuffles))
// We already created new shuffles.
return true;
} else {
// Try to create target specific intrinsics to replace the store and
// shuffle.
if (!TLI->lowerInterleavedStore(cast<StoreInst>(StoreOp), SVI, Factor))
return false;
}

// Already have a new target specific interleaved store. Erase the old store.
DeadInsts.insert(SI);
DeadInsts.insert(StoreOp);
DeadInsts.insert(SVI);
return true;
}
@@ -686,8 +792,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(

// Since lowerInterleaveLoad expects Shuffles and LoadInst, use special
// TLI function to emit target-specific interleaved instruction.
if (!TLI->lowerDeinterleavedIntrinsicToVPLoad(VPLoad, Mask,
DeinterleaveValues))
if (!TLI->lowerDeinterleavedVPLoad(VPLoad, Mask, DeinterleaveValues))
return false;

} else {
@@ -739,8 +844,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(

// Since lowerInterleavedStore expects Shuffle and StoreInst, use special
// TLI function to emit target-specific interleaved instruction.
if (!TLI->lowerInterleavedIntrinsicToVPStore(VPStore, Mask,
InterleaveValues))
if (!TLI->lowerInterleavedVPStore(VPStore, Mask, InterleaveValues))
return false;
} else {
auto *SI = cast<StoreInst>(StoredBy);
@@ -766,12 +870,15 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
SmallSetVector<Instruction *, 32> DeadInsts;
bool Changed = false;

using namespace PatternMatch;
for (auto &I : instructions(F)) {
if (auto *LI = dyn_cast<LoadInst>(&I))
Changed |= lowerInterleavedLoad(LI, DeadInsts);
if (match(&I, m_CombineOr(m_Load(m_Value()),
m_Intrinsic<Intrinsic::vp_load>())))
Changed |= lowerInterleavedLoad(&I, DeadInsts);

if (auto *SI = dyn_cast<StoreInst>(&I))
Changed |= lowerInterleavedStore(SI, DeadInsts);
if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()),
m_Intrinsic<Intrinsic::vp_store>())))
Changed |= lowerInterleavedStore(&I, DeadInsts);

if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
// At present, we only have intrinsics to represent (de)interleaving