From 69389a32fec23647acce371d2e5bc3740c0dcd6b Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 11 Apr 2025 11:45:19 -0700 Subject: [PATCH 01/12] [IA][RISCV] Add support for vp.load/vp.store with shufflevector --- llvm/include/llvm/CodeGen/TargetLowering.h | 9 +- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 177 +++++-- .../Target/AArch64/AArch64ISelLowering.cpp | 12 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 4 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 12 +- llvm/lib/Target/ARM/ARMISelLowering.h | 4 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 79 +++- llvm/lib/Target/RISCV/RISCVISelLowering.h | 4 +- llvm/lib/Target/X86/X86ISelLowering.h | 4 +- llvm/lib/Target/X86/X86InterleavedAccess.cpp | 12 +- .../rvv/fixed-vectors-interleaved-access.ll | 443 ++++++++++++++++-- 11 files changed, 661 insertions(+), 99 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 4f2f202f94841..5407bf8b2ba13 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3157,11 +3157,11 @@ class TargetLoweringBase { /// Lower an interleaved load to target specific intrinsics. Return /// true on success. /// - /// \p LI is the vector load instruction. + /// \p LoadOp is a vector load or vp.load instruction. /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector. /// \p Indices is the corresponding indices for each shufflevector. /// \p Factor is the interleave factor. - virtual bool lowerInterleavedLoad(LoadInst *LI, + virtual bool lowerInterleavedLoad(Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { @@ -3171,10 +3171,11 @@ class TargetLoweringBase { /// Lower an interleaved store to target specific intrinsics. Return /// true on success. /// - /// \p SI is the vector store instruction. + /// \p StoreOp is a vector store or vp.store instruction. /// \p SVI is the shufflevector to RE-interleave the stored vector. /// \p Factor is the interleave factor. - virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + virtual bool lowerInterleavedStore(Instruction *StoreOp, + ShuffleVectorInst *SVI, unsigned Factor) const { return false; } diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 9e47510e9cd1a..83bde96cc725a 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -45,6 +45,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" @@ -100,11 +101,11 @@ class InterleavedAccessImpl { unsigned MaxFactor = 0u; /// Transform an interleaved load into target specific intrinsics. - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *LoadOp, SmallSetVector &DeadInsts); /// Transform an interleaved store into target specific intrinsics. - bool lowerInterleavedStore(StoreInst *SI, + bool lowerInterleavedStore(Instruction *StoreOp, SmallSetVector &DeadInsts); /// Transform a load and a deinterleave intrinsic into target specific @@ -131,7 +132,7 @@ class InterleavedAccessImpl { /// made. 
bool replaceBinOpShuffles(ArrayRef BinOpShuffles, SmallVectorImpl &Shuffles, - LoadInst *LI); + Instruction *LI); }; class InterleavedAccess : public FunctionPass { @@ -250,10 +251,23 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, } bool InterleavedAccessImpl::lowerInterleavedLoad( - LoadInst *LI, SmallSetVector &DeadInsts) { - if (!LI->isSimple() || isa(LI->getType())) + Instruction *LoadOp, SmallSetVector &DeadInsts) { + if (isa(LoadOp->getType())) return false; + if (auto *LI = dyn_cast(LoadOp)) { + if (!LI->isSimple()) + return false; + } else if (auto *VPLoad = dyn_cast(LoadOp)) { + assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load); + // Require a constant mask and evl. + if (!isa(VPLoad->getArgOperand(1)) || + !isa(VPLoad->getArgOperand(2))) + return false; + } else { + llvm_unreachable("unsupported load operation"); + } + // Check if all users of this load are shufflevectors. If we encounter any // users that are extractelement instructions or binary operators, we save // them to later check if they can be modified to extract from one of the @@ -265,7 +279,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( // binop are the same load. SmallSetVector BinOpShuffles; - for (auto *User : LI->users()) { + for (auto *User : LoadOp->users()) { auto *Extract = dyn_cast(User); if (Extract && isa(Extract->getIndexOperand())) { Extracts.push_back(Extract); @@ -294,13 +308,31 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( unsigned Factor, Index; unsigned NumLoadElements = - cast(LI->getType())->getNumElements(); + cast(LoadOp->getType())->getNumElements(); auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0]; // Check if the first shufflevector is DE-interleave shuffle. if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor, NumLoadElements)) return false; + // If this is a vp.load, record its mask (NOT shuffle mask). + BitVector MaskedIndices(NumLoadElements); + if (auto *VPLoad = dyn_cast(LoadOp)) { + auto *Mask = cast(VPLoad->getArgOperand(1)); + assert(cast(Mask->getType())->getNumElements() == + NumLoadElements); + if (auto *Splat = Mask->getSplatValue()) { + // All-zeros mask, bail out early. + if (Splat->isZeroValue()) + return false; + } else { + for (unsigned i = 0U; i < NumLoadElements; ++i) { + if (Mask->getAggregateElement(i)->isZeroValue()) + MaskedIndices.set(i); + } + } + } + // Holds the corresponding index for each DE-interleave shuffle. SmallVector Indices; @@ -327,9 +359,9 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( assert(Shuffle->getShuffleMask().size() <= NumLoadElements); - if (cast(Shuffle->getOperand(0))->getOperand(0) == LI) + if (cast(Shuffle->getOperand(0))->getOperand(0) == LoadOp) Indices.push_back(Index); - if (cast(Shuffle->getOperand(0))->getOperand(1) == LI) + if (cast(Shuffle->getOperand(0))->getOperand(1) == LoadOp) Indices.push_back(Index); } @@ -339,25 +371,61 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( return false; bool BinOpShuffleChanged = - replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LI); + replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LoadOp); + + // Check if we extract only the unmasked elements. 
+ if (MaskedIndices.any()) { + if (any_of(Shuffles, [&](const auto *Shuffle) { + ArrayRef ShuffleMask = Shuffle->getShuffleMask(); + for (int Idx : ShuffleMask) { + if (Idx < 0) + continue; + if (MaskedIndices.test(unsigned(Idx))) + return true; + } + return false; + })) { + LLVM_DEBUG(dbgs() << "IA: trying to extract a masked element through " + << "shufflevector\n"); + return false; + } + } + // Check if we extract only the elements within evl. + if (auto *VPLoad = dyn_cast(LoadOp)) { + uint64_t EVL = cast(VPLoad->getArgOperand(2))->getZExtValue(); + if (any_of(Shuffles, [&](const auto *Shuffle) { + ArrayRef ShuffleMask = Shuffle->getShuffleMask(); + for (int Idx : ShuffleMask) { + if (Idx < 0) + continue; + if (unsigned(Idx) >= EVL) + return true; + } + return false; + })) { + LLVM_DEBUG( + dbgs() << "IA: trying to extract an element out of EVL range\n"); + return false; + } + } - LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n"); + LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n"); // Try to create target specific intrinsics to replace the load and shuffles. - if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor)) { + if (!TLI->lowerInterleavedLoad(LoadOp, Shuffles, Indices, Factor)) { // If Extracts is not empty, tryReplaceExtracts made changes earlier. return !Extracts.empty() || BinOpShuffleChanged; } DeadInsts.insert_range(Shuffles); - DeadInsts.insert(LI); + DeadInsts.insert(LoadOp); return true; } bool InterleavedAccessImpl::replaceBinOpShuffles( ArrayRef BinOpShuffles, - SmallVectorImpl &Shuffles, LoadInst *LI) { + SmallVectorImpl &Shuffles, Instruction *LoadOp) { for (auto *SVI : BinOpShuffles) { BinaryOperator *BI = cast(SVI->getOperand(0)); Type *BIOp0Ty = BI->getOperand(0)->getType(); @@ -380,9 +448,9 @@ bool InterleavedAccessImpl::replaceBinOpShuffles( << "\n With : " << *NewSVI1 << "\n And : " << *NewSVI2 << "\n And : " << *NewBI << "\n"); RecursivelyDeleteTriviallyDeadInstructions(SVI); - if (NewSVI1->getOperand(0) == LI) + if (NewSVI1->getOperand(0) == LoadOp) Shuffles.push_back(NewSVI1); - if (NewSVI2->getOperand(0) == LI) + if (NewSVI2->getOperand(0) == LoadOp) Shuffles.push_back(NewSVI2); } @@ -454,27 +522,79 @@ bool InterleavedAccessImpl::tryReplaceExtracts( } bool InterleavedAccessImpl::lowerInterleavedStore( - StoreInst *SI, SmallSetVector &DeadInsts) { - if (!SI->isSimple()) - return false; + Instruction *StoreOp, SmallSetVector &DeadInsts) { + Value *StoredValue; + if (auto *SI = dyn_cast(StoreOp)) { + if (!SI->isSimple()) + return false; + StoredValue = SI->getValueOperand(); + } else if (auto *VPStore = dyn_cast(StoreOp)) { + assert(VPStore->getIntrinsicID() == Intrinsic::vp_store); + // Require a constant mask and evl. + if (!isa(VPStore->getArgOperand(2)) || + !isa(VPStore->getArgOperand(3))) + return false; + StoredValue = VPStore->getArgOperand(0); + } else { + llvm_unreachable("unsupported store operation"); + } - auto *SVI = dyn_cast(SI->getValueOperand()); + auto *SVI = dyn_cast(StoredValue); if (!SVI || !SVI->hasOneUse() || isa(SVI->getType())) return false; + unsigned NumStoredElements = + cast(SVI->getType())->getNumElements(); + // If this is a vp.store, record its mask (NOT shuffle mask). + BitVector MaskedIndices(NumStoredElements); + if (auto *VPStore = dyn_cast(StoreOp)) { + auto *Mask = cast(VPStore->getArgOperand(2)); + assert(cast(Mask->getType())->getNumElements() == + NumStoredElements); + if (auto *Splat = Mask->getSplatValue()) { + // All-zeros mask, bail out early. 
+ if (Splat->isZeroValue()) + return false; + } else { + for (unsigned i = 0U; i < NumStoredElements; ++i) { + if (Mask->getAggregateElement(i)->isZeroValue()) + MaskedIndices.set(i); + } + } + } + // Check if the shufflevector is RE-interleave shuffle. unsigned Factor; if (!isReInterleaveMask(SVI, Factor, MaxFactor)) return false; - LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n"); + // Check if we store only the unmasked elements. + if (MaskedIndices.any()) { + if (any_of(SVI->getShuffleMask(), [&](int Idx) { + return Idx >= 0 && MaskedIndices.test(unsigned(Idx)); + })) { + LLVM_DEBUG(dbgs() << "IA: trying to store a masked element\n"); + return false; + } + } + // Check if we store only the elements within evl. + if (auto *VPStore = dyn_cast(StoreOp)) { + uint64_t EVL = cast(VPStore->getArgOperand(3))->getZExtValue(); + if (any_of(SVI->getShuffleMask(), + [&](int Idx) { return Idx >= 0 && unsigned(Idx) >= EVL; })) { + LLVM_DEBUG(dbgs() << "IA: trying to store an element out of EVL range\n"); + return false; + } + } + + LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp << "\n"); // Try to create target specific intrinsics to replace the store and shuffle. - if (!TLI->lowerInterleavedStore(SI, SVI, Factor)) + if (!TLI->lowerInterleavedStore(StoreOp, SVI, Factor)) return false; // Already have a new target specific interleaved store. Erase the old store. - DeadInsts.insert(SI); + DeadInsts.insert(StoreOp); DeadInsts.insert(SVI); return true; } @@ -766,12 +886,15 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) { SmallSetVector DeadInsts; bool Changed = false; + using namespace PatternMatch; for (auto &I : instructions(F)) { - if (auto *LI = dyn_cast(&I)) - Changed |= lowerInterleavedLoad(LI, DeadInsts); + if (match(&I, m_CombineOr(m_Load(m_Value()), + m_Intrinsic()))) + Changed |= lowerInterleavedLoad(&I, DeadInsts); - if (auto *SI = dyn_cast(&I)) - Changed |= lowerInterleavedStore(SI, DeadInsts); + if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()), + m_Intrinsic()))) + Changed |= lowerInterleavedStore(&I, DeadInsts); if (auto *II = dyn_cast(&I)) { // At present, we only have intrinsics to represent (de)interleaving diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e366d7cb54490..d74cc3161684d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17176,7 +17176,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef Shuffles, + Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -17184,6 +17184,10 @@ bool AArch64TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast(LoadOp); + if (!LI) + return false; + const DataLayout &DL = LI->getDataLayout(); VectorType *VTy = Shuffles[0]->getType(); @@ -17359,13 +17363,17 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) { /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> /// call void 
llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) -bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, +bool AArch64TargetLowering::lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); + auto *SI = dyn_cast(StoreOp); + if (!SI) + return false; + auto *VecTy = cast(SVI->getType()); assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 0d51ef2be8631..34446abb1474c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -716,11 +716,11 @@ class AArch64TargetLowering : public TargetLowering { unsigned getMaxSupportedInterleaveFactor() const override { return 4; } - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; - bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, unsigned Factor) const override; bool lowerDeinterleaveIntrinsicToLoad( diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 2290ac2728c6d..64d12a0eb1d9b 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21608,7 +21608,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef Shuffles, + Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -21616,6 +21616,10 @@ bool ARMTargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast(LoadOp); + if (!LI) + return false; + auto *VecTy = cast(Shuffles[0]->getType()); Type *EltTy = VecTy->getElementType(); @@ -21750,12 +21754,16 @@ bool ARMTargetLowering::lowerInterleavedLoad( /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) -bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, +bool ARMTargetLowering::lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); + auto *SI = dyn_cast(StoreOp); + if (!SI) + return false; + auto *VecTy = cast(SVI->getType()); assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 9fad056edd3f1..635a6cd226936 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -673,11 +673,11 @@ class VectorType; unsigned getMaxSupportedInterleaveFactor() const override; - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; - bool 
lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, unsigned Factor) const override; bool shouldInsertFencesForAtomic(const Instruction *I) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f7d192756fd56..9558783963500 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -23376,19 +23376,36 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = { /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool RISCVTargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef Shuffles, + Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Indices.size() == Shuffles.size()); - IRBuilder<> Builder(LI); + const DataLayout &DL = LoadOp->getDataLayout(); + IRBuilder<> Builder(LoadOp); auto *VTy = cast(Shuffles[0]->getType()); - if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(), - LI->getPointerAddressSpace(), - LI->getDataLayout())) + + Align PtrAlignment; + unsigned PtrAddrSpace; + Value *BaseAddr; + if (auto *LI = dyn_cast(LoadOp)) { + BaseAddr = LI->getPointerOperand(); + PtrAlignment = LI->getAlign(); + PtrAddrSpace = LI->getPointerAddressSpace(); + } else { + auto *VPLoad = cast(LoadOp); + assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load); + BaseAddr = VPLoad->getArgOperand(0); + PtrAlignment = VPLoad->getParamAlign(0).value_or( + DL.getABITypeAlign(VTy->getElementType())); + PtrAddrSpace = cast(BaseAddr->getType())->getAddressSpace(); + } + + if (!isLegalInterleavedAccessType(VTy, Factor, PtrAlignment, PtrAddrSpace, + DL)) return false; - auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); + auto *XLenTy = Type::getIntNTy(LoadOp->getContext(), Subtarget.getXLen()); // If the segment load is going to be performed segment at a time anyways // and there's only one element used, use a strided load instead. 
This @@ -23397,7 +23414,7 @@ bool RISCVTargetLowering::lowerInterleavedLoad( unsigned ScalarSizeInBytes = VTy->getScalarSizeInBits() / 8; Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset); + Value *BasePtr = Builder.CreatePtrAdd(BaseAddr, Offset); Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); Value *VL = Builder.getInt32(VTy->getNumElements()); @@ -23406,16 +23423,16 @@ bool RISCVTargetLowering::lowerInterleavedLoad( {VTy, BasePtr->getType(), Stride->getType()}, {BasePtr, Stride, Mask, VL}); CI->addParamAttr( - 0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign())); + 0, Attribute::getWithAlignment(CI->getContext(), PtrAlignment)); Shuffles[0]->replaceAllUsesWith(CI); return true; }; Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); - CallInst *VlsegN = Builder.CreateIntrinsic( - FixedVlsegIntrIds[Factor - 2], {VTy, LI->getPointerOperandType(), XLenTy}, - {LI->getPointerOperand(), VL}); + CallInst *VlsegN = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], + {VTy, BaseAddr->getType(), XLenTy}, + {BaseAddr, VL}); for (unsigned i = 0; i < Shuffles.size(); i++) { Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); @@ -23447,21 +23464,39 @@ static const Intrinsic::ID FixedVssegIntrIds[] = { /// /// Note that the new shufflevectors will be removed and we'll only generate one /// vsseg3 instruction in CodeGen. -bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, +bool RISCVTargetLowering::lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, unsigned Factor) const { - IRBuilder<> Builder(SI); + const DataLayout &DL = StoreOp->getDataLayout(); + IRBuilder<> Builder(StoreOp); + auto Mask = SVI->getShuffleMask(); auto *ShuffleVTy = cast(SVI->getType()); // Given SVI : , then VTy : auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(), ShuffleVTy->getNumElements() / Factor); - if (!isLegalInterleavedAccessType(VTy, Factor, SI->getAlign(), - SI->getPointerAddressSpace(), - SI->getDataLayout())) + + Align PtrAlignment; + unsigned PtrAddrSpace; + Value *BaseAddr; + if (auto *SI = dyn_cast(StoreOp)) { + BaseAddr = SI->getPointerOperand(); + PtrAlignment = SI->getAlign(); + PtrAddrSpace = SI->getPointerAddressSpace(); + } else { + auto *VPStore = cast(StoreOp); + assert(VPStore->getIntrinsicID() == Intrinsic::vp_store); + BaseAddr = VPStore->getArgOperand(1); + PtrAlignment = VPStore->getParamAlign(1).value_or( + DL.getABITypeAlign(VTy->getElementType())); + PtrAddrSpace = cast(BaseAddr->getType())->getAddressSpace(); + } + + if (!isLegalInterleavedAccessType(VTy, Factor, PtrAlignment, PtrAddrSpace, + DL)) return false; - auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); + auto *XLenTy = Type::getIntNTy(StoreOp->getContext(), Subtarget.getXLen()); unsigned Index; // If the segment store only has one active lane (i.e. 
the interleave is @@ -23474,7 +23509,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, auto *DataVTy = cast(Data->getType()); Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset); + Value *BasePtr = Builder.CreatePtrAdd(BaseAddr, Offset); Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount()); Value *VL = Builder.getInt32(VTy->getNumElements()); @@ -23483,14 +23518,14 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, {Data->getType(), BasePtr->getType(), Stride->getType()}, {Data, BasePtr, Stride, Mask, VL}); CI->addParamAttr( - 1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign())); + 1, Attribute::getWithAlignment(CI->getContext(), PtrAlignment)); return true; } Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), FixedVssegIntrIds[Factor - 2], - {VTy, SI->getPointerOperandType(), XLenTy}); + StoreOp->getModule(), FixedVssegIntrIds[Factor - 2], + {VTy, BaseAddr->getType(), XLenTy}); SmallVector Ops; SmallVector NewShuffleMask; @@ -23510,7 +23545,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, // potentially under larger LMULs) because we checked that the fixed vector // type fits in isLegalInterleavedAccessType Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); - Ops.append({SI->getPointerOperand(), VL}); + Ops.append({BaseAddr, VL}); Builder.CreateCall(VssegNFunc, Ops); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 5ebdbbd51f2b1..5df510c08e0da 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -903,12 +903,12 @@ class RISCVTargetLowering : public TargetLowering { bool fallBackToDAGISel(const Instruction &Inst) const override; - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; - bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, unsigned Factor) const override; bool lowerDeinterleaveIntrinsicToLoad( diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 4a2b35e9efe7c..da5baaf1bf5af 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1650,14 +1650,14 @@ namespace llvm { /// Lower interleaved load(s) into target specific /// instructions/intrinsics. - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; /// Lower interleaved store(s) into target specific /// instructions/intrinsics. - bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, unsigned Factor) const override; SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 1eb47e3b2cd18..99fbc249a1179 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -801,7 +801,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // number of shuffles and ISA. 
// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. bool X86TargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef Shuffles, + Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -809,6 +809,10 @@ bool X86TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast(LoadOp); + if (!LI) + return false; + // Create an interleaved access group. IRBuilder<> Builder(LI); X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget, @@ -817,7 +821,7 @@ bool X86TargetLowering::lowerInterleavedLoad( return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); } -bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, +bool X86TargetLowering::lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && @@ -827,6 +831,10 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, 0 && "Invalid interleaved store"); + auto *SI = dyn_cast(StoreOp); + if (!SI) + return false; + // Holds the indices of SVI that correspond to the starting index of each // interleaved shuffle. auto Mask = SVI->getShuffleMask(); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index f6bdd45330384..dc7ca2317ec9a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -176,6 +176,164 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res7 } +define {<4 x i32>, <4 x i32>} @vpload_factor2(ptr %ptr) { +; CHECK-LABEL: vpload_factor2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg2e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 8) + %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> + %v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + + +define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3(ptr %ptr) { +; CHECK-LABEL: vpload_factor3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg3e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +; Load a larger vector but only deinterleave a subset of the elements. 
+define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_v16i32(ptr %ptr) { +; CHECK-LABEL: vpload_factor3_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg3e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <16 x i32> @llvm.vp.load.v16i32.p0(ptr %ptr, <16 x i1> , i32 12) + %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) { +; CHECK-LABEL: vpload_factor4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg4e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <16 x i32> @llvm.vp.load.v16i32.p0(ptr %ptr, <16 x i1> splat (i1 true), i32 16) + %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3 + ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor5(ptr %ptr) { +; CHECK-LABEL: vpload_factor5: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg5e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <20 x i32> @llvm.vp.load.v20i32.p0(ptr %ptr, <20 x i1> splat (i1 true), i32 20) + %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v1 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v2 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v3 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v4 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3 + %res4 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3, <4 x i32> %v4, 4 + ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res4 +} + +define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vpload_factor6(ptr %ptr) { +; CHECK-LABEL: vpload_factor6: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vlseg6e16.v v8, (a0) +; CHECK-NEXT: 
ret + %interleaved.vec = tail call <12 x i16> @llvm.vp.load.v12i16.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12) + %v0 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v1 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v2 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v3 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v4 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v5 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %res0 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} undef, <2 x i16> %v0, 0 + %res1 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res0, <2 x i16> %v1, 1 + %res2 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res1, <2 x i16> %v2, 2 + %res3 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res2, <2 x i16> %v3, 3 + %res4 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res3, <2 x i16> %v4, 4 + %res5 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res4, <2 x i16> %v5, 5 + ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res5 +} + +define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vpload_factor7(ptr %ptr) { +; CHECK-LABEL: vpload_factor7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vlseg7e16.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <14 x i16> @llvm.vp.load.v14i16.p0(ptr %ptr, <14 x i1> splat (i1 true), i32 14) + %v0 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v1 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v2 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v3 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v4 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v5 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v6 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %res0 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} undef, <2 x i16> %v0, 0 + %res1 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res0, <2 x i16> %v1, 1 + %res2 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res1, <2 x i16> %v2, 2 + %res3 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res2, <2 x i16> %v3, 3 + %res4 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res3, <2 x i16> %v4, 4 + %res5 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res4, <2 x i16> %v5, 5 + %res6 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res5, <2 x i16> %v6, 6 + ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res6 +} + +define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vpload_factor8(ptr %ptr) { +; CHECK-LABEL: vpload_factor8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vlseg8e16.v v8, (a0) +; CHECK-NEXT: ret + 
%interleaved.vec = tail call <16 x i16> @llvm.vp.load.v16i16.p0(ptr %ptr, <16 x i1> splat (i1 true), i32 16) + %v0 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v1 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v2 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v3 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v4 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v5 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v6 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v7 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %res0 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} undef, <2 x i16> %v0, 0 + %res1 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res0, <2 x i16> %v1, 1 + %res2 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res1, <2 x i16> %v2, 2 + %res3 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res2, <2 x i16> %v3, 3 + %res4 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res3, <2 x i16> %v4, 4 + %res5 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res4, <2 x i16> %v5, 5 + %res6 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res5, <2 x i16> %v6, 6 + %res7 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res6, <2 x i16> %v7, 7 + ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res7 +} + ; LMUL * NF is > 8 here and so shouldn't be lowered to a vlseg define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_factor6_too_big(ptr %ptr) { ; RV32-LABEL: load_factor6_too_big: @@ -192,8 +350,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a2, 32 ; RV32-NEXT: lui a3, 12 ; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI8_0) -; RV32-NEXT: addi a7, a7, %lo(.LCPI8_0) +; RV32-NEXT: lui a7, %hi(.LCPI16_0) +; RV32-NEXT: addi a7, a7, %lo(.LCPI16_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a5) ; RV32-NEXT: vmv.s.x v0, a3 @@ -278,12 +436,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: lui a7, 49164 -; RV32-NEXT: lui a1, %hi(.LCPI8_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_1) +; RV32-NEXT: lui a1, %hi(.LCPI16_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI16_1) ; RV32-NEXT: lui t2, 3 ; RV32-NEXT: lui t1, 196656 -; RV32-NEXT: lui a4, %hi(.LCPI8_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI8_3) +; RV32-NEXT: lui a4, %hi(.LCPI16_3) +; RV32-NEXT: addi a4, a4, %lo(.LCPI16_3) ; RV32-NEXT: lui t0, 786624 ; RV32-NEXT: li a5, 48 ; RV32-NEXT: lui a6, 768 @@ -462,8 +620,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v24, v8, v2 -; RV32-NEXT: lui a1, %hi(.LCPI8_2) -; RV32-NEXT: addi a1, a1, 
%lo(.LCPI8_2) +; RV32-NEXT: lui a1, %hi(.LCPI16_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI16_2) ; RV32-NEXT: lui a3, 3073 ; RV32-NEXT: addi a3, a3, -1024 ; RV32-NEXT: vmv.s.x v0, a3 @@ -527,16 +685,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v28, v8, v3 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v28, v24 -; RV32-NEXT: lui a1, %hi(.LCPI8_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_4) -; RV32-NEXT: lui a2, %hi(.LCPI8_5) -; RV32-NEXT: addi a2, a2, %lo(.LCPI8_5) +; RV32-NEXT: lui a1, %hi(.LCPI16_4) +; RV32-NEXT: addi a1, a1, %lo(.LCPI16_4) +; RV32-NEXT: lui a2, %hi(.LCPI16_5) +; RV32-NEXT: addi a2, a2, %lo(.LCPI16_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI8_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_7) +; RV32-NEXT: lui a1, %hi(.LCPI16_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI16_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb @@ -564,14 +722,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v16, v0, v10 -; RV32-NEXT: lui a1, %hi(.LCPI8_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_6) -; RV32-NEXT: lui a2, %hi(.LCPI8_8) -; RV32-NEXT: addi a2, a2, %lo(.LCPI8_8) +; RV32-NEXT: lui a1, %hi(.LCPI16_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI16_6) +; RV32-NEXT: lui a2, %hi(.LCPI16_8) +; RV32-NEXT: addi a2, a2, %lo(.LCPI16_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI8_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_9) +; RV32-NEXT: lui a1, %hi(.LCPI16_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI16_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v6, (a1) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma @@ -658,8 +816,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: li a4, 128 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: vle64.v v8, (a3) -; RV64-NEXT: lui a3, %hi(.LCPI8_0) -; RV64-NEXT: addi a3, a3, %lo(.LCPI8_0) +; RV64-NEXT: lui a3, %hi(.LCPI16_0) +; RV64-NEXT: addi a3, a3, %lo(.LCPI16_0) ; RV64-NEXT: vmv.s.x v0, a4 ; RV64-NEXT: csrr a4, vlenb ; RV64-NEXT: li a5, 61 @@ -847,8 +1005,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t -; RV64-NEXT: lui a2, %hi(.LCPI8_1) -; RV64-NEXT: addi a2, a2, %lo(.LCPI8_1) +; RV64-NEXT: lui a2, %hi(.LCPI16_1) +; RV64-NEXT: addi a2, a2, %lo(.LCPI16_1) ; RV64-NEXT: li a3, 192 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v6, (a2) @@ -882,8 +1040,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgatherei16.vv v24, v16, v6 ; RV64-NEXT: addi a2, sp, 16 ; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: lui a2, %hi(.LCPI8_2) -; RV64-NEXT: addi a2, a2, %lo(.LCPI8_2) +; RV64-NEXT: lui a2, %hi(.LCPI16_2) +; RV64-NEXT: addi a2, a2, %lo(.LCPI16_2) ; RV64-NEXT: li a3, 1040 ; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: addi a1, a1, -2016 @@ -967,12 +1125,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x 
i64>} @load_ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI8_3) -; RV64-NEXT: addi a1, a1, %lo(.LCPI8_3) +; RV64-NEXT: lui a1, %hi(.LCPI16_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI16_3) ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v20, (a1) -; RV64-NEXT: lui a1, %hi(.LCPI8_4) -; RV64-NEXT: addi a1, a1, %lo(.LCPI8_4) +; RV64-NEXT: lui a1, %hi(.LCPI16_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI16_4) ; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 77 @@ -1023,8 +1181,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vrgatherei16.vv v0, v16, v8 -; RV64-NEXT: lui a1, %hi(.LCPI8_5) -; RV64-NEXT: addi a1, a1, %lo(.LCPI8_5) +; RV64-NEXT: lui a1, %hi(.LCPI16_5) +; RV64-NEXT: addi a1, a1, %lo(.LCPI16_5) ; RV64-NEXT: vle16.v v20, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 61 @@ -1196,6 +1354,73 @@ define void @store_factor6(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2 ret void } +define void @vpstore_factor2(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1) { +; CHECK-LABEL: vpstore_factor2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg2e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + tail call void @llvm.vp.store.v8i32.p0(<8 x i32> %interleaved.vec, ptr %ptr, <8 x i1> splat (i1 true), i32 8) + ret void +} + +define void @vpstore_factor3(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { +; CHECK-LABEL: vpstore_factor3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg3e32.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> + tail call void @llvm.vp.store.v12i32.p0(<12 x i32> %interleaved.vec, ptr %ptr, <12 x i1> splat (i1 true), i32 12) + ret void +} + +define void @vpstore_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { +; CHECK-LABEL: vpstore_factor4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> + tail call void @llvm.vp.store.v16i32.p0(<16 x i32> %interleaved.vec, ptr %ptr, <16 x i1> splat (i1 true), i32 16) + ret void +} + +define void @vpstore_factor5(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) { +; CHECK-LABEL: vpstore_factor5: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg5e32.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> + %s2 = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> + %s3 = shufflevector <4 x i32> %v4, <4 x i32> poison, <16 x i32> + %interleaved.vec = shufflevector <16 x i32> %s2, <16 x i32> %s3, <20 x i32> + tail call void @llvm.vp.store.v20i32.p0(<20 x i32> %interleaved.vec, ptr %ptr, <20 x i1> splat (i1 true), i32 20) + ret void +} + +define 
void @vpstore_factor6(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5) { +; CHECK-LABEL: vpstore_factor6: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsseg6e16.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> + %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> + %s2 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> + %s3 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <8 x i32> + %interleaved.vec = shufflevector <8 x i16> %s2, <8 x i16> %s3, <12 x i32> + tail call void @llvm.vp.store.v12i16.p0(<12 x i16> %interleaved.vec, ptr %ptr, <12 x i1> splat (i1 true), i32 12) + ret void +} + define <4 x i32> @load_factor2_one_active(ptr %ptr) { ; CHECK-LABEL: load_factor2_one_active: @@ -1368,3 +1593,157 @@ define void @store_factor4_one_active_slidedown(ptr %ptr, <4 x i32> %v) { store <16 x i32> %v0, ptr %ptr ret void } + +; Negative tests + +define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { +; RV32-LABEL: invalid_vp_mask: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 73 +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv.s.x v11, a1 +; RV32-NEXT: lui a1, 1 +; RV32-NEXT: vmv.v.i v10, 8 +; RV32-NEXT: addi a1, a1, -33 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 146 +; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0), v0.t +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vmv.s.x v20, a1 +; RV32-NEXT: lui a1, %hi(.LCPI40_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI40_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle16.v v21, (a1) +; RV32-NEXT: vcompress.vm v8, v12, v11 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vcompress.vm v14, v12, v20 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v21 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: invalid_vp_mask: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 73 +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv.s.x v11, a1 +; RV64-NEXT: li a1, 146 +; RV64-NEXT: vmv.s.x v20, a1 +; RV64-NEXT: lui a1, 1 +; RV64-NEXT: vmv.v.i v10, 8 +; RV64-NEXT: addi a1, a1, -33 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: li a1, 36 +; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0), v0.t +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v11 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vcompress.vm v14, v12, v20 +; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> 
@llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { +; RV32-LABEL: invalid_vp_evl: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 10, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0) +; RV32-NEXT: li a0, 73 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v0, 8 +; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: li a0, 146 +; RV32-NEXT: vmv.s.x v11, a0 +; RV32-NEXT: lui a0, %hi(.LCPI41_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI41_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle16.v v20, (a0) +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vcompress.vm v8, v12, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vcompress.vm v14, v12, v11 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v20 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: invalid_vp_evl: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 10, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0) +; RV64-NEXT: li a0, 73 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 8 +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: li a0, 146 +; RV64-NEXT: vmv.s.x v11, a0 +; RV64-NEXT: li a0, 36 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vcompress.vm v14, v12, v11 +; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a0 +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 10) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} From 66118fd0b8f1372bf573f8d710bf8d82ddbffdc2 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 16 Apr 2025 09:59:00 -0700 Subject: [PATCH 02/12] fixup! 
Propagate the mask to underlying intrinsics as well --- llvm/include/llvm/CodeGen/TargetLowering.h | 20 +- llvm/include/llvm/IR/IntrinsicsRISCV.td | 20 ++ llvm/lib/CodeGen/InterleavedAccessPass.cpp | 188 +++++------ .../Target/AArch64/AArch64ISelLowering.cpp | 12 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 4 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 12 +- llvm/lib/Target/ARM/ARMISelLowering.h | 4 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 313 +++++++++++------- llvm/lib/Target/RISCV/RISCVISelLowering.h | 15 +- llvm/lib/Target/X86/X86ISelLowering.h | 4 +- llvm/lib/Target/X86/X86InterleavedAccess.cpp | 12 +- .../rvv/fixed-vectors-interleaved-access.ll | 109 +++--- .../RISCV/rvv/vp-vector-interleaved-access.ll | 53 --- 13 files changed, 397 insertions(+), 369 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 5407bf8b2ba13..6eaa1bae1da97 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3157,11 +3157,11 @@ class TargetLoweringBase { /// Lower an interleaved load to target specific intrinsics. Return /// true on success. /// - /// \p LoadOp is a vector load or vp.load instruction. + /// \p LI is the vector load instruction. /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector. /// \p Indices is the corresponding indices for each shufflevector. /// \p Factor is the interleave factor. - virtual bool lowerInterleavedLoad(Instruction *LoadOp, + virtual bool lowerInterleavedLoad(LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { @@ -3171,24 +3171,23 @@ class TargetLoweringBase { /// Lower an interleaved store to target specific intrinsics. Return /// true on success. /// - /// \p StoreOp is a vector store or vp.store instruction. + /// \p SI is the vector store instruction. /// \p SVI is the shufflevector to RE-interleave the stored vector. /// \p Factor is the interleave factor. - virtual bool lowerInterleavedStore(Instruction *StoreOp, - ShuffleVectorInst *SVI, + virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { return false; } - /// Lower an interleaved load to target specific intrinsics. Return + /// Lower a deinterleaved load to target specific intrinsics. Return /// true on success. /// /// \p Load is a vp.load instruction. /// \p Mask is a mask value /// \p DeinterleaveRes is a list of deinterleaved results. virtual bool - lowerDeinterleavedIntrinsicToVPLoad(VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveRes) const { + lowerDeinterleavedVPLoad(VPIntrinsic *Load, Value *Mask, + ArrayRef DeinterleaveRes) const { return false; } @@ -3198,9 +3197,8 @@ class TargetLoweringBase { /// \p Store is the vp.store instruction. /// \p Mask is a mask value /// \p InterleaveOps is a list of values being interleaved. - virtual bool - lowerInterleavedIntrinsicToVPStore(VPIntrinsic *Store, Value *Mask, - ArrayRef InterleaveOps) const { + virtual bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask, + ArrayRef InterleaveOps) const { return false; } diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td index 99cb557d9aa09..7da11b93f6b74 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td @@ -1705,12 +1705,23 @@ let TargetPrefix = "riscv" in { // Segment loads/stores for fixed vectors. 
foreach nf = [2, 3, 4, 5, 6, 7, 8] in { + // Input: (pointer, vl) def int_riscv_seg # nf # _load : DefaultAttrsIntrinsic, !add(nf, -1))), [llvm_anyptr_ty, llvm_anyint_ty], [NoCapture>, IntrReadMem]>; + // Input: (pointer, mask, vl) + def int_riscv_seg # nf # _load_mask + : DefaultAttrsIntrinsic, + !add(nf, -1))), + [llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyint_ty], + [NoCapture>, IntrReadMem]>; + + // Input: (, pointer, vl) def int_riscv_seg # nf # _store : DefaultAttrsIntrinsic<[], !listconcat([llvm_anyvector_ty], @@ -1718,6 +1729,15 @@ let TargetPrefix = "riscv" in { !add(nf, -1)), [llvm_anyptr_ty, llvm_anyint_ty]), [NoCapture>, IntrWriteMem]>; + // Input: (, pointer, mask, vl) + def int_riscv_seg # nf # _store_mask + : DefaultAttrsIntrinsic<[], + !listconcat([llvm_anyvector_ty], + !listsplat(LLVMMatchType<0>, + !add(nf, -1)), + [llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyint_ty]), + [NoCapture>, IntrWriteMem]>; } } // TargetPrefix = "riscv" diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 83bde96cc725a..4bf05d06256e0 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -250,6 +250,31 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, return false; } +/// Return true if it's a non-all-zeros, interleaving mask. For instance, +/// 111000111000 is interleaved from three 1010 masks. +/// \p SubMask returns the mask of individual lane. +static bool isInterleavedConstantMask(unsigned Factor, ConstantVector *Mask, + SmallVectorImpl &LaneMask) { + unsigned LaneMaskLen = LaneMask.size(); + if (auto *Splat = Mask->getSplatValue()) { + // All-zeros mask. + if (Splat->isZeroValue()) + return false; + // All-ones mask. + std::fill(LaneMask.begin(), LaneMask.end(), + ConstantInt::getTrue(Mask->getContext())); + } else { + for (unsigned Idx = 0U, N = LaneMaskLen * Factor; Idx < N; ++Idx) { + Constant *Ref = Mask->getAggregateElement((Idx / Factor) * Factor); + if (Ref != Mask->getAggregateElement(Idx)) + return false; + LaneMask[Idx / Factor] = Ref; + } + } + + return true; +} + bool InterleavedAccessImpl::lowerInterleavedLoad( Instruction *LoadOp, SmallSetVector &DeadInsts) { if (isa(LoadOp->getType())) @@ -261,8 +286,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( } else if (auto *VPLoad = dyn_cast(LoadOp)) { assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load); // Require a constant mask and evl. - if (!isa(VPLoad->getArgOperand(1)) || - !isa(VPLoad->getArgOperand(2))) + if (!isa(VPLoad->getArgOperand(1))) return false; } else { llvm_unreachable("unsupported load operation"); @@ -315,24 +339,6 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( NumLoadElements)) return false; - // If this is a vp.load, record its mask (NOT shuffle mask). - BitVector MaskedIndices(NumLoadElements); - if (auto *VPLoad = dyn_cast(LoadOp)) { - auto *Mask = cast(VPLoad->getArgOperand(1)); - assert(cast(Mask->getType())->getNumElements() == - NumLoadElements); - if (auto *Splat = Mask->getSplatValue()) { - // All-zeros mask, bail out early. - if (Splat->isZeroValue()) - return false; - } else { - for (unsigned i = 0U; i < NumLoadElements; ++i) { - if (Mask->getAggregateElement(i)->isZeroValue()) - MaskedIndices.set(i); - } - } - } - // Holds the corresponding index for each DE-interleave shuffle. 
SmallVector Indices; @@ -373,48 +379,35 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( bool BinOpShuffleChanged = replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LoadOp); - // Check if we extract only the unmasked elements. - if (MaskedIndices.any()) { - if (any_of(Shuffles, [&](const auto *Shuffle) { - ArrayRef ShuffleMask = Shuffle->getShuffleMask(); - for (int Idx : ShuffleMask) { - if (Idx < 0) - continue; - if (MaskedIndices.test(unsigned(Idx))) - return true; - } - return false; - })) { - LLVM_DEBUG(dbgs() << "IA: trying to extract a masked element through " - << "shufflevector\n"); - return false; - } - } - // Check if we extract only the elements within evl. + // Check if the de-interleaved vp.load masks are the same. + unsigned ShuffleMaskLen = Shuffles[0]->getShuffleMask().size(); + SmallVector LaneMask(ShuffleMaskLen, nullptr); if (auto *VPLoad = dyn_cast(LoadOp)) { - uint64_t EVL = cast(VPLoad->getArgOperand(2))->getZExtValue(); - if (any_of(Shuffles, [&](const auto *Shuffle) { - ArrayRef ShuffleMask = Shuffle->getShuffleMask(); - for (int Idx : ShuffleMask) { - if (Idx < 0) - continue; - if (unsigned(Idx) >= EVL) - return true; - } - return false; - })) { - LLVM_DEBUG( - dbgs() << "IA: trying to extract an element out of EVL range\n"); + if (!isInterleavedConstantMask( + Factor, cast(VPLoad->getArgOperand(1)), LaneMask)) return false; - } } LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n"); - // Try to create target specific intrinsics to replace the load and shuffles. - if (!TLI->lowerInterleavedLoad(LoadOp, Shuffles, Indices, Factor)) { - // If Extracts is not empty, tryReplaceExtracts made changes earlier. - return !Extracts.empty() || BinOpShuffleChanged; + if (auto *VPLoad = dyn_cast(LoadOp)) { + auto *MaskVec = ConstantVector::get(LaneMask); + // Sometimes the number of Shuffles might be less than Factor, we have to + // fill the gaps with null. Also, lowerDeinterleavedVPLoad + // expects them to be sorted. + SmallVector ShuffleValues(Factor, nullptr); + for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices)) + ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx]; + if (!TLI->lowerDeinterleavedVPLoad(VPLoad, MaskVec, ShuffleValues)) + // If Extracts is not empty, tryReplaceExtracts made changes earlier. + return !Extracts.empty() || BinOpShuffleChanged; + } else { + // Try to create target specific intrinsics to replace the load and + // shuffles. + if (!TLI->lowerInterleavedLoad(cast(LoadOp), Shuffles, Indices, + Factor)) + // If Extracts is not empty, tryReplaceExtracts made changes earlier. + return !Extracts.empty() || BinOpShuffleChanged; } DeadInsts.insert_range(Shuffles); @@ -530,9 +523,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore( StoredValue = SI->getValueOperand(); } else if (auto *VPStore = dyn_cast(StoreOp)) { assert(VPStore->getIntrinsicID() == Intrinsic::vp_store); - // Require a constant mask and evl. - if (!isa(VPStore->getArgOperand(2)) || - !isa(VPStore->getArgOperand(3))) + // Require a constant mask. + if (!isa(VPStore->getArgOperand(2))) return false; StoredValue = VPStore->getArgOperand(0); } else { @@ -545,53 +537,53 @@ bool InterleavedAccessImpl::lowerInterleavedStore( unsigned NumStoredElements = cast(SVI->getType())->getNumElements(); - // If this is a vp.store, record its mask (NOT shuffle mask). 
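To make the new mask handling concrete: isInterleavedConstantMask above only accepts a wide vp.load/vp.store mask whose bits are uniform within each group of Factor elements, and the de-interleaved per-lane mask is what gets handed to the target hooks. The following standalone C++ sketch is not part of the patch; deinterleaveMask is a hypothetical name, it works on plain bools instead of LLVM Constants, and it skips the splat fast path. It only illustrates the check and the 111000111000 -> 1010 example from the comment above.

#include <cstddef>
#include <cstdio>
#include <optional>
#include <vector>

static std::optional<std::vector<bool>>
deinterleaveMask(const std::vector<bool> &WideMask, unsigned Factor) {
  if (Factor == 0 || WideMask.size() % Factor != 0)
    return std::nullopt;
  std::vector<bool> LaneMask(WideMask.size() / Factor);
  bool AnySet = false;
  for (std::size_t Idx = 0, N = WideMask.size(); Idx < N; ++Idx) {
    // Every bit in a group of Factor must agree with the group's first bit,
    // i.e. the wide mask is an interleave of Factor copies of the lane mask.
    if (WideMask[Idx] != WideMask[(Idx / Factor) * Factor])
      return std::nullopt;
    LaneMask[Idx / Factor] = WideMask[Idx];
    AnySet = AnySet || WideMask[Idx];
  }
  // Reject an all-zeros mask, mirroring the pass's early bail-out.
  if (!AnySet)
    return std::nullopt;
  return LaneMask;
}

int main() {
  // 111000111000 with Factor = 3 de-interleaves to the lane mask 1010.
  std::vector<bool> Wide = {true,  true,  true,  false, false, false,
                            true,  true,  true,  false, false, false};
  if (auto Lane = deinterleaveMask(Wide, 3)) {
    for (bool B : *Lane)
      std::printf("%d", B ? 1 : 0);
    std::printf("\n"); // prints 1010
  }
  return 0;
}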
- BitVector MaskedIndices(NumStoredElements); - if (auto *VPStore = dyn_cast(StoreOp)) { - auto *Mask = cast(VPStore->getArgOperand(2)); - assert(cast(Mask->getType())->getNumElements() == - NumStoredElements); - if (auto *Splat = Mask->getSplatValue()) { - // All-zeros mask, bail out early. - if (Splat->isZeroValue()) - return false; - } else { - for (unsigned i = 0U; i < NumStoredElements; ++i) { - if (Mask->getAggregateElement(i)->isZeroValue()) - MaskedIndices.set(i); - } - } - } - // Check if the shufflevector is RE-interleave shuffle. unsigned Factor; if (!isReInterleaveMask(SVI, Factor, MaxFactor)) return false; + assert(NumStoredElements % Factor == 0 && + "number of stored element should be a multiple of Factor"); - // Check if we store only the unmasked elements. - if (MaskedIndices.any()) { - if (any_of(SVI->getShuffleMask(), [&](int Idx) { - return Idx >= 0 && MaskedIndices.test(unsigned(Idx)); - })) { - LLVM_DEBUG(dbgs() << "IA: trying to store a masked element\n"); - return false; - } - } - // Check if we store only the elements within evl. + // Check if the de-interleaved vp.store masks are the same. + unsigned LaneMaskLen = NumStoredElements / Factor; + SmallVector LaneMask(LaneMaskLen, nullptr); if (auto *VPStore = dyn_cast(StoreOp)) { - uint64_t EVL = cast(VPStore->getArgOperand(3))->getZExtValue(); - if (any_of(SVI->getShuffleMask(), - [&](int Idx) { return Idx >= 0 && unsigned(Idx) >= EVL; })) { - LLVM_DEBUG(dbgs() << "IA: trying to store an element out of EVL range\n"); + if (!isInterleavedConstantMask( + Factor, cast(VPStore->getArgOperand(2)), LaneMask)) return false; - } } LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp << "\n"); - // Try to create target specific intrinsics to replace the store and shuffle. - if (!TLI->lowerInterleavedStore(StoreOp, SVI, Factor)) - return false; + if (auto *VPStore = dyn_cast(StoreOp)) { + IRBuilder<> Builder(VPStore); + // We need to effectively de-interleave the shufflemask + // because lowerInterleavedVPStore expected individual de-interleaved + // values. + SmallVector NewShuffles; + SmallVector NewShuffleMask(LaneMaskLen); + auto ShuffleMask = SVI->getShuffleMask(); + + for (unsigned i = 0; i < Factor; i++) { + for (unsigned j = 0; j < LaneMaskLen; j++) + NewShuffleMask[j] = ShuffleMask[i + Factor * j]; + + NewShuffles.push_back(Builder.CreateShuffleVector( + SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask)); + } + + // Try to create target specific intrinsics to replace the vp.store and + // shuffle. + if (!TLI->lowerInterleavedVPStore(VPStore, ConstantVector::get(LaneMask), + NewShuffles)) + // We already created new shuffles. + return true; + } else { + // Try to create target specific intrinsics to replace the store and + // shuffle. + if (!TLI->lowerInterleavedStore(cast(StoreOp), SVI, Factor)) + return false; + } // Already have a new target specific interleaved store. Erase the old store. DeadInsts.insert(StoreOp); @@ -806,8 +798,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special // TLI function to emit target-specific interleaved instruction. 
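The vp.store side above goes the other way around: instead of handing the wide re-interleaving shufflevector to the target, it first splits it into Factor narrow shuffles via NewShuffleMask[j] = ShuffleMask[i + Factor * j] and passes those to lowerInterleavedVPStore. A standalone sketch of that index arithmetic on plain integers follows; it is not part of the patch, and the mask used is just a typical factor-3 re-interleave pattern of the shape the vpstore_factor3 tests exercise.

#include <cstdio>
#include <vector>

int main() {
  // A factor-3 re-interleave mask over three 4-element fields, laid out as
  // a0 b0 c0 a1 b1 c1 ... in the stored vector.
  const unsigned Factor = 3, LaneLen = 4;
  const std::vector<int> ShuffleMask = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};

  for (unsigned i = 0; i < Factor; ++i) {
    std::printf("field %u:", i);
    for (unsigned j = 0; j < LaneLen; ++j)
      std::printf(" %d", ShuffleMask[i + Factor * j]); // de-interleaved mask
    std::printf("\n");
  }
  // field 0: 0 1 2 3
  // field 1: 4 5 6 7
  // field 2: 8 9 10 11
  return 0;
}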
- if (!TLI->lowerDeinterleavedIntrinsicToVPLoad(VPLoad, Mask, - DeinterleaveValues)) + if (!TLI->lowerDeinterleavedVPLoad(VPLoad, Mask, DeinterleaveValues)) return false; } else { @@ -859,8 +850,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( // Since lowerInterleavedStore expects Shuffle and StoreInst, use special // TLI function to emit target-specific interleaved instruction. - if (!TLI->lowerInterleavedIntrinsicToVPStore(VPStore, Mask, - InterleaveValues)) + if (!TLI->lowerInterleavedVPStore(VPStore, Mask, InterleaveValues)) return false; } else { auto *SI = cast(StoredBy); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d74cc3161684d..e366d7cb54490 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17176,7 +17176,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( - Instruction *LoadOp, ArrayRef Shuffles, + LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -17184,10 +17184,6 @@ bool AArch64TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); - auto *LI = dyn_cast(LoadOp); - if (!LI) - return false; - const DataLayout &DL = LI->getDataLayout(); VectorType *VTy = Shuffles[0]->getType(); @@ -17363,17 +17359,13 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) { /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) -bool AArch64TargetLowering::lowerInterleavedStore(Instruction *StoreOp, +bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); - auto *SI = dyn_cast(StoreOp); - if (!SI) - return false; - auto *VecTy = cast(SVI->getType()); assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 34446abb1474c..0d51ef2be8631 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -716,11 +716,11 @@ class AArch64TargetLowering : public TargetLowering { unsigned getMaxSupportedInterleaveFactor() const override { return 4; } - bool lowerInterleavedLoad(Instruction *LoadOp, + bool lowerInterleavedLoad(LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; - bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; bool lowerDeinterleaveIntrinsicToLoad( diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 64d12a0eb1d9b..2290ac2728c6d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21608,7 +21608,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { /// %vec0 
= extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( - Instruction *LoadOp, ArrayRef Shuffles, + LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -21616,10 +21616,6 @@ bool ARMTargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); - auto *LI = dyn_cast(LoadOp); - if (!LI) - return false; - auto *VecTy = cast(Shuffles[0]->getType()); Type *EltTy = VecTy->getElementType(); @@ -21754,16 +21750,12 @@ bool ARMTargetLowering::lowerInterleavedLoad( /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) -bool ARMTargetLowering::lowerInterleavedStore(Instruction *StoreOp, +bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); - auto *SI = dyn_cast(StoreOp); - if (!SI) - return false; - auto *VecTy = cast(SVI->getType()); assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 635a6cd226936..9fad056edd3f1 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -673,11 +673,11 @@ class VectorType; unsigned getMaxSupportedInterleaveFactor() const override; - bool lowerInterleavedLoad(Instruction *LoadOp, + bool lowerInterleavedLoad(LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; - bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; bool shouldInsertFencesForAtomic(const Instruction *I) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 9558783963500..df545f5e06c18 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1731,6 +1731,13 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::riscv_seg6_load: case Intrinsic::riscv_seg7_load: case Intrinsic::riscv_seg8_load: + case Intrinsic::riscv_seg2_load_mask: + case Intrinsic::riscv_seg3_load_mask: + case Intrinsic::riscv_seg4_load_mask: + case Intrinsic::riscv_seg5_load_mask: + case Intrinsic::riscv_seg6_load_mask: + case Intrinsic::riscv_seg7_load_mask: + case Intrinsic::riscv_seg8_load_mask: return SetRVVLoadStoreInfo(/*PtrOp*/ 0, /*IsStore*/ false, /*IsUnitStrided*/ false, /*UsePtrVal*/ true); case Intrinsic::riscv_seg2_store: @@ -1744,6 +1751,17 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 2, /*IsStore*/ true, /*IsUnitStrided*/ false, /*UsePtrVal*/ true); + case Intrinsic::riscv_seg2_store_mask: + case Intrinsic::riscv_seg3_store_mask: + case Intrinsic::riscv_seg4_store_mask: + case Intrinsic::riscv_seg5_store_mask: + case Intrinsic::riscv_seg6_store_mask: + case Intrinsic::riscv_seg7_store_mask: + case Intrinsic::riscv_seg8_store_mask: + // Operands are (vec, ..., vec, ptr, mask, vl) + 
return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 3, + /*IsStore*/ true, + /*IsUnitStrided*/ false, /*UsePtrVal*/ true); case Intrinsic::riscv_vle: case Intrinsic::riscv_vle_mask: case Intrinsic::riscv_vleff: @@ -10450,13 +10468,20 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::riscv_seg5_load: case Intrinsic::riscv_seg6_load: case Intrinsic::riscv_seg7_load: - case Intrinsic::riscv_seg8_load: { + case Intrinsic::riscv_seg8_load: + case Intrinsic::riscv_seg2_load_mask: + case Intrinsic::riscv_seg3_load_mask: + case Intrinsic::riscv_seg4_load_mask: + case Intrinsic::riscv_seg5_load_mask: + case Intrinsic::riscv_seg6_load_mask: + case Intrinsic::riscv_seg7_load_mask: + case Intrinsic::riscv_seg8_load_mask: { SDLoc DL(Op); static const Intrinsic::ID VlsegInts[7] = { - Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3, - Intrinsic::riscv_vlseg4, Intrinsic::riscv_vlseg5, - Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7, - Intrinsic::riscv_vlseg8}; + Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, + Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, + Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, + Intrinsic::riscv_vlseg8_mask}; unsigned NF = Op->getNumValues() - 1; assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); MVT XLenVT = Subtarget.getXLenVT(); @@ -10466,7 +10491,19 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, ContainerVT.getScalarSizeInBits(); EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF); - SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); + // Masked: (pointer, mask, vl) + // Non-masked: (pointer, vl) + bool IsMasked = Op.getNumOperands() > 4; + SDValue VL = Op.getOperand(Op.getNumOperands() - 1); + SDValue Mask = + IsMasked ? 
Op.getOperand(3) : getAllOnesMask(ContainerVT, VL, DL, DAG); + MVT MaskVT = Mask.getSimpleValueType(); + if (MaskVT.isFixedLengthVector()) { + MVT MaskContainerVT = + ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget); + Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); + } + SDValue IntID = DAG.getTargetConstant(VlsegInts[NF - 2], DL, XLenVT); auto *Load = cast(Op); @@ -10476,7 +10513,10 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, IntID, DAG.getUNDEF(VecTupTy), Op.getOperand(2), + Mask, VL, + DAG.getTargetConstant( + RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC, DL, XLenVT), DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)}; SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, @@ -10536,15 +10576,39 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, case Intrinsic::riscv_seg5_store: case Intrinsic::riscv_seg6_store: case Intrinsic::riscv_seg7_store: - case Intrinsic::riscv_seg8_store: { + case Intrinsic::riscv_seg8_store: + case Intrinsic::riscv_seg2_store_mask: + case Intrinsic::riscv_seg3_store_mask: + case Intrinsic::riscv_seg4_store_mask: + case Intrinsic::riscv_seg5_store_mask: + case Intrinsic::riscv_seg6_store_mask: + case Intrinsic::riscv_seg7_store_mask: + case Intrinsic::riscv_seg8_store_mask: { SDLoc DL(Op); static const Intrinsic::ID VssegInts[] = { - Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3, - Intrinsic::riscv_vsseg4, Intrinsic::riscv_vsseg5, - Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7, - Intrinsic::riscv_vsseg8}; - // Operands are (chain, int_id, vec*, ptr, vl) - unsigned NF = Op->getNumOperands() - 4; + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask}; + + bool IsMasked = false; + switch (IntNo) { + case Intrinsic::riscv_seg2_store_mask: + case Intrinsic::riscv_seg3_store_mask: + case Intrinsic::riscv_seg4_store_mask: + case Intrinsic::riscv_seg5_store_mask: + case Intrinsic::riscv_seg6_store_mask: + case Intrinsic::riscv_seg7_store_mask: + case Intrinsic::riscv_seg8_store_mask: + IsMasked = true; + break; + default: + break; + } + + // Non-masked: (chain, int_id, vec*, ptr, vl) + // Masked: (chain, int_id, vec*, ptr, mask, vl) + unsigned NF = Op->getNumOperands() - (IsMasked ? 5 : 4); assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); MVT XLenVT = Subtarget.getXLenVT(); MVT VT = Op->getOperand(2).getSimpleValueType(); @@ -10553,7 +10617,16 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, ContainerVT.getScalarSizeInBits(); EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF); - SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); + SDValue VL = Op.getOperand(Op.getNumOperands() - 1); + SDValue Mask = IsMasked ? 
Op.getOperand(Op.getNumOperands() - 2) + : getAllOnesMask(ContainerVT, VL, DL, DAG); + MVT MaskVT = Mask.getSimpleValueType(); + if (MaskVT.isFixedLengthVector()) { + MVT MaskContainerVT = + ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget); + Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); + } + SDValue IntID = DAG.getTargetConstant(VssegInts[NF - 2], DL, XLenVT); SDValue Ptr = Op->getOperand(NF + 2); @@ -10572,6 +10645,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, IntID, StoredVal, Ptr, + Mask, VL, DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)}; @@ -23376,36 +23450,19 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = { /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool RISCVTargetLowering::lowerInterleavedLoad( - Instruction *LoadOp, ArrayRef Shuffles, + LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Indices.size() == Shuffles.size()); - const DataLayout &DL = LoadOp->getDataLayout(); - IRBuilder<> Builder(LoadOp); + IRBuilder<> Builder(LI); auto *VTy = cast(Shuffles[0]->getType()); - - Align PtrAlignment; - unsigned PtrAddrSpace; - Value *BaseAddr; - if (auto *LI = dyn_cast(LoadOp)) { - BaseAddr = LI->getPointerOperand(); - PtrAlignment = LI->getAlign(); - PtrAddrSpace = LI->getPointerAddressSpace(); - } else { - auto *VPLoad = cast(LoadOp); - assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load); - BaseAddr = VPLoad->getArgOperand(0); - PtrAlignment = VPLoad->getParamAlign(0).value_or( - DL.getABITypeAlign(VTy->getElementType())); - PtrAddrSpace = cast(BaseAddr->getType())->getAddressSpace(); - } - - if (!isLegalInterleavedAccessType(VTy, Factor, PtrAlignment, PtrAddrSpace, - DL)) + if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(), + LI->getPointerAddressSpace(), + LI->getDataLayout())) return false; - auto *XLenTy = Type::getIntNTy(LoadOp->getContext(), Subtarget.getXLen()); + auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); // If the segment load is going to be performed segment at a time anyways // and there's only one element used, use a strided load instead. 
This @@ -23414,7 +23471,7 @@ bool RISCVTargetLowering::lowerInterleavedLoad( unsigned ScalarSizeInBytes = VTy->getScalarSizeInBits() / 8; Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(BaseAddr, Offset); + Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset); Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); Value *VL = Builder.getInt32(VTy->getNumElements()); @@ -23423,16 +23480,16 @@ bool RISCVTargetLowering::lowerInterleavedLoad( {VTy, BasePtr->getType(), Stride->getType()}, {BasePtr, Stride, Mask, VL}); CI->addParamAttr( - 0, Attribute::getWithAlignment(CI->getContext(), PtrAlignment)); + 0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign())); Shuffles[0]->replaceAllUsesWith(CI); return true; }; Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); - CallInst *VlsegN = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], - {VTy, BaseAddr->getType(), XLenTy}, - {BaseAddr, VL}); + CallInst *VlsegN = Builder.CreateIntrinsic( + FixedVlsegIntrIds[Factor - 2], {VTy, LI->getPointerOperandType(), XLenTy}, + {LI->getPointerOperand(), VL}); for (unsigned i = 0; i < Shuffles.size(); i++) { Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); @@ -23464,39 +23521,21 @@ static const Intrinsic::ID FixedVssegIntrIds[] = { /// /// Note that the new shufflevectors will be removed and we'll only generate one /// vsseg3 instruction in CodeGen. -bool RISCVTargetLowering::lowerInterleavedStore(Instruction *StoreOp, +bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { - const DataLayout &DL = StoreOp->getDataLayout(); - IRBuilder<> Builder(StoreOp); - + IRBuilder<> Builder(SI); auto Mask = SVI->getShuffleMask(); auto *ShuffleVTy = cast(SVI->getType()); // Given SVI : , then VTy : auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(), ShuffleVTy->getNumElements() / Factor); - - Align PtrAlignment; - unsigned PtrAddrSpace; - Value *BaseAddr; - if (auto *SI = dyn_cast(StoreOp)) { - BaseAddr = SI->getPointerOperand(); - PtrAlignment = SI->getAlign(); - PtrAddrSpace = SI->getPointerAddressSpace(); - } else { - auto *VPStore = cast(StoreOp); - assert(VPStore->getIntrinsicID() == Intrinsic::vp_store); - BaseAddr = VPStore->getArgOperand(1); - PtrAlignment = VPStore->getParamAlign(1).value_or( - DL.getABITypeAlign(VTy->getElementType())); - PtrAddrSpace = cast(BaseAddr->getType())->getAddressSpace(); - } - - if (!isLegalInterleavedAccessType(VTy, Factor, PtrAlignment, PtrAddrSpace, - DL)) + if (!isLegalInterleavedAccessType(VTy, Factor, SI->getAlign(), + SI->getPointerAddressSpace(), + SI->getDataLayout())) return false; - auto *XLenTy = Type::getIntNTy(StoreOp->getContext(), Subtarget.getXLen()); + auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); unsigned Index; // If the segment store only has one active lane (i.e. 
the interleave is @@ -23509,7 +23548,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *StoreOp, auto *DataVTy = cast(Data->getType()); Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(BaseAddr, Offset); + Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset); Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount()); Value *VL = Builder.getInt32(VTy->getNumElements()); @@ -23518,14 +23557,14 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *StoreOp, {Data->getType(), BasePtr->getType(), Stride->getType()}, {Data, BasePtr, Stride, Mask, VL}); CI->addParamAttr( - 1, Attribute::getWithAlignment(CI->getContext(), PtrAlignment)); + 1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign())); return true; } Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - StoreOp->getModule(), FixedVssegIntrIds[Factor - 2], - {VTy, BaseAddr->getType(), XLenTy}); + SI->getModule(), FixedVssegIntrIds[Factor - 2], + {VTy, SI->getPointerOperandType(), XLenTy}); SmallVector Ops; SmallVector NewShuffleMask; @@ -23545,7 +23584,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *StoreOp, // potentially under larger LMULs) because we checked that the fixed vector // type fits in isLegalInterleavedAccessType Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); - Ops.append({BaseAddr, VL}); + Ops.append({SI->getPointerOperand(), VL}); Builder.CreateCall(VssegNFunc, Ops); @@ -23687,15 +23726,20 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { if (N == 1) return true; + using namespace PatternMatch; + // Right now we're only recognizing the simplest pattern. + uint64_t C; + if (match(V, m_CombineOr(m_ConstantInt(C), + m_c_Mul(m_Value(), m_ConstantInt(C)))) && + C && C % N == 0) + return true; + if (isPowerOf2_32(N)) { KnownBits KB = llvm::computeKnownBits(V, DL); return KB.countMinTrailingZeros() >= Log2_32(N); } - using namespace PatternMatch; - // Right now we're only recognizing the simplest pattern. - uint64_t C; - return match(V, m_c_Mul(m_Value(), m_ConstantInt(C))) && C && C % N == 0; + return false; } /// Lower an interleaved vp.load into a vlsegN intrinsic. @@ -23727,7 +23771,7 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { /// TODO: We probably can loosen the dependency on matching extractvalue when /// dealing with factor of 2 (extractvalue is still required for most of other /// factors though). -bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( +bool RISCVTargetLowering::lowerDeinterleavedVPLoad( VPIntrinsic *Load, Value *Mask, ArrayRef DeinterleaveResults) const { assert(Mask && "Expect a valid mask"); @@ -23736,26 +23780,20 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( const unsigned Factor = DeinterleaveResults.size(); - auto *WideVTy = dyn_cast(Load->getType()); - // TODO: Support fixed vectors. 
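isMultipleOfN above now recognizes a plain constant (or a multiply by a constant) before falling back to the known-bits argument, which only helps for power-of-two N; that is what lets a constant EVL such as 12 pass the divisibility check for factor 3, while the per-field VL handed to the segment intrinsic is computed as EVL / Factor a bit further down. A standalone sketch of the constant case only, not part of the patch: perFieldVL is a hypothetical name, and the real check also handles non-constant values.

#include <cstdio>
#include <optional>

static std::optional<unsigned> perFieldVL(unsigned WideEVL, unsigned Factor) {
  // Conservatively require an exact division; otherwise trailing elements
  // could be dropped by the transformation.
  if (Factor == 0 || WideEVL % Factor != 0)
    return std::nullopt;
  return WideEVL / Factor;
}

int main() {
  if (auto VL = perFieldVL(12, 3))
    std::printf("EVL 12, factor 3 -> per-field VL %u\n", *VL); // 4
  if (!perFieldVL(10, 3))
    std::printf("EVL 10, factor 3 -> not transformed\n"); // cf. invalid_vp_evl
  return 0;
}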
- if (!WideVTy) + auto *VTy = dyn_cast(DeinterleaveResults[0]->getType()); + if (!VTy) return false; - unsigned WideNumElements = WideVTy->getElementCount().getKnownMinValue(); - assert(WideNumElements % Factor == 0 && - "ElementCount of a wide load must be divisible by interleave factor"); - auto *VTy = - VectorType::get(WideVTy->getScalarType(), WideNumElements / Factor, - WideVTy->isScalableTy()); auto &DL = Load->getModule()->getDataLayout(); Align Alignment = Load->getParamAlign(0).value_or( - DL.getABITypeAlign(WideVTy->getElementType())); + DL.getABITypeAlign(VTy->getElementType())); if (!isLegalInterleavedAccessType( VTy, Factor, Alignment, Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL)) return false; IRBuilder<> Builder(Load); + Value *WideEVL = Load->getArgOperand(2); // Conservatively check if EVL is a multiple of factor, otherwise some // (trailing) elements might be lost after the transformation. @@ -23767,49 +23805,64 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), XLenTy); - static const Intrinsic::ID IntrMaskIds[] = { - Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, - Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, - Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, - Intrinsic::riscv_vlseg8_mask, - }; + Value *Return = nullptr; + if (auto *FVTy = dyn_cast(VTy)) { + static const Intrinsic::ID FixedMaskedVlsegIntrIds[] = { + Intrinsic::riscv_seg2_load_mask, Intrinsic::riscv_seg3_load_mask, + Intrinsic::riscv_seg4_load_mask, Intrinsic::riscv_seg5_load_mask, + Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask, + Intrinsic::riscv_seg8_load_mask}; + + Return = Builder.CreateIntrinsic(FixedMaskedVlsegIntrIds[Factor - 2], + {FVTy, XLenTy}, + {Load->getArgOperand(0), Mask, EVL}); + } else { + static const Intrinsic::ID IntrMaskIds[] = { + Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, + Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, + Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, + Intrinsic::riscv_vlseg8_mask, + }; - unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); - unsigned NumElts = VTy->getElementCount().getKnownMinValue(); - Type *VecTupTy = TargetExtType::get( - Load->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), - NumElts * SEW / 8), - Factor); + unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); + unsigned NumElts = VTy->getElementCount().getKnownMinValue(); + Type *VecTupTy = TargetExtType::get( + Load->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), + NumElts * SEW / 8), + Factor); - Value *PoisonVal = PoisonValue::get(VecTupTy); + Value *PoisonVal = PoisonValue::get(VecTupTy); - Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), IntrMaskIds[Factor - 2], - {VecTupTy, Mask->getType(), EVL->getType()}); + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( + Load->getModule(), IntrMaskIds[Factor - 2], + {VecTupTy, Mask->getType(), EVL->getType()}); - Value *Operands[] = {PoisonVal, - Load->getArgOperand(0), - Mask, - EVL, - ConstantInt::get(XLenTy, RISCVVType::TAIL_AGNOSTIC | - RISCVVType::MASK_AGNOSTIC), - ConstantInt::get(XLenTy, Log2_64(SEW))}; + Value *Operands[] = { + PoisonVal, + Load->getArgOperand(0), + Mask, + EVL, + ConstantInt::get(XLenTy, + RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC), + 
ConstantInt::get(XLenTy, Log2_64(SEW))}; - CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); + CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); - SmallVector AggrTypes{Factor, VTy}; - Value *Return = - PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); - Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy}); - for (unsigned i = 0; i < Factor; ++i) { - Value *VecExtract = - Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)}); - Return = Builder.CreateInsertValue(Return, VecExtract, i); + SmallVector AggrTypes{Factor, VTy}; + Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); + Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( + Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy}); + for (unsigned i = 0; i < Factor; ++i) { + Value *VecExtract = + Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)}); + Return = Builder.CreateInsertValue(Return, VecExtract, i); + } } for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) { + if (!DIO) + continue; // We have to create a brand new ExtractValue to replace each // of these old ExtractValue instructions. Value *NewEV = @@ -23840,7 +23893,7 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( /// %load2, ptr %ptr, /// %mask, /// i64 %rvl) -bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( +bool RISCVTargetLowering::lowerInterleavedVPStore( VPIntrinsic *Store, Value *Mask, ArrayRef InterleaveOperands) const { assert(Mask && "Expect a valid mask"); @@ -23849,8 +23902,7 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( const unsigned Factor = InterleaveOperands.size(); - auto *VTy = dyn_cast(InterleaveOperands[0]->getType()); - // TODO: Support fixed vectors. 
+ auto *VTy = dyn_cast(InterleaveOperands[0]->getType()); if (!VTy) return false; @@ -23874,6 +23926,21 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), XLenTy); + if (auto *FVTy = dyn_cast(VTy)) { + static const Intrinsic::ID FixedMaskedVssegIntrIds[] = { + Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, + Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, + Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, + Intrinsic::riscv_seg8_store_mask}; + + SmallVector Operands(InterleaveOperands.begin(), + InterleaveOperands.end()); + Operands.append({Store->getArgOperand(1), Mask, EVL}); + Builder.CreateIntrinsic(FixedMaskedVssegIntrIds[Factor - 2], {FVTy, XLenTy}, + Operands); + return true; + } + static const Intrinsic::ID IntrMaskIds[] = { Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 5df510c08e0da..8413a60b56cda 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -903,12 +903,12 @@ class RISCVTargetLowering : public TargetLowering { bool fallBackToDAGISel(const Instruction &Inst) const override; - bool lowerInterleavedLoad(Instruction *LoadOp, + bool lowerInterleavedLoad(LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; - bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; bool lowerDeinterleaveIntrinsicToLoad( @@ -917,13 +917,12 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleaveIntrinsicToStore( StoreInst *SI, ArrayRef InterleaveValues) const override; - bool lowerDeinterleavedIntrinsicToVPLoad( - VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveRes) const override; + bool + lowerDeinterleavedVPLoad(VPIntrinsic *Load, Value *Mask, + ArrayRef DeinterleaveRes) const override; - bool lowerInterleavedIntrinsicToVPStore( - VPIntrinsic *Store, Value *Mask, - ArrayRef InterleaveOps) const override; + bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask, + ArrayRef InterleaveOps) const override; bool supportKCFIBundles() const override { return true; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index da5baaf1bf5af..4a2b35e9efe7c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1650,14 +1650,14 @@ namespace llvm { /// Lower interleaved load(s) into target specific /// instructions/intrinsics. - bool lowerInterleavedLoad(Instruction *LoadOp, + bool lowerInterleavedLoad(LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; /// Lower interleaved store(s) into target specific /// instructions/intrinsics. 
- bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 99fbc249a1179..1eb47e3b2cd18 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -801,7 +801,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // number of shuffles and ISA. // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. bool X86TargetLowering::lowerInterleavedLoad( - Instruction *LoadOp, ArrayRef Shuffles, + LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -809,10 +809,6 @@ bool X86TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); - auto *LI = dyn_cast(LoadOp); - if (!LI) - return false; - // Create an interleaved access group. IRBuilder<> Builder(LI); X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget, @@ -821,7 +817,7 @@ bool X86TargetLowering::lowerInterleavedLoad( return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); } -bool X86TargetLowering::lowerInterleavedStore(Instruction *StoreOp, +bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && @@ -831,10 +827,6 @@ bool X86TargetLowering::lowerInterleavedStore(Instruction *StoreOp, 0 && "Invalid interleaved store"); - auto *SI = dyn_cast(StoreOp); - if (!SI) - return false; - // Holds the indices of SVI that correspond to the starting index of each // interleaved shuffle. 
auto Mask = SVI->getShuffleMask(); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index dc7ca2317ec9a..0eb2fc3152b64 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -224,6 +224,23 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_v16i32(ptr %ptr) { ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } +define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask(ptr %ptr) { +; CHECK-LABEL: vpload_factor3_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 10 +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) { ; CHECK-LABEL: vpload_factor4: ; CHECK: # %bb.0: @@ -350,8 +367,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a2, 32 ; RV32-NEXT: lui a3, 12 ; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI16_0) -; RV32-NEXT: addi a7, a7, %lo(.LCPI16_0) +; RV32-NEXT: lui a7, %hi(.LCPI17_0) +; RV32-NEXT: addi a7, a7, %lo(.LCPI17_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a5) ; RV32-NEXT: vmv.s.x v0, a3 @@ -436,12 +453,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: lui a7, 49164 -; RV32-NEXT: lui a1, %hi(.LCPI16_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI16_1) +; RV32-NEXT: lui a1, %hi(.LCPI17_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI17_1) ; RV32-NEXT: lui t2, 3 ; RV32-NEXT: lui t1, 196656 -; RV32-NEXT: lui a4, %hi(.LCPI16_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI16_3) +; RV32-NEXT: lui a4, %hi(.LCPI17_3) +; RV32-NEXT: addi a4, a4, %lo(.LCPI17_3) ; RV32-NEXT: lui t0, 786624 ; RV32-NEXT: li a5, 48 ; RV32-NEXT: lui a6, 768 @@ -620,8 +637,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v24, v8, v2 -; RV32-NEXT: lui a1, %hi(.LCPI16_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI16_2) +; RV32-NEXT: lui a1, %hi(.LCPI17_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI17_2) ; RV32-NEXT: lui a3, 3073 ; RV32-NEXT: addi a3, a3, -1024 ; RV32-NEXT: vmv.s.x v0, a3 @@ -685,16 +702,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v28, v8, v3 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v28, v24 -; RV32-NEXT: lui a1, %hi(.LCPI16_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI16_4) -; RV32-NEXT: lui a2, %hi(.LCPI16_5) -; RV32-NEXT: addi a2, a2, %lo(.LCPI16_5) +; RV32-NEXT: lui a1, %hi(.LCPI17_4) +; 
RV32-NEXT: addi a1, a1, %lo(.LCPI17_4) +; RV32-NEXT: lui a2, %hi(.LCPI17_5) +; RV32-NEXT: addi a2, a2, %lo(.LCPI17_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI16_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI16_7) +; RV32-NEXT: lui a1, %hi(.LCPI17_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI17_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb @@ -722,14 +739,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v16, v0, v10 -; RV32-NEXT: lui a1, %hi(.LCPI16_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI16_6) -; RV32-NEXT: lui a2, %hi(.LCPI16_8) -; RV32-NEXT: addi a2, a2, %lo(.LCPI16_8) +; RV32-NEXT: lui a1, %hi(.LCPI17_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI17_6) +; RV32-NEXT: lui a2, %hi(.LCPI17_8) +; RV32-NEXT: addi a2, a2, %lo(.LCPI17_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI16_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI16_9) +; RV32-NEXT: lui a1, %hi(.LCPI17_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI17_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v6, (a1) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma @@ -816,8 +833,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: li a4, 128 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: vle64.v v8, (a3) -; RV64-NEXT: lui a3, %hi(.LCPI16_0) -; RV64-NEXT: addi a3, a3, %lo(.LCPI16_0) +; RV64-NEXT: lui a3, %hi(.LCPI17_0) +; RV64-NEXT: addi a3, a3, %lo(.LCPI17_0) ; RV64-NEXT: vmv.s.x v0, a4 ; RV64-NEXT: csrr a4, vlenb ; RV64-NEXT: li a5, 61 @@ -1005,8 +1022,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t -; RV64-NEXT: lui a2, %hi(.LCPI16_1) -; RV64-NEXT: addi a2, a2, %lo(.LCPI16_1) +; RV64-NEXT: lui a2, %hi(.LCPI17_1) +; RV64-NEXT: addi a2, a2, %lo(.LCPI17_1) ; RV64-NEXT: li a3, 192 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v6, (a2) @@ -1040,8 +1057,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgatherei16.vv v24, v16, v6 ; RV64-NEXT: addi a2, sp, 16 ; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: lui a2, %hi(.LCPI16_2) -; RV64-NEXT: addi a2, a2, %lo(.LCPI16_2) +; RV64-NEXT: lui a2, %hi(.LCPI17_2) +; RV64-NEXT: addi a2, a2, %lo(.LCPI17_2) ; RV64-NEXT: li a3, 1040 ; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: addi a1, a1, -2016 @@ -1125,12 +1142,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI16_3) -; RV64-NEXT: addi a1, a1, %lo(.LCPI16_3) +; RV64-NEXT: lui a1, %hi(.LCPI17_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI17_3) ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v20, (a1) -; RV64-NEXT: lui a1, %hi(.LCPI16_4) -; RV64-NEXT: addi a1, a1, %lo(.LCPI16_4) +; RV64-NEXT: lui a1, %hi(.LCPI17_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI17_4) ; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: csrr a1, 
vlenb ; RV64-NEXT: li a2, 77 @@ -1181,8 +1198,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vrgatherei16.vv v0, v16, v8 -; RV64-NEXT: lui a1, %hi(.LCPI16_5) -; RV64-NEXT: addi a1, a1, %lo(.LCPI16_5) +; RV64-NEXT: lui a1, %hi(.LCPI17_5) +; RV64-NEXT: addi a1, a1, %lo(.LCPI17_5) ; RV64-NEXT: vle16.v v20, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 61 @@ -1378,6 +1395,20 @@ define void @vpstore_factor3(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> % ret void } +define void @vpstore_factor3_mask(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { +; CHECK-LABEL: vpstore_factor3_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 5 +; CHECK-NEXT: vsseg3e32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> + tail call void @llvm.vp.store.v12i32.p0(<12 x i32> %interleaved.vec, ptr %ptr, <12 x i1> , i32 12) + ret void +} + define void @vpstore_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { ; CHECK-LABEL: vpstore_factor4: ; CHECK: # %bb.0: @@ -1604,7 +1635,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV32-NEXT: vmv.s.x v11, a1 ; RV32-NEXT: lui a1, 1 ; RV32-NEXT: vmv.v.i v10, 8 -; RV32-NEXT: addi a1, a1, -33 +; RV32-NEXT: addi a1, a1, -43 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: li a1, 146 @@ -1612,8 +1643,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV32-NEXT: vle32.v v12, (a0), v0.t ; RV32-NEXT: li a0, 36 ; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI40_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI40_0) +; RV32-NEXT: lui a1, %hi(.LCPI42_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI42_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v21, (a1) ; RV32-NEXT: vcompress.vm v8, v12, v11 @@ -1639,7 +1670,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV64-NEXT: vmv.s.x v20, a1 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: vmv.v.i v10, 8 -; RV64-NEXT: addi a1, a1, -33 +; RV64-NEXT: addi a1, a1, -43 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: li a1, 36 @@ -1667,7 +1698,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV64-NEXT: vrgatherei16.vv v10, v12, v9 ; RV64-NEXT: vmv1r.v v9, v14 ; RV64-NEXT: ret - %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> @@ -1688,8 +1719,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { ; RV32-NEXT: vmv.s.x v10, a0 ; RV32-NEXT: li a0, 146 ; RV32-NEXT: vmv.s.x v11, a0 -; RV32-NEXT: lui a0, %hi(.LCPI41_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI41_0) +; RV32-NEXT: lui a0, %hi(.LCPI43_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI43_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: li 
a0, 36 diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index d6e1af59e6341..d0f35aa8b85e9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -616,59 +616,6 @@ define void @not_balanced_store_tree( %v0, ret void } -; We only support scalable vectors for now. -define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %ptr, i32 %evl) { -; RV32-LABEL: not_scalable_vectors: -; RV32: # %bb.0: -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wx v12, v8, a0 -; RV32-NEXT: vnsrl.wi v11, v8, 0 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vnsrl.wx v10, v11, a0 -; RV32-NEXT: vnsrl.wi v8, v11, 0 -; RV32-NEXT: vnsrl.wx v11, v12, a0 -; RV32-NEXT: vnsrl.wi v9, v12, 0 -; RV32-NEXT: ret -; -; RV64-LABEL: not_scalable_vectors: -; RV64: # %bb.0: -; RV64-NEXT: slli a1, a1, 34 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wx v12, v8, a0 -; RV64-NEXT: vnsrl.wi v11, v8, 0 -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vnsrl.wx v10, v11, a0 -; RV64-NEXT: vnsrl.wi v8, v11, 0 -; RV64-NEXT: vnsrl.wx v11, v12, a0 -; RV64-NEXT: vnsrl.wi v9, v12, 0 -; RV64-NEXT: ret - %rvl = mul i32 %evl, 4 - %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %rvl) - %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load) - %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0 - %d0.1 = extractvalue { <4 x i32>, <4 x i32> } %d0, 1 - %d1 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.0) - %t0 = extractvalue { <2 x i32>, <2 x i32> } %d1, 0 - %t2 = extractvalue { <2 x i32>, <2 x i32> } %d1, 1 - %d2 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.1) - %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0 - %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1 - - %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> %t0, 0 - %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 - %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2 - %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3 - ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3 -} - define {, } @not_same_mask( %mask0, %mask1, ptr %ptr, i32 %evl) { ; RV32-LABEL: not_same_mask: ; RV32: # %bb.0: From 9fd8830f6a368b2019633964492a932f0c4b7d70 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 18 Apr 2025 10:37:21 -0700 Subject: [PATCH 03/12] fixup! 
Minor fixes and add more tests --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 1 - .../rvv/fixed-vectors-interleaved-access.ll | 107 ++++++++++++------ 2 files changed, 71 insertions(+), 37 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 4bf05d06256e0..ce54eb130bb4b 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -45,7 +45,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 0eb2fc3152b64..3df75060808b1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -207,6 +207,22 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3(ptr %ptr) { ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } +; We only extract some of the fields. +define {<4 x i32>, <4 x i32>} @vpload_factor3_partial(ptr %ptr) { +; CHECK-LABEL: vpload_factor3_partial: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg3e32.v v7, (a0) +; CHECK-NEXT: vmv1r.v v8, v7 +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} poison, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v2, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + ; Load a larger vector but only deinterleave a subset of the elements. define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_v16i32(ptr %ptr) { ; CHECK-LABEL: vpload_factor3_v16i32: @@ -224,6 +240,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_v16i32(ptr %ptr) { ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } +; Make sure the mask is propagated. define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask(ptr %ptr) { ; CHECK-LABEL: vpload_factor3_mask: ; CHECK: # %bb.0: @@ -241,6 +258,24 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask(ptr %ptr) { ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } +; Poison/undef in the shuffle mask shouldn't affect anything. 
+define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_poison_shufflemask(ptr %ptr) { +; CHECK-LABEL: vpload_factor3_poison_shufflemask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 10 +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) { ; CHECK-LABEL: vpload_factor4: ; CHECK: # %bb.0: @@ -367,8 +402,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a2, 32 ; RV32-NEXT: lui a3, 12 ; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI17_0) -; RV32-NEXT: addi a7, a7, %lo(.LCPI17_0) +; RV32-NEXT: lui a7, %hi(.LCPI19_0) +; RV32-NEXT: addi a7, a7, %lo(.LCPI19_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a5) ; RV32-NEXT: vmv.s.x v0, a3 @@ -453,12 +488,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: lui a7, 49164 -; RV32-NEXT: lui a1, %hi(.LCPI17_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI17_1) +; RV32-NEXT: lui a1, %hi(.LCPI19_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI19_1) ; RV32-NEXT: lui t2, 3 ; RV32-NEXT: lui t1, 196656 -; RV32-NEXT: lui a4, %hi(.LCPI17_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI17_3) +; RV32-NEXT: lui a4, %hi(.LCPI19_3) +; RV32-NEXT: addi a4, a4, %lo(.LCPI19_3) ; RV32-NEXT: lui t0, 786624 ; RV32-NEXT: li a5, 48 ; RV32-NEXT: lui a6, 768 @@ -637,8 +672,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v24, v8, v2 -; RV32-NEXT: lui a1, %hi(.LCPI17_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI17_2) +; RV32-NEXT: lui a1, %hi(.LCPI19_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI19_2) ; RV32-NEXT: lui a3, 3073 ; RV32-NEXT: addi a3, a3, -1024 ; RV32-NEXT: vmv.s.x v0, a3 @@ -702,16 +737,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v28, v8, v3 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v28, v24 -; RV32-NEXT: lui a1, %hi(.LCPI17_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI17_4) -; RV32-NEXT: lui a2, %hi(.LCPI17_5) -; RV32-NEXT: addi a2, a2, %lo(.LCPI17_5) +; RV32-NEXT: lui a1, %hi(.LCPI19_4) +; RV32-NEXT: addi a1, a1, %lo(.LCPI19_4) +; RV32-NEXT: lui a2, %hi(.LCPI19_5) +; RV32-NEXT: addi a2, a2, %lo(.LCPI19_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI17_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI17_7) +; RV32-NEXT: lui a1, %hi(.LCPI19_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI19_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 
; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb @@ -739,14 +774,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v16, v0, v10 -; RV32-NEXT: lui a1, %hi(.LCPI17_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI17_6) -; RV32-NEXT: lui a2, %hi(.LCPI17_8) -; RV32-NEXT: addi a2, a2, %lo(.LCPI17_8) +; RV32-NEXT: lui a1, %hi(.LCPI19_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI19_6) +; RV32-NEXT: lui a2, %hi(.LCPI19_8) +; RV32-NEXT: addi a2, a2, %lo(.LCPI19_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI17_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI17_9) +; RV32-NEXT: lui a1, %hi(.LCPI19_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI19_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v6, (a1) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma @@ -833,8 +868,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: li a4, 128 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: vle64.v v8, (a3) -; RV64-NEXT: lui a3, %hi(.LCPI17_0) -; RV64-NEXT: addi a3, a3, %lo(.LCPI17_0) +; RV64-NEXT: lui a3, %hi(.LCPI19_0) +; RV64-NEXT: addi a3, a3, %lo(.LCPI19_0) ; RV64-NEXT: vmv.s.x v0, a4 ; RV64-NEXT: csrr a4, vlenb ; RV64-NEXT: li a5, 61 @@ -1022,8 +1057,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t -; RV64-NEXT: lui a2, %hi(.LCPI17_1) -; RV64-NEXT: addi a2, a2, %lo(.LCPI17_1) +; RV64-NEXT: lui a2, %hi(.LCPI19_1) +; RV64-NEXT: addi a2, a2, %lo(.LCPI19_1) ; RV64-NEXT: li a3, 192 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v6, (a2) @@ -1057,8 +1092,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgatherei16.vv v24, v16, v6 ; RV64-NEXT: addi a2, sp, 16 ; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: lui a2, %hi(.LCPI17_2) -; RV64-NEXT: addi a2, a2, %lo(.LCPI17_2) +; RV64-NEXT: lui a2, %hi(.LCPI19_2) +; RV64-NEXT: addi a2, a2, %lo(.LCPI19_2) ; RV64-NEXT: li a3, 1040 ; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: addi a1, a1, -2016 @@ -1142,12 +1177,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI17_3) -; RV64-NEXT: addi a1, a1, %lo(.LCPI17_3) +; RV64-NEXT: lui a1, %hi(.LCPI19_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI19_3) ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v20, (a1) -; RV64-NEXT: lui a1, %hi(.LCPI17_4) -; RV64-NEXT: addi a1, a1, %lo(.LCPI17_4) +; RV64-NEXT: lui a1, %hi(.LCPI19_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI19_4) ; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 77 @@ -1198,8 +1233,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vrgatherei16.vv v0, v16, v8 -; RV64-NEXT: lui a1, %hi(.LCPI17_5) -; RV64-NEXT: addi a1, a1, %lo(.LCPI17_5) +; RV64-NEXT: lui a1, %hi(.LCPI19_5) +; RV64-NEXT: addi a1, a1, %lo(.LCPI19_5) ; RV64-NEXT: vle16.v v20, (a1) ; RV64-NEXT: 
csrr a1, vlenb ; RV64-NEXT: li a2, 61 @@ -1643,8 +1678,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV32-NEXT: vle32.v v12, (a0), v0.t ; RV32-NEXT: li a0, 36 ; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI42_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI42_0) +; RV32-NEXT: lui a1, %hi(.LCPI44_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI44_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v21, (a1) ; RV32-NEXT: vcompress.vm v8, v12, v11 @@ -1719,8 +1754,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { ; RV32-NEXT: vmv.s.x v10, a0 ; RV32-NEXT: li a0, 146 ; RV32-NEXT: vmv.s.x v11, a0 -; RV32-NEXT: lui a0, %hi(.LCPI43_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI43_0) +; RV32-NEXT: lui a0, %hi(.LCPI45_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI45_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: li a0, 36 From ae3113201afc3e1d3fe656a969a0def0dc44a1ba Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 18 Apr 2025 13:41:24 -0700 Subject: [PATCH 04/12] fixup! Address reviewer comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index ce54eb130bb4b..66c1cd98ef602 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -249,22 +249,17 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, return false; } -/// Return true if it's a non-all-zeros, interleaving mask. For instance, -/// 111000111000 is interleaved from three 1010 masks. -/// \p SubMask returns the mask of individual lane. +/// Return true if it's an interleaving mask. For instance, 111000111000 is +/// interleaved from three 1010 masks. \p SubMask returns the mask of individual +/// lane. static bool isInterleavedConstantMask(unsigned Factor, ConstantVector *Mask, SmallVectorImpl &LaneMask) { unsigned LaneMaskLen = LaneMask.size(); if (auto *Splat = Mask->getSplatValue()) { - // All-zeros mask. - if (Splat->isZeroValue()) - return false; - // All-ones mask. - std::fill(LaneMask.begin(), LaneMask.end(), - ConstantInt::getTrue(Mask->getContext())); + std::fill(LaneMask.begin(), LaneMask.end(), Splat); } else { for (unsigned Idx = 0U, N = LaneMaskLen * Factor; Idx < N; ++Idx) { - Constant *Ref = Mask->getAggregateElement((Idx / Factor) * Factor); + Constant *Ref = Mask->getAggregateElement(alignDown(Idx, Factor)); if (Ref != Mask->getAggregateElement(Idx)) return false; LaneMask[Idx / Factor] = Ref; From 4a8852f8971dc78d194bd4ac736f077ef0de8415 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Mon, 21 Apr 2025 10:34:55 -0700 Subject: [PATCH 05/12] fixup! Address review comments --- .../rvv/fixed-vectors-interleaved-access.ll | 97 ++++++++++++------- 1 file changed, 61 insertions(+), 36 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 3df75060808b1..c0a6456f01075 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -294,6 +294,31 @@ define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) { ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3 } +; TODO: Add more tests for vp.load/store + (de)interleave intrinsics with fixed vectors. 
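The vp.load/vp.store tests in this file exercise the invariant the pass checks for constant masks: a wide mask is only accepted when it is an interleaving of a single per-lane mask, e.g. for factor 3 the wide mask 111000111000 de-interleaves into the lane mask 1010, while a mask whose groups of Factor elements are not homogeneous (as in the invalid_vp_mask test) is rejected. Below is a minimal standalone C++ sketch of that grouping check; the helper name is made up and plain bools stand in for the Constant* elements the pass actually inspects, so it illustrates the logic only, not the LLVM implementation.

#include <cassert>
#include <cstdio>
#include <optional>
#include <vector>

// De-interleave a constant wide mask into one per-lane mask. Mirrors the
// element-wise walk in the pass: every group of Factor consecutive elements
// must be identical, and group g becomes element g of the lane mask.
static std::optional<std::vector<bool>>
deinterleaveConstantMask(const std::vector<bool> &WideMask, unsigned Factor) {
  assert(Factor > 1 && WideMask.size() % Factor == 0);
  std::vector<bool> LaneMask(WideMask.size() / Factor);
  for (size_t Idx = 0; Idx < WideMask.size(); ++Idx) {
    // Reference element is the first element of this group of Factor lanes,
    // i.e. index alignDown(Idx, Factor).
    bool Ref = WideMask[(Idx / Factor) * Factor];
    if (WideMask[Idx] != Ref)
      return std::nullopt; // Not an interleaving of one lane mask; bail out.
    LaneMask[Idx / Factor] = Ref;
  }
  return LaneMask;
}

int main() {
  // Factor 3, wide mask 111000111000 -> lane mask 1010.
  std::vector<bool> Wide = {1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0};
  if (auto Lane = deinterleaveConstantMask(Wide, 3)) {
    for (bool B : *Lane)
      std::printf("%d", B ? 1 : 0); // prints 1010
    std::printf("\n");
  }
  return 0;
}

Splat masks (all-ones, and in the later getMask form all-zeros as well) are handled separately in the pass before this element-wise walk, so they never reach the homogeneity loop sketched above.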
+define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vpload_factor4_intrinsics(ptr %ptr) { +; CHECK-LABEL: vpload_factor4_intrinsics: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vlseg4e32.v v8, (a0) +; CHECK-NEXT: ret + %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 8) + %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load) + %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0 + %d0.1 = extractvalue { <4 x i32>, <4 x i32> } %d0, 1 + %d1 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.0) + %t0 = extractvalue { <2 x i32>, <2 x i32> } %d1, 0 + %t2 = extractvalue { <2 x i32>, <2 x i32> } %d1, 1 + %d2 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.1) + %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0 + %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1 + + %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> %t0, 0 + %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 + %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2 + %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3 + ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3 +} + define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor5(ptr %ptr) { ; CHECK-LABEL: vpload_factor5: ; CHECK: # %bb.0: @@ -402,8 +427,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a2, 32 ; RV32-NEXT: lui a3, 12 ; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI19_0) -; RV32-NEXT: addi a7, a7, %lo(.LCPI19_0) +; RV32-NEXT: lui a7, %hi(.LCPI20_0) +; RV32-NEXT: addi a7, a7, %lo(.LCPI20_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a5) ; RV32-NEXT: vmv.s.x v0, a3 @@ -488,12 +513,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: lui a7, 49164 -; RV32-NEXT: lui a1, %hi(.LCPI19_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI19_1) +; RV32-NEXT: lui a1, %hi(.LCPI20_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI20_1) ; RV32-NEXT: lui t2, 3 ; RV32-NEXT: lui t1, 196656 -; RV32-NEXT: lui a4, %hi(.LCPI19_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI19_3) +; RV32-NEXT: lui a4, %hi(.LCPI20_3) +; RV32-NEXT: addi a4, a4, %lo(.LCPI20_3) ; RV32-NEXT: lui t0, 786624 ; RV32-NEXT: li a5, 48 ; RV32-NEXT: lui a6, 768 @@ -672,8 +697,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v24, v8, v2 -; RV32-NEXT: lui a1, %hi(.LCPI19_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI19_2) +; RV32-NEXT: lui a1, %hi(.LCPI20_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI20_2) ; RV32-NEXT: lui a3, 3073 ; RV32-NEXT: addi a3, a3, -1024 ; RV32-NEXT: vmv.s.x v0, a3 @@ -737,16 +762,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v28, v8, v3 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v28, v24 -; RV32-NEXT: lui a1, %hi(.LCPI19_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI19_4) -; RV32-NEXT: lui a2, %hi(.LCPI19_5) -; RV32-NEXT: addi a2, a2, %lo(.LCPI19_5) +; RV32-NEXT: lui a1, %hi(.LCPI20_4) 
+; RV32-NEXT: addi a1, a1, %lo(.LCPI20_4) +; RV32-NEXT: lui a2, %hi(.LCPI20_5) +; RV32-NEXT: addi a2, a2, %lo(.LCPI20_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI19_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI19_7) +; RV32-NEXT: lui a1, %hi(.LCPI20_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI20_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb @@ -774,14 +799,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v16, v0, v10 -; RV32-NEXT: lui a1, %hi(.LCPI19_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI19_6) -; RV32-NEXT: lui a2, %hi(.LCPI19_8) -; RV32-NEXT: addi a2, a2, %lo(.LCPI19_8) +; RV32-NEXT: lui a1, %hi(.LCPI20_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI20_6) +; RV32-NEXT: lui a2, %hi(.LCPI20_8) +; RV32-NEXT: addi a2, a2, %lo(.LCPI20_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI19_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI19_9) +; RV32-NEXT: lui a1, %hi(.LCPI20_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI20_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v6, (a1) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma @@ -868,8 +893,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: li a4, 128 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: vle64.v v8, (a3) -; RV64-NEXT: lui a3, %hi(.LCPI19_0) -; RV64-NEXT: addi a3, a3, %lo(.LCPI19_0) +; RV64-NEXT: lui a3, %hi(.LCPI20_0) +; RV64-NEXT: addi a3, a3, %lo(.LCPI20_0) ; RV64-NEXT: vmv.s.x v0, a4 ; RV64-NEXT: csrr a4, vlenb ; RV64-NEXT: li a5, 61 @@ -1057,8 +1082,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t -; RV64-NEXT: lui a2, %hi(.LCPI19_1) -; RV64-NEXT: addi a2, a2, %lo(.LCPI19_1) +; RV64-NEXT: lui a2, %hi(.LCPI20_1) +; RV64-NEXT: addi a2, a2, %lo(.LCPI20_1) ; RV64-NEXT: li a3, 192 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v6, (a2) @@ -1092,8 +1117,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgatherei16.vv v24, v16, v6 ; RV64-NEXT: addi a2, sp, 16 ; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: lui a2, %hi(.LCPI19_2) -; RV64-NEXT: addi a2, a2, %lo(.LCPI19_2) +; RV64-NEXT: lui a2, %hi(.LCPI20_2) +; RV64-NEXT: addi a2, a2, %lo(.LCPI20_2) ; RV64-NEXT: li a3, 1040 ; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: addi a1, a1, -2016 @@ -1177,12 +1202,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI19_3) -; RV64-NEXT: addi a1, a1, %lo(.LCPI19_3) +; RV64-NEXT: lui a1, %hi(.LCPI20_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI20_3) ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v20, (a1) -; RV64-NEXT: lui a1, %hi(.LCPI19_4) -; RV64-NEXT: addi a1, a1, %lo(.LCPI19_4) +; RV64-NEXT: lui a1, %hi(.LCPI20_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI20_4) ; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: csrr a1, 
vlenb ; RV64-NEXT: li a2, 77 @@ -1233,8 +1258,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vrgatherei16.vv v0, v16, v8 -; RV64-NEXT: lui a1, %hi(.LCPI19_5) -; RV64-NEXT: addi a1, a1, %lo(.LCPI19_5) +; RV64-NEXT: lui a1, %hi(.LCPI20_5) +; RV64-NEXT: addi a1, a1, %lo(.LCPI20_5) ; RV64-NEXT: vle16.v v20, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 61 @@ -1678,8 +1703,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV32-NEXT: vle32.v v12, (a0), v0.t ; RV32-NEXT: li a0, 36 ; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI44_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI44_0) +; RV32-NEXT: lui a1, %hi(.LCPI45_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI45_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v21, (a1) ; RV32-NEXT: vcompress.vm v8, v12, v11 @@ -1754,8 +1779,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { ; RV32-NEXT: vmv.s.x v10, a0 ; RV32-NEXT: li a0, 146 ; RV32-NEXT: vmv.s.x v11, a0 -; RV32-NEXT: lui a0, %hi(.LCPI45_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI45_0) +; RV32-NEXT: lui a0, %hi(.LCPI46_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI46_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: li a0, 36 From 5a09f293f97c2f9d3717212f0731c6c948ed494d Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 25 Apr 2025 10:35:45 -0700 Subject: [PATCH 06/12] fixup! Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 422 ++++++++++---------- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 3 +- 2 files changed, 211 insertions(+), 214 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 66c1cd98ef602..06598d14e8db0 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -249,26 +249,200 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, return false; } -/// Return true if it's an interleaving mask. For instance, 111000111000 is -/// interleaved from three 1010 masks. \p SubMask returns the mask of individual -/// lane. -static bool isInterleavedConstantMask(unsigned Factor, ConstantVector *Mask, - SmallVectorImpl &LaneMask) { - unsigned LaneMaskLen = LaneMask.size(); - if (auto *Splat = Mask->getSplatValue()) { - std::fill(LaneMask.begin(), LaneMask.end(), Splat); - } else { - for (unsigned Idx = 0U, N = LaneMaskLen * Factor; Idx < N; ++Idx) { - Constant *Ref = Mask->getAggregateElement(alignDown(Idx, Factor)); - if (Ref != Mask->getAggregateElement(Idx)) +// For an (de)interleave tree like this: +// +// A C B D +// |___| |___| +// |_____| +// | +// A B C D +// +// We will get ABCD at the end while the leaf operands/results +// are ACBD, which are also what we initially collected in +// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI +// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need +// to reorder them by interleaving these values. +static void interleaveLeafValues(MutableArrayRef SubLeaves) { + unsigned NumLeaves = SubLeaves.size(); + if (NumLeaves == 2) + return; + + assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1); + + const unsigned HalfLeaves = NumLeaves / 2; + // Visit the sub-trees. 
+ interleaveLeafValues(SubLeaves.take_front(HalfLeaves)); + interleaveLeafValues(SubLeaves.drop_front(HalfLeaves)); + + SmallVector Buffer; + // a0 a1 a2 a3 b0 b1 b2 b3 + // -> a0 b0 a1 b1 a2 b2 a3 b3 + for (unsigned i = 0U; i < NumLeaves; ++i) + Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]); + + llvm::copy(Buffer, SubLeaves.begin()); +} + +static bool +getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Operands, + SmallVectorImpl &DeadInsts) { + assert(II->getIntrinsicID() == Intrinsic::vector_interleave2); + + // Visit with BFS + SmallVector Queue; + Queue.push_back(II); + while (!Queue.empty()) { + IntrinsicInst *Current = Queue.front(); + Queue.erase(Queue.begin()); + + // All the intermediate intrinsics will be deleted. + DeadInsts.push_back(Current); + + for (unsigned I = 0; I < 2; ++I) { + Value *Op = Current->getOperand(I); + if (auto *OpII = dyn_cast(Op)) + if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) { + Queue.push_back(OpII); + continue; + } + + // If this is not a perfectly balanced tree, the leaf + // result types would be different. + if (!Operands.empty() && Op->getType() != Operands.back()->getType()) + return false; + + Operands.push_back(Op); + } + } + + const unsigned Factor = Operands.size(); + // Currently we only recognize power-of-two factors. + // FIXME: should we assert here instead? + if (Factor <= 1 || !isPowerOf2_32(Factor)) + return false; + + interleaveLeafValues(Operands); + return true; +} + +static bool +getVectorDeinterleaveFactor(IntrinsicInst *II, + SmallVectorImpl &Results, + SmallVectorImpl &DeadInsts) { + assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2); + using namespace PatternMatch; + if (!II->hasNUses(2)) + return false; + + // Visit with BFS + SmallVector Queue; + Queue.push_back(II); + while (!Queue.empty()) { + IntrinsicInst *Current = Queue.front(); + Queue.erase(Queue.begin()); + assert(Current->hasNUses(2)); + + // All the intermediate intrinsics will be deleted from the bottom-up. + DeadInsts.insert(DeadInsts.begin(), Current); + + ExtractValueInst *LHS = nullptr, *RHS = nullptr; + for (User *Usr : Current->users()) { + if (!isa(Usr)) + return 0; + + auto *EV = cast(Usr); + // Intermediate ExtractValue instructions will also be deleted. + DeadInsts.insert(DeadInsts.begin(), EV); + ArrayRef Indices = EV->getIndices(); + if (Indices.size() != 1) + return false; + + if (Indices[0] == 0 && !LHS) + LHS = EV; + else if (Indices[0] == 1 && !RHS) + RHS = EV; + else + return false; + } + + // We have legal indices. At this point we're either going + // to continue the traversal or push the leaf values into Results. + for (ExtractValueInst *EV : {LHS, RHS}) { + // Continue the traversal. We're playing safe here and matching only the + // expression consisting of a perfectly balanced binary tree in which all + // intermediate values are only used once. + if (EV->hasOneUse() && + match(EV->user_back(), + m_Intrinsic()) && + EV->user_back()->hasNUses(2)) { + auto *EVUsr = cast(EV->user_back()); + Queue.push_back(EVUsr); + continue; + } + + // If this is not a perfectly balanced tree, the leaf + // result types would be different. + if (!Results.empty() && EV->getType() != Results.back()->getType()) return false; - LaneMask[Idx / Factor] = Ref; + + // Save the leaf value. + Results.push_back(EV); } } + const unsigned Factor = Results.size(); + // Currently we only recognize power-of-two factors. + // FIXME: should we assert here instead? 
+ if (Factor <= 1 || !isPowerOf2_32(Factor)) + return 0; + + interleaveLeafValues(Results); return true; } +// Return the corresponded deinterleaved mask, or nullptr if there is no valid +// mask. +static Value *getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC) { + using namespace llvm::PatternMatch; + if (auto *IMI = dyn_cast(WideMask)) { + SmallVector Operands; + SmallVector DeadInsts; + if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { + assert(!Operands.empty()); + if (Operands.size() == Factor && llvm::all_equal(Operands)) + return Operands[0]; + } + } + + if (auto *ConstMask = dyn_cast(WideMask)) { + if (auto *Splat = ConstMask->getSplatValue()) { + // All-ones or all-zeros mask. + return ConstantVector::getSplat(LeafValueEC, Splat); + } else if (LeafValueEC.isFixed()) { + unsigned LeafMaskLen = LeafValueEC.getFixedValue(); + SmallVector LeafMask(LeafMaskLen, nullptr); + // If this is a fixed-length constant mask, each lane / leaf has to + // use the same mask. This is done by checking if every group with Factor + // number of elements in the interleaved mask has homogeneous values. + for (unsigned Idx = 0U, N = LeafMaskLen * Factor; Idx < N; ++Idx) { + Constant *Ref = ConstMask->getAggregateElement(alignDown(Idx, Factor)); + if (Ref != ConstMask->getAggregateElement(Idx)) + return nullptr; + LeafMask[Idx / Factor] = Ref; + } + + return ConstantVector::get(LeafMask); + } + } + + return nullptr; +} +static Value *getMask(Value *WideMask, unsigned Factor, + VectorType *LeafValueTy) { + return getMask(WideMask, Factor, LeafValueTy->getElementCount()); +} + bool InterleavedAccessImpl::lowerInterleavedLoad( Instruction *LoadOp, SmallSetVector &DeadInsts) { if (isa(LoadOp->getType())) @@ -279,7 +453,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( return false; } else if (auto *VPLoad = dyn_cast(LoadOp)) { assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load); - // Require a constant mask and evl. + // Require a constant mask. if (!isa(VPLoad->getArgOperand(1))) return false; } else { @@ -373,29 +547,27 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( bool BinOpShuffleChanged = replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LoadOp); - // Check if the de-interleaved vp.load masks are the same. - unsigned ShuffleMaskLen = Shuffles[0]->getShuffleMask().size(); - SmallVector LaneMask(ShuffleMaskLen, nullptr); if (auto *VPLoad = dyn_cast(LoadOp)) { - if (!isInterleavedConstantMask( - Factor, cast(VPLoad->getArgOperand(1)), LaneMask)) + Value *LaneMask = getMask(VPLoad->getArgOperand(1), Factor, + cast(Shuffles[0]->getType())); + if (!LaneMask) return false; - } - LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n"); + LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *LoadOp + << "\n"); - if (auto *VPLoad = dyn_cast(LoadOp)) { - auto *MaskVec = ConstantVector::get(LaneMask); // Sometimes the number of Shuffles might be less than Factor, we have to // fill the gaps with null. Also, lowerDeinterleavedVPLoad // expects them to be sorted. SmallVector ShuffleValues(Factor, nullptr); for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices)) ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx]; - if (!TLI->lowerDeinterleavedVPLoad(VPLoad, MaskVec, ShuffleValues)) + if (!TLI->lowerDeinterleavedVPLoad(VPLoad, LaneMask, ShuffleValues)) // If Extracts is not empty, tryReplaceExtracts made changes earlier. 
return !Extracts.empty() || BinOpShuffleChanged; } else { + LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n"); + // Try to create target specific intrinsics to replace the load and // shuffles. if (!TLI->lowerInterleavedLoad(cast(LoadOp), Shuffles, Indices, @@ -539,17 +711,19 @@ bool InterleavedAccessImpl::lowerInterleavedStore( "number of stored element should be a multiple of Factor"); // Check if the de-interleaved vp.store masks are the same. - unsigned LaneMaskLen = NumStoredElements / Factor; - SmallVector LaneMask(LaneMaskLen, nullptr); if (auto *VPStore = dyn_cast(StoreOp)) { - if (!isInterleavedConstantMask( - Factor, cast(VPStore->getArgOperand(2)), LaneMask)) - return false; } - LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp << "\n"); - if (auto *VPStore = dyn_cast(StoreOp)) { + unsigned LaneMaskLen = NumStoredElements / Factor; + Value *LaneMask = getMask(VPStore->getArgOperand(2), Factor, + ElementCount::getFixed(LaneMaskLen)); + if (!LaneMask) + return false; + + LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *StoreOp + << "\n"); + IRBuilder<> Builder(VPStore); // We need to effectively de-interleave the shufflemask // because lowerInterleavedVPStore expected individual de-interleaved @@ -568,11 +742,13 @@ bool InterleavedAccessImpl::lowerInterleavedStore( // Try to create target specific intrinsics to replace the vp.store and // shuffle. - if (!TLI->lowerInterleavedVPStore(VPStore, ConstantVector::get(LaneMask), - NewShuffles)) + if (!TLI->lowerInterleavedVPStore(VPStore, LaneMask, NewShuffles)) // We already created new shuffles. return true; } else { + LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp + << "\n"); + // Try to create target specific intrinsics to replace the store and // shuffle. if (!TLI->lowerInterleavedStore(cast(StoreOp), SVI, Factor)) @@ -585,184 +761,6 @@ bool InterleavedAccessImpl::lowerInterleavedStore( return true; } -// For an (de)interleave tree like this: -// -// A C B D -// |___| |___| -// |_____| -// | -// A B C D -// -// We will get ABCD at the end while the leaf operands/results -// are ACBD, which are also what we initially collected in -// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI -// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need -// to reorder them by interleaving these values. -static void interleaveLeafValues(MutableArrayRef SubLeaves) { - unsigned NumLeaves = SubLeaves.size(); - if (NumLeaves == 2) - return; - - assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1); - - const unsigned HalfLeaves = NumLeaves / 2; - // Visit the sub-trees. - interleaveLeafValues(SubLeaves.take_front(HalfLeaves)); - interleaveLeafValues(SubLeaves.drop_front(HalfLeaves)); - - SmallVector Buffer; - // a0 a1 a2 a3 b0 b1 b2 b3 - // -> a0 b0 a1 b1 a2 b2 a3 b3 - for (unsigned i = 0U; i < NumLeaves; ++i) - Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]); - - llvm::copy(Buffer, SubLeaves.begin()); -} - -static bool -getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Operands, - SmallVectorImpl &DeadInsts) { - assert(II->getIntrinsicID() == Intrinsic::vector_interleave2); - - // Visit with BFS - SmallVector Queue; - Queue.push_back(II); - while (!Queue.empty()) { - IntrinsicInst *Current = Queue.front(); - Queue.erase(Queue.begin()); - - // All the intermediate intrinsics will be deleted. 
- DeadInsts.push_back(Current); - - for (unsigned I = 0; I < 2; ++I) { - Value *Op = Current->getOperand(I); - if (auto *OpII = dyn_cast(Op)) - if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) { - Queue.push_back(OpII); - continue; - } - - // If this is not a perfectly balanced tree, the leaf - // result types would be different. - if (!Operands.empty() && Op->getType() != Operands.back()->getType()) - return false; - - Operands.push_back(Op); - } - } - - const unsigned Factor = Operands.size(); - // Currently we only recognize power-of-two factors. - // FIXME: should we assert here instead? - if (Factor <= 1 || !isPowerOf2_32(Factor)) - return false; - - interleaveLeafValues(Operands); - return true; -} - -static bool -getVectorDeinterleaveFactor(IntrinsicInst *II, - SmallVectorImpl &Results, - SmallVectorImpl &DeadInsts) { - assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2); - using namespace PatternMatch; - if (!II->hasNUses(2)) - return false; - - // Visit with BFS - SmallVector Queue; - Queue.push_back(II); - while (!Queue.empty()) { - IntrinsicInst *Current = Queue.front(); - Queue.erase(Queue.begin()); - assert(Current->hasNUses(2)); - - // All the intermediate intrinsics will be deleted from the bottom-up. - DeadInsts.insert(DeadInsts.begin(), Current); - - ExtractValueInst *LHS = nullptr, *RHS = nullptr; - for (User *Usr : Current->users()) { - if (!isa(Usr)) - return 0; - - auto *EV = cast(Usr); - // Intermediate ExtractValue instructions will also be deleted. - DeadInsts.insert(DeadInsts.begin(), EV); - ArrayRef Indices = EV->getIndices(); - if (Indices.size() != 1) - return false; - - if (Indices[0] == 0 && !LHS) - LHS = EV; - else if (Indices[0] == 1 && !RHS) - RHS = EV; - else - return false; - } - - // We have legal indices. At this point we're either going - // to continue the traversal or push the leaf values into Results. - for (ExtractValueInst *EV : {LHS, RHS}) { - // Continue the traversal. We're playing safe here and matching only the - // expression consisting of a perfectly balanced binary tree in which all - // intermediate values are only used once. - if (EV->hasOneUse() && - match(EV->user_back(), - m_Intrinsic()) && - EV->user_back()->hasNUses(2)) { - auto *EVUsr = cast(EV->user_back()); - Queue.push_back(EVUsr); - continue; - } - - // If this is not a perfectly balanced tree, the leaf - // result types would be different. - if (!Results.empty() && EV->getType() != Results.back()->getType()) - return false; - - // Save the leaf value. - Results.push_back(EV); - } - } - - const unsigned Factor = Results.size(); - // Currently we only recognize power-of-two factors. - // FIXME: should we assert here instead? - if (Factor <= 1 || !isPowerOf2_32(Factor)) - return 0; - - interleaveLeafValues(Results); - return true; -} - -// Return the corresponded deinterleaved mask, or nullptr if there is no valid -// mask. -static Value *getMask(Value *WideMask, unsigned Factor, - VectorType *LeafValueTy) { - using namespace llvm::PatternMatch; - if (auto *IMI = dyn_cast(WideMask)) { - SmallVector Operands; - SmallVector DeadInsts; - if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { - assert(!Operands.empty()); - if (Operands.size() == Factor && llvm::all_equal(Operands)) - return Operands[0]; - } - } - - if (match(WideMask, m_AllOnes())) { - // Scale the vector length of all-ones mask. 
- ElementCount OrigEC = - cast(WideMask->getType())->getElementCount(); - assert(OrigEC.getKnownMinValue() % Factor == 0); - return ConstantVector::getSplat(OrigEC.divideCoefficientBy(Factor), - cast(WideMask)->getSplatValue()); - } - - return nullptr; -} - bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( IntrinsicInst *DI, SmallSetVector &DeadInsts) { Value *LoadedVal = DI->getOperand(0); @@ -780,7 +778,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( if (auto *VPLoad = dyn_cast(LoadedVal)) { if (VPLoad->getIntrinsicID() != Intrinsic::vp_load) return false; - // Check mask operand. Handle both all-true and interleaved mask. + // Check mask operand. Handle both all-true/false and interleaved mask. Value *WideMask = VPLoad->getOperand(1); Value *Mask = getMask(WideMask, Factor, cast(DeinterleaveValues[0]->getType())); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index df545f5e06c18..797ec6f8e0492 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -23933,8 +23933,7 @@ bool RISCVTargetLowering::lowerInterleavedVPStore( Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, Intrinsic::riscv_seg8_store_mask}; - SmallVector Operands(InterleaveOperands.begin(), - InterleaveOperands.end()); + SmallVector Operands(InterleaveOperands); Operands.append({Store->getArgOperand(1), Mask, EVL}); Builder.CreateIntrinsic(FixedMaskedVssegIntrIds[Factor - 2], {FVTy, XLenTy}, Operands); From a41bfb1ef6db37029e7184e41c62a45373175cf9 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 25 Apr 2025 11:56:30 -0700 Subject: [PATCH 07/12] fixup! Fix an error caused by the merge --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 49a8959b7e5dd..06598d14e8db0 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -780,7 +780,8 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; // Check mask operand. Handle both all-true/false and interleaved mask. Value *WideMask = VPLoad->getOperand(1); - Value *Mask = getMask(WideMask, Factor); + Value *Mask = getMask(WideMask, Factor, + cast(DeinterleaveValues[0]->getType())); if (!Mask) return false; @@ -831,7 +832,8 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( return false; Value *WideMask = VPStore->getOperand(2); - Value *Mask = getMask(WideMask, Factor); + Value *Mask = getMask(WideMask, Factor, + cast(InterleaveValues[0]->getType())); if (!Mask) return false; From e46d311a2d1dde60640d25d99214f802a683daf7 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 30 Apr 2025 14:08:27 -0700 Subject: [PATCH 08/12] fixup! 
Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 377 +++++++++++---------- 1 file changed, 190 insertions(+), 187 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 06598d14e8db0..9273e42e90716 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -249,195 +249,9 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, return false; } -// For an (de)interleave tree like this: -// -// A C B D -// |___| |___| -// |_____| -// | -// A B C D -// -// We will get ABCD at the end while the leaf operands/results -// are ACBD, which are also what we initially collected in -// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI -// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need -// to reorder them by interleaving these values. -static void interleaveLeafValues(MutableArrayRef SubLeaves) { - unsigned NumLeaves = SubLeaves.size(); - if (NumLeaves == 2) - return; - - assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1); - - const unsigned HalfLeaves = NumLeaves / 2; - // Visit the sub-trees. - interleaveLeafValues(SubLeaves.take_front(HalfLeaves)); - interleaveLeafValues(SubLeaves.drop_front(HalfLeaves)); - - SmallVector Buffer; - // a0 a1 a2 a3 b0 b1 b2 b3 - // -> a0 b0 a1 b1 a2 b2 a3 b3 - for (unsigned i = 0U; i < NumLeaves; ++i) - Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]); - - llvm::copy(Buffer, SubLeaves.begin()); -} - -static bool -getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Operands, - SmallVectorImpl &DeadInsts) { - assert(II->getIntrinsicID() == Intrinsic::vector_interleave2); - - // Visit with BFS - SmallVector Queue; - Queue.push_back(II); - while (!Queue.empty()) { - IntrinsicInst *Current = Queue.front(); - Queue.erase(Queue.begin()); - - // All the intermediate intrinsics will be deleted. - DeadInsts.push_back(Current); - - for (unsigned I = 0; I < 2; ++I) { - Value *Op = Current->getOperand(I); - if (auto *OpII = dyn_cast(Op)) - if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) { - Queue.push_back(OpII); - continue; - } - - // If this is not a perfectly balanced tree, the leaf - // result types would be different. - if (!Operands.empty() && Op->getType() != Operands.back()->getType()) - return false; - - Operands.push_back(Op); - } - } - - const unsigned Factor = Operands.size(); - // Currently we only recognize power-of-two factors. - // FIXME: should we assert here instead? - if (Factor <= 1 || !isPowerOf2_32(Factor)) - return false; - - interleaveLeafValues(Operands); - return true; -} - -static bool -getVectorDeinterleaveFactor(IntrinsicInst *II, - SmallVectorImpl &Results, - SmallVectorImpl &DeadInsts) { - assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2); - using namespace PatternMatch; - if (!II->hasNUses(2)) - return false; - - // Visit with BFS - SmallVector Queue; - Queue.push_back(II); - while (!Queue.empty()) { - IntrinsicInst *Current = Queue.front(); - Queue.erase(Queue.begin()); - assert(Current->hasNUses(2)); - - // All the intermediate intrinsics will be deleted from the bottom-up. - DeadInsts.insert(DeadInsts.begin(), Current); - - ExtractValueInst *LHS = nullptr, *RHS = nullptr; - for (User *Usr : Current->users()) { - if (!isa(Usr)) - return 0; - - auto *EV = cast(Usr); - // Intermediate ExtractValue instructions will also be deleted. 
- DeadInsts.insert(DeadInsts.begin(), EV); - ArrayRef Indices = EV->getIndices(); - if (Indices.size() != 1) - return false; - - if (Indices[0] == 0 && !LHS) - LHS = EV; - else if (Indices[0] == 1 && !RHS) - RHS = EV; - else - return false; - } - - // We have legal indices. At this point we're either going - // to continue the traversal or push the leaf values into Results. - for (ExtractValueInst *EV : {LHS, RHS}) { - // Continue the traversal. We're playing safe here and matching only the - // expression consisting of a perfectly balanced binary tree in which all - // intermediate values are only used once. - if (EV->hasOneUse() && - match(EV->user_back(), - m_Intrinsic()) && - EV->user_back()->hasNUses(2)) { - auto *EVUsr = cast(EV->user_back()); - Queue.push_back(EVUsr); - continue; - } - - // If this is not a perfectly balanced tree, the leaf - // result types would be different. - if (!Results.empty() && EV->getType() != Results.back()->getType()) - return false; - - // Save the leaf value. - Results.push_back(EV); - } - } - - const unsigned Factor = Results.size(); - // Currently we only recognize power-of-two factors. - // FIXME: should we assert here instead? - if (Factor <= 1 || !isPowerOf2_32(Factor)) - return 0; - - interleaveLeafValues(Results); - return true; -} - -// Return the corresponded deinterleaved mask, or nullptr if there is no valid -// mask. static Value *getMask(Value *WideMask, unsigned Factor, - ElementCount LeafValueEC) { - using namespace llvm::PatternMatch; - if (auto *IMI = dyn_cast(WideMask)) { - SmallVector Operands; - SmallVector DeadInsts; - if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { - assert(!Operands.empty()); - if (Operands.size() == Factor && llvm::all_equal(Operands)) - return Operands[0]; - } - } - - if (auto *ConstMask = dyn_cast(WideMask)) { - if (auto *Splat = ConstMask->getSplatValue()) { - // All-ones or all-zeros mask. - return ConstantVector::getSplat(LeafValueEC, Splat); - } else if (LeafValueEC.isFixed()) { - unsigned LeafMaskLen = LeafValueEC.getFixedValue(); - SmallVector LeafMask(LeafMaskLen, nullptr); - // If this is a fixed-length constant mask, each lane / leaf has to - // use the same mask. This is done by checking if every group with Factor - // number of elements in the interleaved mask has homogeneous values. - for (unsigned Idx = 0U, N = LeafMaskLen * Factor; Idx < N; ++Idx) { - Constant *Ref = ConstMask->getAggregateElement(alignDown(Idx, Factor)); - if (Ref != ConstMask->getAggregateElement(Idx)) - return nullptr; - LeafMask[Idx / Factor] = Ref; - } + ElementCount LeafValueEC); - return ConstantVector::get(LeafMask); - } - } - - return nullptr; -} static Value *getMask(Value *WideMask, unsigned Factor, VectorType *LeafValueTy) { return getMask(WideMask, Factor, LeafValueTy->getElementCount()); @@ -761,6 +575,195 @@ bool InterleavedAccessImpl::lowerInterleavedStore( return true; } +// For an (de)interleave tree like this: +// +// A C B D +// |___| |___| +// |_____| +// | +// A B C D +// +// We will get ABCD at the end while the leaf operands/results +// are ACBD, which are also what we initially collected in +// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI +// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need +// to reorder them by interleaving these values. 
+static void interleaveLeafValues(MutableArrayRef SubLeaves) { + unsigned NumLeaves = SubLeaves.size(); + if (NumLeaves == 2) + return; + + assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1); + + const unsigned HalfLeaves = NumLeaves / 2; + // Visit the sub-trees. + interleaveLeafValues(SubLeaves.take_front(HalfLeaves)); + interleaveLeafValues(SubLeaves.drop_front(HalfLeaves)); + + SmallVector Buffer; + // a0 a1 a2 a3 b0 b1 b2 b3 + // -> a0 b0 a1 b1 a2 b2 a3 b3 + for (unsigned i = 0U; i < NumLeaves; ++i) + Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]); + + llvm::copy(Buffer, SubLeaves.begin()); +} + +static bool +getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Operands, + SmallVectorImpl &DeadInsts) { + assert(II->getIntrinsicID() == Intrinsic::vector_interleave2); + + // Visit with BFS + SmallVector Queue; + Queue.push_back(II); + while (!Queue.empty()) { + IntrinsicInst *Current = Queue.front(); + Queue.erase(Queue.begin()); + + // All the intermediate intrinsics will be deleted. + DeadInsts.push_back(Current); + + for (unsigned I = 0; I < 2; ++I) { + Value *Op = Current->getOperand(I); + if (auto *OpII = dyn_cast(Op)) + if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) { + Queue.push_back(OpII); + continue; + } + + // If this is not a perfectly balanced tree, the leaf + // result types would be different. + if (!Operands.empty() && Op->getType() != Operands.back()->getType()) + return false; + + Operands.push_back(Op); + } + } + + const unsigned Factor = Operands.size(); + // Currently we only recognize power-of-two factors. + // FIXME: should we assert here instead? + if (Factor <= 1 || !isPowerOf2_32(Factor)) + return false; + + interleaveLeafValues(Operands); + return true; +} + +static bool +getVectorDeinterleaveFactor(IntrinsicInst *II, + SmallVectorImpl &Results, + SmallVectorImpl &DeadInsts) { + assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2); + using namespace PatternMatch; + if (!II->hasNUses(2)) + return false; + + // Visit with BFS + SmallVector Queue; + Queue.push_back(II); + while (!Queue.empty()) { + IntrinsicInst *Current = Queue.front(); + Queue.erase(Queue.begin()); + assert(Current->hasNUses(2)); + + // All the intermediate intrinsics will be deleted from the bottom-up. + DeadInsts.insert(DeadInsts.begin(), Current); + + ExtractValueInst *LHS = nullptr, *RHS = nullptr; + for (User *Usr : Current->users()) { + if (!isa(Usr)) + return 0; + + auto *EV = cast(Usr); + // Intermediate ExtractValue instructions will also be deleted. + DeadInsts.insert(DeadInsts.begin(), EV); + ArrayRef Indices = EV->getIndices(); + if (Indices.size() != 1) + return false; + + if (Indices[0] == 0 && !LHS) + LHS = EV; + else if (Indices[0] == 1 && !RHS) + RHS = EV; + else + return false; + } + + // We have legal indices. At this point we're either going + // to continue the traversal or push the leaf values into Results. + for (ExtractValueInst *EV : {LHS, RHS}) { + // Continue the traversal. We're playing safe here and matching only the + // expression consisting of a perfectly balanced binary tree in which all + // intermediate values are only used once. + if (EV->hasOneUse() && + match(EV->user_back(), + m_Intrinsic()) && + EV->user_back()->hasNUses(2)) { + auto *EVUsr = cast(EV->user_back()); + Queue.push_back(EVUsr); + continue; + } + + // If this is not a perfectly balanced tree, the leaf + // result types would be different. 
+ if (!Results.empty() && EV->getType() != Results.back()->getType()) + return false; + + // Save the leaf value. + Results.push_back(EV); + } + } + + const unsigned Factor = Results.size(); + // Currently we only recognize power-of-two factors. + // FIXME: should we assert here instead? + if (Factor <= 1 || !isPowerOf2_32(Factor)) + return 0; + + interleaveLeafValues(Results); + return true; +} + +// Return the corresponded deinterleaved mask, or nullptr if there is no valid +// mask. +static Value *getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC) { + if (auto *IMI = dyn_cast(WideMask)) { + SmallVector Operands; + SmallVector DeadInsts; + if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { + assert(!Operands.empty()); + if (Operands.size() == Factor && llvm::all_equal(Operands)) + return Operands[0]; + } + } + + if (auto *ConstMask = dyn_cast(WideMask)) { + if (auto *Splat = ConstMask->getSplatValue()) { + // All-ones or all-zeros mask. + return ConstantVector::getSplat(LeafValueEC, Splat); + } else if (LeafValueEC.isFixed()) { + unsigned LeafMaskLen = LeafValueEC.getFixedValue(); + SmallVector LeafMask(LeafMaskLen, nullptr); + // If this is a fixed-length constant mask, each lane / leaf has to + // use the same mask. This is done by checking if every group with Factor + // number of elements in the interleaved mask has homogeneous values. + for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) { + Constant *Ref = ConstMask->getAggregateElement(alignDown(Idx, Factor)); + if (Ref != ConstMask->getAggregateElement(Idx)) + return nullptr; + LeafMask[Idx / Factor] = Ref; + } + + return ConstantVector::get(LeafMask); + } + } + + return nullptr; +} + bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( IntrinsicInst *DI, SmallSetVector &DeadInsts) { Value *LoadedVal = DI->getOperand(0); From b3838516c45848e7ec26fb9c20aa0c0fe482fc71 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 30 Apr 2025 14:10:29 -0700 Subject: [PATCH 09/12] fixup! fixup! Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 9273e42e90716..7b77efbcf0774 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -249,6 +249,8 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, return false; } +// Return the corresponded deinterleaved mask, or nullptr if there is no valid +// mask. static Value *getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC); @@ -726,8 +728,6 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, return true; } -// Return the corresponded deinterleaved mask, or nullptr if there is no valid -// mask. static Value *getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC) { if (auto *IMI = dyn_cast(WideMask)) { From 6c03ffa91dae4c625f6d2bc41eefbd4afe738f68 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Thu, 1 May 2025 19:09:53 -0700 Subject: [PATCH 10/12] fixup! 
Address review comments --- llvm/include/llvm/CodeGen/TargetLowering.h | 7 +- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 79 ++++++++++----------- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- llvm/lib/Target/RISCV/RISCVISelLowering.h | 5 +- 4 files changed, 43 insertions(+), 50 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 3eee28741bf17..03099e9ad44dc 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3210,15 +3210,14 @@ class TargetLoweringBase { return false; } - /// Lower a deinterleaved load to target specific intrinsics. Return + /// Lower an interleaved load to target specific intrinsics. Return /// true on success. /// /// \p Load is a vp.load instruction. /// \p Mask is a mask value /// \p DeinterleaveRes is a list of deinterleaved results. - virtual bool - lowerDeinterleavedVPLoad(VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveRes) const { + virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask, + ArrayRef DeinterleaveRes) const { return false; } diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 7b77efbcf0774..f70b8924ef21b 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -100,11 +100,11 @@ class InterleavedAccessImpl { unsigned MaxFactor = 0u; /// Transform an interleaved load into target specific intrinsics. - bool lowerInterleavedLoad(Instruction *LoadOp, + bool lowerInterleavedLoad(Instruction *Load, SmallSetVector &DeadInsts); /// Transform an interleaved store into target specific intrinsics. - bool lowerInterleavedStore(Instruction *StoreOp, + bool lowerInterleavedStore(Instruction *Store, SmallSetVector &DeadInsts); /// Transform a load and a deinterleave intrinsic into target specific @@ -260,14 +260,14 @@ static Value *getMask(Value *WideMask, unsigned Factor, } bool InterleavedAccessImpl::lowerInterleavedLoad( - Instruction *LoadOp, SmallSetVector &DeadInsts) { - if (isa(LoadOp->getType())) + Instruction *Load, SmallSetVector &DeadInsts) { + if (isa(Load->getType())) return false; - if (auto *LI = dyn_cast(LoadOp)) { + if (auto *LI = dyn_cast(Load)) { if (!LI->isSimple()) return false; - } else if (auto *VPLoad = dyn_cast(LoadOp)) { + } else if (auto *VPLoad = dyn_cast(Load)) { assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load); // Require a constant mask. if (!isa(VPLoad->getArgOperand(1))) @@ -287,7 +287,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( // binop are the same load. SmallSetVector BinOpShuffles; - for (auto *User : LoadOp->users()) { + for (auto *User : Load->users()) { auto *Extract = dyn_cast(User); if (Extract && isa(Extract->getIndexOperand())) { Extracts.push_back(Extract); @@ -316,7 +316,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( unsigned Factor, Index; unsigned NumLoadElements = - cast(LoadOp->getType())->getNumElements(); + cast(Load->getType())->getNumElements(); auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0]; // Check if the first shufflevector is DE-interleave shuffle. 
if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor, @@ -349,9 +349,9 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( assert(Shuffle->getShuffleMask().size() <= NumLoadElements); - if (cast(Shuffle->getOperand(0))->getOperand(0) == LoadOp) + if (cast(Shuffle->getOperand(0))->getOperand(0) == Load) Indices.push_back(Index); - if (cast(Shuffle->getOperand(0))->getOperand(1) == LoadOp) + if (cast(Shuffle->getOperand(0))->getOperand(1) == Load) Indices.push_back(Index); } @@ -361,32 +361,31 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( return false; bool BinOpShuffleChanged = - replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LoadOp); + replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load); - if (auto *VPLoad = dyn_cast(LoadOp)) { + if (auto *VPLoad = dyn_cast(Load)) { Value *LaneMask = getMask(VPLoad->getArgOperand(1), Factor, cast(Shuffles[0]->getType())); if (!LaneMask) return false; - LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *LoadOp - << "\n"); + LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n"); // Sometimes the number of Shuffles might be less than Factor, we have to - // fill the gaps with null. Also, lowerDeinterleavedVPLoad + // fill the gaps with null. Also, lowerInterleavedVPLoad // expects them to be sorted. SmallVector ShuffleValues(Factor, nullptr); for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices)) ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx]; - if (!TLI->lowerDeinterleavedVPLoad(VPLoad, LaneMask, ShuffleValues)) + if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues)) // If Extracts is not empty, tryReplaceExtracts made changes earlier. return !Extracts.empty() || BinOpShuffleChanged; } else { - LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n"); + LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); // Try to create target specific intrinsics to replace the load and // shuffles. - if (!TLI->lowerInterleavedLoad(cast(LoadOp), Shuffles, Indices, + if (!TLI->lowerInterleavedLoad(cast(Load), Shuffles, Indices, Factor)) // If Extracts is not empty, tryReplaceExtracts made changes earlier. 
return !Extracts.empty() || BinOpShuffleChanged; @@ -394,13 +393,13 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( DeadInsts.insert_range(Shuffles); - DeadInsts.insert(LoadOp); + DeadInsts.insert(Load); return true; } bool InterleavedAccessImpl::replaceBinOpShuffles( ArrayRef BinOpShuffles, - SmallVectorImpl &Shuffles, Instruction *LoadOp) { + SmallVectorImpl &Shuffles, Instruction *Load) { for (auto *SVI : BinOpShuffles) { BinaryOperator *BI = cast(SVI->getOperand(0)); Type *BIOp0Ty = BI->getOperand(0)->getType(); @@ -423,9 +422,9 @@ bool InterleavedAccessImpl::replaceBinOpShuffles( << "\n With : " << *NewSVI1 << "\n And : " << *NewSVI2 << "\n And : " << *NewBI << "\n"); RecursivelyDeleteTriviallyDeadInstructions(SVI); - if (NewSVI1->getOperand(0) == LoadOp) + if (NewSVI1->getOperand(0) == Load) Shuffles.push_back(NewSVI1); - if (NewSVI2->getOperand(0) == LoadOp) + if (NewSVI2->getOperand(0) == Load) Shuffles.push_back(NewSVI2); } @@ -497,13 +496,13 @@ bool InterleavedAccessImpl::tryReplaceExtracts( } bool InterleavedAccessImpl::lowerInterleavedStore( - Instruction *StoreOp, SmallSetVector &DeadInsts) { + Instruction *Store, SmallSetVector &DeadInsts) { Value *StoredValue; - if (auto *SI = dyn_cast(StoreOp)) { + if (auto *SI = dyn_cast(Store)) { if (!SI->isSimple()) return false; StoredValue = SI->getValueOperand(); - } else if (auto *VPStore = dyn_cast(StoreOp)) { + } else if (auto *VPStore = dyn_cast(Store)) { assert(VPStore->getIntrinsicID() == Intrinsic::vp_store); // Require a constant mask. if (!isa(VPStore->getArgOperand(2))) @@ -526,23 +525,19 @@ bool InterleavedAccessImpl::lowerInterleavedStore( assert(NumStoredElements % Factor == 0 && "number of stored element should be a multiple of Factor"); - // Check if the de-interleaved vp.store masks are the same. - if (auto *VPStore = dyn_cast(StoreOp)) { - } - - if (auto *VPStore = dyn_cast(StoreOp)) { + if (auto *VPStore = dyn_cast(Store)) { unsigned LaneMaskLen = NumStoredElements / Factor; Value *LaneMask = getMask(VPStore->getArgOperand(2), Factor, ElementCount::getFixed(LaneMaskLen)); if (!LaneMask) return false; - LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *StoreOp + LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *Store << "\n"); IRBuilder<> Builder(VPStore); // We need to effectively de-interleave the shufflemask - // because lowerInterleavedVPStore expected individual de-interleaved + // because lowerInterleavedVPStore expects individual de-interleaved // values. SmallVector NewShuffles; SmallVector NewShuffleMask(LaneMaskLen); @@ -562,17 +557,16 @@ bool InterleavedAccessImpl::lowerInterleavedStore( // We already created new shuffles. return true; } else { - LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp - << "\n"); + LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n"); // Try to create target specific intrinsics to replace the store and // shuffle. - if (!TLI->lowerInterleavedStore(cast(StoreOp), SVI, Factor)) + if (!TLI->lowerInterleavedStore(cast(Store), SVI, Factor)) return false; } // Already have a new target specific interleaved store. Erase the old store. - DeadInsts.insert(StoreOp); + DeadInsts.insert(Store); DeadInsts.insert(SVI); return true; } @@ -741,20 +735,21 @@ static Value *getMask(Value *WideMask, unsigned Factor, } if (auto *ConstMask = dyn_cast(WideMask)) { - if (auto *Splat = ConstMask->getSplatValue()) { + if (auto *Splat = ConstMask->getSplatValue()) // All-ones or all-zeros mask. 
return ConstantVector::getSplat(LeafValueEC, Splat); - } else if (LeafValueEC.isFixed()) { + + if (LeafValueEC.isFixed()) { unsigned LeafMaskLen = LeafValueEC.getFixedValue(); SmallVector LeafMask(LeafMaskLen, nullptr); // If this is a fixed-length constant mask, each lane / leaf has to // use the same mask. This is done by checking if every group with Factor // number of elements in the interleaved mask has homogeneous values. for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) { - Constant *Ref = ConstMask->getAggregateElement(alignDown(Idx, Factor)); - if (Ref != ConstMask->getAggregateElement(Idx)) + Constant *C = ConstMask->getAggregateElement(Idx); + if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C) return nullptr; - LeafMask[Idx / Factor] = Ref; + LeafMask[Idx / Factor] = C; } return ConstantVector::get(LeafMask); @@ -793,7 +788,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special // TLI function to emit target-specific interleaved instruction. - if (!TLI->lowerDeinterleavedVPLoad(VPLoad, Mask, DeinterleaveValues)) + if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, DeinterleaveValues)) return false; } else { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 7ea64c9b7ca7b..d7c7b54f559b1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -23999,7 +23999,7 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { /// TODO: We probably can loosen the dependency on matching extractvalue when /// dealing with factor of 2 (extractvalue is still required for most of other /// factors though). -bool RISCVTargetLowering::lowerDeinterleavedVPLoad( +bool RISCVTargetLowering::lowerInterleavedVPLoad( VPIntrinsic *Load, Value *Mask, ArrayRef DeinterleaveResults) const { assert(Mask && "Expect a valid mask"); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index a77c6f1208643..7806724b263b3 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -931,9 +931,8 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleaveIntrinsicToStore( StoreInst *SI, ArrayRef InterleaveValues) const override; - bool - lowerDeinterleavedVPLoad(VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveRes) const override; + bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask, + ArrayRef DeinterleaveRes) const override; bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask, ArrayRef InterleaveOps) const override; From 92e8ad64dcc9e9033df2402460977d3432fc4212 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 7 May 2025 13:52:42 -0700 Subject: [PATCH 11/12] fixup! Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 10 +++++----- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index f70b8924ef21b..04d89d61cb6a9 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -270,7 +270,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( } else if (auto *VPLoad = dyn_cast(Load)) { assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load); // Require a constant mask. 
From 92e8ad64dcc9e9033df2402460977d3432fc4212 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu
Date: Wed, 7 May 2025 13:52:42 -0700
Subject: [PATCH 11/12] fixup! Address review comments

---
 llvm/lib/CodeGen/InterleavedAccessPass.cpp  | 10 +++++-----
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index f70b8924ef21b..04d89d61cb6a9 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -270,7 +270,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   } else if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
     assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load);
     // Require a constant mask.
-    if (!isa<Constant>(VPLoad->getArgOperand(1)))
+    if (!isa<Constant>(VPLoad->getMaskParam()))
       return false;
   } else {
     llvm_unreachable("unsupported load operation");
@@ -364,8 +364,8 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
       replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);
 
   if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
-    Value *LaneMask = getMask(VPLoad->getArgOperand(1), Factor,
-                              cast<VectorType>(Shuffles[0]->getType()));
+    Value *LaneMask =
+        getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy));
     if (!LaneMask)
       return false;
 
@@ -505,7 +505,7 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
   } else if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
     assert(VPStore->getIntrinsicID() == Intrinsic::vp_store);
     // Require a constant mask.
-    if (!isa<Constant>(VPStore->getArgOperand(2)))
+    if (!isa<Constant>(VPStore->getMaskParam()))
       return false;
     StoredValue = VPStore->getArgOperand(0);
   } else {
@@ -527,7 +527,7 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
   if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
     unsigned LaneMaskLen = NumStoredElements / Factor;
-    Value *LaneMask = getMask(VPStore->getArgOperand(2), Factor,
+    Value *LaneMask = getMask(VPStore->getMaskParam(), Factor,
                               ElementCount::getFixed(LaneMaskLen));
     if (!LaneMask)
       return false;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d7c7b54f559b1..0aa86ad8ebd58 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -24022,7 +24022,7 @@ bool RISCVTargetLowering::lowerInterleavedVPLoad(
 
   IRBuilder<> Builder(Load);
 
-  Value *WideEVL = Load->getArgOperand(2);
+  Value *WideEVL = Load->getVectorLengthParam();
   // Conservatively check if EVL is a multiple of factor, otherwise some
   // (trailing) elements might be lost after the transformation.
   if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor))
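The fixup above moves to the VPIntrinsic accessors (getMaskParam, getVectorLengthParam) and keeps the conservative EVL handling noted in the RISC-V comment: the wide EVL counts interleaved elements, so the lowering only proceeds when it divides evenly by the factor, and each segment then uses the quotient. The following is a minimal standalone sketch of that check; segmentEVL is a hypothetical name, not the actual helper.

#include <cassert>
#include <cstdint>
#include <optional>

// The wide vp.load/vp.store EVL counts interleaved elements, so it is only
// safe to lower when it is a multiple of the factor; the segment access then
// operates on EVL / Factor elements per member.
static std::optional<uint64_t> segmentEVL(uint64_t WideEVL, unsigned Factor) {
  assert(Factor != 0 && "interleave factor must be non-zero");
  if (WideEVL % Factor != 0)
    return std::nullopt; // Trailing elements would be lost; keep the vp op.
  return WideEVL / Factor;
}

int main() {
  // Factor 4: EVL 32 lowers to a segment EVL of 8; EVL 30 is rejected.
  return (segmentEVL(32, 4) == 8 && !segmentEVL(30, 4)) ? 0 : 1;
}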
From 1c0a4a190b77ed498294a310160f53410a604a77 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu
Date: Wed, 7 May 2025 14:27:07 -0700
Subject: [PATCH 12/12] fixup! Add more tests for factor of 7 and 8

---
 .../rvv/fixed-vectors-interleaved-access.ll | 75 ++++++++++++++++++-
 1 file changed, 71 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index c0a6456f01075..8ac4c7447c7d4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -1431,6 +1431,40 @@ define void @store_factor6(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2
   ret void
 }
 
+define void @store_factor7(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6) {
+; CHECK-LABEL: store_factor7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsseg7e16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s4 = shufflevector <2 x i16> %v6, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <14 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
+  store <14 x i16> %interleaved.vec, ptr %ptr
+  ret void
+}
+
+define void @store_factor8(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6, <2 x i16> %v7) {
+; CHECK-LABEL: store_factor8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsseg8e16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s4 = shufflevector <2 x i16> %v6, <2 x i16> %v7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  store <16 x i16> %interleaved.vec, ptr %ptr
+  ret void
+}
+
 define void @vpstore_factor2(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: vpstore_factor2:
 ; CHECK:       # %bb.0:
@@ -1512,6 +1546,39 @@ define void @vpstore_factor6(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %
   ret void
 }
 
+define void @vpstore_factor7(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6) {
+; CHECK-LABEL: vpstore_factor7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsseg7e16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s4 = shufflevector <2 x i16> %v6, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <14 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
+  tail call void @llvm.vp.store.v14i16.p0(<14 x i16> %interleaved.vec, ptr %ptr, <14 x i1> splat (i1 true), i32 14)
+  ret void
+}
+
+define void @vpstore_factor8(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6, <2 x i16> %v7) {
+; CHECK-LABEL: vpstore_factor8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsseg8e16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s4 = shufflevector <2 x i16> %v6, <2 x i16> %v7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  tail call void @llvm.vp.store.v16i16.p0(<16 x i16> %interleaved.vec, ptr %ptr, <16 x i1> splat (i1 true), i32 16)
+  ret void
+}
 
 define <4 x i32> @load_factor2_one_active(ptr %ptr) {
 ; CHECK-LABEL: load_factor2_one_active:
@@ -1703,8 +1770,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) {
 ; RV32-NEXT:    vle32.v v12, (a0), v0.t
 ; RV32-NEXT:    li a0, 36
 ; RV32-NEXT:    vmv.s.x v20, a1
-; RV32-NEXT:    lui a1, %hi(.LCPI45_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI45_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI49_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI49_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v21, (a1)
 ; RV32-NEXT:    vcompress.vm v8, v12, v11
@@ -1779,8 +1846,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) {
 ; RV32-NEXT:    vmv.s.x v10, a0
 ; RV32-NEXT:    li a0, 146
 ; RV32-NEXT:    vmv.s.x v11, a0
-; RV32-NEXT:    lui a0, %hi(.LCPI46_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI46_0)
+; RV32-NEXT:    lui a0, %hi(.LCPI50_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI50_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v20, (a0)
 ; RV32-NEXT:    li a0, 36