diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index abe261728a3e6..03099e9ad44dc 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3216,8 +3216,7 @@ class TargetLoweringBase { /// \p Load is a vp.load instruction. /// \p Mask is a mask value /// \p DeinterleaveRes is a list of deinterleaved results. - virtual bool - lowerDeinterleavedIntrinsicToVPLoad(VPIntrinsic *Load, Value *Mask, + virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask, ArrayRef DeinterleaveRes) const { return false; } @@ -3228,9 +3227,8 @@ class TargetLoweringBase { /// \p Store is the vp.store instruction. /// \p Mask is a mask value /// \p InterleaveOps is a list of values being interleaved. - virtual bool - lowerInterleavedIntrinsicToVPStore(VPIntrinsic *Store, Value *Mask, - ArrayRef InterleaveOps) const { + virtual bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask, + ArrayRef InterleaveOps) const { return false; } diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td index 99cb557d9aa09..7da11b93f6b74 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td @@ -1705,12 +1705,23 @@ let TargetPrefix = "riscv" in { // Segment loads/stores for fixed vectors. foreach nf = [2, 3, 4, 5, 6, 7, 8] in { + // Input: (pointer, vl) def int_riscv_seg # nf # _load : DefaultAttrsIntrinsic, !add(nf, -1))), [llvm_anyptr_ty, llvm_anyint_ty], [NoCapture>, IntrReadMem]>; + // Input: (pointer, mask, vl) + def int_riscv_seg # nf # _load_mask + : DefaultAttrsIntrinsic, + !add(nf, -1))), + [llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyint_ty], + [NoCapture>, IntrReadMem]>; + + // Input: (, pointer, vl) def int_riscv_seg # nf # _store : DefaultAttrsIntrinsic<[], !listconcat([llvm_anyvector_ty], @@ -1718,6 +1729,15 @@ let TargetPrefix = "riscv" in { !add(nf, -1)), [llvm_anyptr_ty, llvm_anyint_ty]), [NoCapture>, IntrWriteMem]>; + // Input: (, pointer, mask, vl) + def int_riscv_seg # nf # _store_mask + : DefaultAttrsIntrinsic<[], + !listconcat([llvm_anyvector_ty], + !listsplat(LLVMMatchType<0>, + !add(nf, -1)), + [llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyint_ty]), + [NoCapture>, IntrWriteMem]>; } } // TargetPrefix = "riscv" diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 73f41c05711b7..04d89d61cb6a9 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -100,11 +100,11 @@ class InterleavedAccessImpl { unsigned MaxFactor = 0u; /// Transform an interleaved load into target specific intrinsics. - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, SmallSetVector &DeadInsts); /// Transform an interleaved store into target specific intrinsics. - bool lowerInterleavedStore(StoreInst *SI, + bool lowerInterleavedStore(Instruction *Store, SmallSetVector &DeadInsts); /// Transform a load and a deinterleave intrinsic into target specific @@ -131,7 +131,7 @@ class InterleavedAccessImpl { /// made. 
bool replaceBinOpShuffles(ArrayRef BinOpShuffles, SmallVectorImpl &Shuffles, - LoadInst *LI); + Instruction *LI); }; class InterleavedAccess : public FunctionPass { @@ -249,11 +249,33 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, return false; } +// Return the corresponded deinterleaved mask, or nullptr if there is no valid +// mask. +static Value *getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC); + +static Value *getMask(Value *WideMask, unsigned Factor, + VectorType *LeafValueTy) { + return getMask(WideMask, Factor, LeafValueTy->getElementCount()); +} + bool InterleavedAccessImpl::lowerInterleavedLoad( - LoadInst *LI, SmallSetVector &DeadInsts) { - if (!LI->isSimple() || isa(LI->getType())) + Instruction *Load, SmallSetVector &DeadInsts) { + if (isa(Load->getType())) return false; + if (auto *LI = dyn_cast(Load)) { + if (!LI->isSimple()) + return false; + } else if (auto *VPLoad = dyn_cast(Load)) { + assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load); + // Require a constant mask. + if (!isa(VPLoad->getMaskParam())) + return false; + } else { + llvm_unreachable("unsupported load operation"); + } + // Check if all users of this load are shufflevectors. If we encounter any // users that are extractelement instructions or binary operators, we save // them to later check if they can be modified to extract from one of the @@ -265,7 +287,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( // binop are the same load. SmallSetVector BinOpShuffles; - for (auto *User : LI->users()) { + for (auto *User : Load->users()) { auto *Extract = dyn_cast(User); if (Extract && isa(Extract->getIndexOperand())) { Extracts.push_back(Extract); @@ -294,7 +316,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( unsigned Factor, Index; unsigned NumLoadElements = - cast(LI->getType())->getNumElements(); + cast(Load->getType())->getNumElements(); auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0]; // Check if the first shufflevector is DE-interleave shuffle. if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor, @@ -327,9 +349,9 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( assert(Shuffle->getShuffleMask().size() <= NumLoadElements); - if (cast(Shuffle->getOperand(0))->getOperand(0) == LI) + if (cast(Shuffle->getOperand(0))->getOperand(0) == Load) Indices.push_back(Index); - if (cast(Shuffle->getOperand(0))->getOperand(1) == LI) + if (cast(Shuffle->getOperand(0))->getOperand(1) == Load) Indices.push_back(Index); } @@ -339,25 +361,45 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( return false; bool BinOpShuffleChanged = - replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LI); + replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load); - LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n"); + if (auto *VPLoad = dyn_cast(Load)) { + Value *LaneMask = + getMask(VPLoad->getMaskParam(), Factor, cast(VecTy)); + if (!LaneMask) + return false; - // Try to create target specific intrinsics to replace the load and shuffles. - if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor)) { - // If Extracts is not empty, tryReplaceExtracts made changes earlier. - return !Extracts.empty() || BinOpShuffleChanged; + LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n"); + + // Sometimes the number of Shuffles might be less than Factor, we have to + // fill the gaps with null. Also, lowerInterleavedVPLoad + // expects them to be sorted. 
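A standalone sketch of that padding and ordering step, for illustration only; the type and function names here are placeholders, not the pass's actual helpers:

// Illustrative only: pad the per-field shuffle results with nullptr and
// order them by field index (0 .. Factor-1, i.e. position within one
// interleaved group) before handing them to the target hook.
#include <cassert>
#include <cstddef>
#include <vector>

struct Shuffle; // stand-in for llvm::ShuffleVectorInst

// Shuffles[K] extracts field Indices[K]; some fields may have no user.
std::vector<Shuffle *> orderByField(const std::vector<Shuffle *> &Shuffles,
                                    const std::vector<unsigned> &Indices,
                                    unsigned Factor) {
  assert(Shuffles.size() == Indices.size() && Shuffles.size() <= Factor);
  // Unused fields stay nullptr; the consumer simply skips them.
  std::vector<Shuffle *> ByField(Factor, nullptr);
  for (size_t K = 0; K < Shuffles.size(); ++K)
    ByField[Indices[K]] = Shuffles[K];
  return ByField;
}

For Factor = 3 with only fields 0 and 2 extracted, this yields {S0, nullptr, S2}; the RISC-V hook later skips the null slots when it rewires the users of each field.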
+ SmallVector ShuffleValues(Factor, nullptr); + for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices)) + ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx]; + if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues)) + // If Extracts is not empty, tryReplaceExtracts made changes earlier. + return !Extracts.empty() || BinOpShuffleChanged; + } else { + LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); + + // Try to create target specific intrinsics to replace the load and + // shuffles. + if (!TLI->lowerInterleavedLoad(cast(Load), Shuffles, Indices, + Factor)) + // If Extracts is not empty, tryReplaceExtracts made changes earlier. + return !Extracts.empty() || BinOpShuffleChanged; } DeadInsts.insert_range(Shuffles); - DeadInsts.insert(LI); + DeadInsts.insert(Load); return true; } bool InterleavedAccessImpl::replaceBinOpShuffles( ArrayRef BinOpShuffles, - SmallVectorImpl &Shuffles, LoadInst *LI) { + SmallVectorImpl &Shuffles, Instruction *Load) { for (auto *SVI : BinOpShuffles) { BinaryOperator *BI = cast(SVI->getOperand(0)); Type *BIOp0Ty = BI->getOperand(0)->getType(); @@ -380,9 +422,9 @@ bool InterleavedAccessImpl::replaceBinOpShuffles( << "\n With : " << *NewSVI1 << "\n And : " << *NewSVI2 << "\n And : " << *NewBI << "\n"); RecursivelyDeleteTriviallyDeadInstructions(SVI); - if (NewSVI1->getOperand(0) == LI) + if (NewSVI1->getOperand(0) == Load) Shuffles.push_back(NewSVI1); - if (NewSVI2->getOperand(0) == LI) + if (NewSVI2->getOperand(0) == Load) Shuffles.push_back(NewSVI2); } @@ -454,27 +496,77 @@ bool InterleavedAccessImpl::tryReplaceExtracts( } bool InterleavedAccessImpl::lowerInterleavedStore( - StoreInst *SI, SmallSetVector &DeadInsts) { - if (!SI->isSimple()) - return false; + Instruction *Store, SmallSetVector &DeadInsts) { + Value *StoredValue; + if (auto *SI = dyn_cast(Store)) { + if (!SI->isSimple()) + return false; + StoredValue = SI->getValueOperand(); + } else if (auto *VPStore = dyn_cast(Store)) { + assert(VPStore->getIntrinsicID() == Intrinsic::vp_store); + // Require a constant mask. + if (!isa(VPStore->getMaskParam())) + return false; + StoredValue = VPStore->getArgOperand(0); + } else { + llvm_unreachable("unsupported store operation"); + } - auto *SVI = dyn_cast(SI->getValueOperand()); + auto *SVI = dyn_cast(StoredValue); if (!SVI || !SVI->hasOneUse() || isa(SVI->getType())) return false; + unsigned NumStoredElements = + cast(SVI->getType())->getNumElements(); // Check if the shufflevector is RE-interleave shuffle. unsigned Factor; if (!isReInterleaveMask(SVI, Factor, MaxFactor)) return false; + assert(NumStoredElements % Factor == 0 && + "number of stored element should be a multiple of Factor"); + + if (auto *VPStore = dyn_cast(Store)) { + unsigned LaneMaskLen = NumStoredElements / Factor; + Value *LaneMask = getMask(VPStore->getMaskParam(), Factor, + ElementCount::getFixed(LaneMaskLen)); + if (!LaneMask) + return false; - LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n"); + LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *Store + << "\n"); - // Try to create target specific intrinsics to replace the store and shuffle. - if (!TLI->lowerInterleavedStore(SI, SVI, Factor)) - return false; + IRBuilder<> Builder(VPStore); + // We need to effectively de-interleave the shufflemask + // because lowerInterleavedVPStore expects individual de-interleaved + // values. 
+ SmallVector NewShuffles; + SmallVector NewShuffleMask(LaneMaskLen); + auto ShuffleMask = SVI->getShuffleMask(); + + for (unsigned i = 0; i < Factor; i++) { + for (unsigned j = 0; j < LaneMaskLen; j++) + NewShuffleMask[j] = ShuffleMask[i + Factor * j]; + + NewShuffles.push_back(Builder.CreateShuffleVector( + SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask)); + } + + // Try to create target specific intrinsics to replace the vp.store and + // shuffle. + if (!TLI->lowerInterleavedVPStore(VPStore, LaneMask, NewShuffles)) + // We already created new shuffles. + return true; + } else { + LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n"); + + // Try to create target specific intrinsics to replace the store and + // shuffle. + if (!TLI->lowerInterleavedStore(cast(Store), SVI, Factor)) + return false; + } // Already have a new target specific interleaved store. Erase the old store. - DeadInsts.insert(SI); + DeadInsts.insert(Store); DeadInsts.insert(SVI); return true; } @@ -630,10 +722,8 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, return true; } -// Return the corresponded deinterleaved mask, or nullptr if there is no valid -// mask. -static Value *getMask(Value *WideMask, unsigned Factor) { - using namespace llvm::PatternMatch; +static Value *getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC) { if (auto *IMI = dyn_cast(WideMask)) { SmallVector Operands; SmallVector DeadInsts; @@ -644,13 +734,26 @@ static Value *getMask(Value *WideMask, unsigned Factor) { } } - if (match(WideMask, m_AllOnes())) { - // Scale the vector length of all-ones mask. - ElementCount OrigEC = - cast(WideMask->getType())->getElementCount(); - assert(OrigEC.getKnownMinValue() % Factor == 0); - return ConstantVector::getSplat(OrigEC.divideCoefficientBy(Factor), - cast(WideMask)->getSplatValue()); + if (auto *ConstMask = dyn_cast(WideMask)) { + if (auto *Splat = ConstMask->getSplatValue()) + // All-ones or all-zeros mask. + return ConstantVector::getSplat(LeafValueEC, Splat); + + if (LeafValueEC.isFixed()) { + unsigned LeafMaskLen = LeafValueEC.getFixedValue(); + SmallVector LeafMask(LeafMaskLen, nullptr); + // If this is a fixed-length constant mask, each lane / leaf has to + // use the same mask. This is done by checking if every group with Factor + // number of elements in the interleaved mask has homogeneous values. + for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) { + Constant *C = ConstMask->getAggregateElement(Idx); + if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C) + return nullptr; + LeafMask[Idx / Factor] = C; + } + + return ConstantVector::get(LeafMask); + } } return nullptr; @@ -673,9 +776,10 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( if (auto *VPLoad = dyn_cast(LoadedVal)) { if (VPLoad->getIntrinsicID() != Intrinsic::vp_load) return false; - // Check mask operand. Handle both all-true and interleaved mask. + // Check mask operand. Handle both all-true/false and interleaved mask. Value *WideMask = VPLoad->getOperand(1); - Value *Mask = getMask(WideMask, Factor); + Value *Mask = getMask(WideMask, Factor, + cast(DeinterleaveValues[0]->getType())); if (!Mask) return false; @@ -684,8 +788,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special // TLI function to emit target-specific interleaved instruction. 
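The constant-mask path of getMask above reduces the wide interleaved mask to a single per-field ("leaf") mask, and gives up when any group of Factor consecutive bits is mixed, since all fields of one interleaved group must be loaded or stored under the same predicate. A standalone sketch of that check, for illustration only (the function name is a placeholder):

// Illustrative only: derive the leaf mask from a fixed-length interleaved
// mask; every group of Factor consecutive bits must be homogeneous.
#include <optional>
#include <vector>

std::optional<std::vector<bool>> deinterleaveMask(const std::vector<bool> &Wide,
                                                  unsigned Factor) {
  unsigned LeafLen = Wide.size() / Factor;
  std::vector<bool> Leaf(LeafLen);
  for (unsigned G = 0; G < LeafLen; ++G) {
    bool Bit = Wide[G * Factor];
    for (unsigned F = 1; F < Factor; ++F)
      if (Wide[G * Factor + F] != Bit)
        return std::nullopt; // mixed group: no valid per-field mask
    Leaf[G] = Bit;
  }
  return Leaf;
}
// E.g. Factor = 3 and Wide = {0,0,0, 1,1,1, 0,0,0, 1,1,1} gives
// Leaf = {0,1,0,1}, which is what the "vmv.v.i v0, 10" in the
// vpload_factor3_mask test below encodes.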
- if (!TLI->lowerDeinterleavedIntrinsicToVPLoad(VPLoad, Mask, - DeinterleaveValues)) + if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, DeinterleaveValues)) return false; } else { @@ -727,7 +830,8 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( return false; Value *WideMask = VPStore->getOperand(2); - Value *Mask = getMask(WideMask, Factor); + Value *Mask = getMask(WideMask, Factor, + cast(InterleaveValues[0]->getType())); if (!Mask) return false; @@ -736,8 +840,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( // Since lowerInterleavedStore expects Shuffle and StoreInst, use special // TLI function to emit target-specific interleaved instruction. - if (!TLI->lowerInterleavedIntrinsicToVPStore(VPStore, Mask, - InterleaveValues)) + if (!TLI->lowerInterleavedVPStore(VPStore, Mask, InterleaveValues)) return false; } else { auto *SI = cast(StoredBy); @@ -763,12 +866,15 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) { SmallSetVector DeadInsts; bool Changed = false; + using namespace PatternMatch; for (auto &I : instructions(F)) { - if (auto *LI = dyn_cast(&I)) - Changed |= lowerInterleavedLoad(LI, DeadInsts); + if (match(&I, m_CombineOr(m_Load(m_Value()), + m_Intrinsic()))) + Changed |= lowerInterleavedLoad(&I, DeadInsts); - if (auto *SI = dyn_cast(&I)) - Changed |= lowerInterleavedStore(SI, DeadInsts); + if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()), + m_Intrinsic()))) + Changed |= lowerInterleavedStore(&I, DeadInsts); if (auto *II = dyn_cast(&I)) { // At present, we only have intrinsics to represent (de)interleaving diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 4bcefae8aed03..0aa86ad8ebd58 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1747,6 +1747,13 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::riscv_seg6_load: case Intrinsic::riscv_seg7_load: case Intrinsic::riscv_seg8_load: + case Intrinsic::riscv_seg2_load_mask: + case Intrinsic::riscv_seg3_load_mask: + case Intrinsic::riscv_seg4_load_mask: + case Intrinsic::riscv_seg5_load_mask: + case Intrinsic::riscv_seg6_load_mask: + case Intrinsic::riscv_seg7_load_mask: + case Intrinsic::riscv_seg8_load_mask: return SetRVVLoadStoreInfo(/*PtrOp*/ 0, /*IsStore*/ false, /*IsUnitStrided*/ false, /*UsePtrVal*/ true); case Intrinsic::riscv_seg2_store: @@ -1760,6 +1767,17 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 2, /*IsStore*/ true, /*IsUnitStrided*/ false, /*UsePtrVal*/ true); + case Intrinsic::riscv_seg2_store_mask: + case Intrinsic::riscv_seg3_store_mask: + case Intrinsic::riscv_seg4_store_mask: + case Intrinsic::riscv_seg5_store_mask: + case Intrinsic::riscv_seg6_store_mask: + case Intrinsic::riscv_seg7_store_mask: + case Intrinsic::riscv_seg8_store_mask: + // Operands are (vec, ..., vec, ptr, mask, vl) + return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 3, + /*IsStore*/ true, + /*IsUnitStrided*/ false, /*UsePtrVal*/ true); case Intrinsic::riscv_vle: case Intrinsic::riscv_vle_mask: case Intrinsic::riscv_vleff: @@ -10569,13 +10587,20 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::riscv_seg5_load: case Intrinsic::riscv_seg6_load: case Intrinsic::riscv_seg7_load: - case Intrinsic::riscv_seg8_load: { + case Intrinsic::riscv_seg8_load: + case Intrinsic::riscv_seg2_load_mask: + case Intrinsic::riscv_seg3_load_mask: + case 
Intrinsic::riscv_seg4_load_mask: + case Intrinsic::riscv_seg5_load_mask: + case Intrinsic::riscv_seg6_load_mask: + case Intrinsic::riscv_seg7_load_mask: + case Intrinsic::riscv_seg8_load_mask: { SDLoc DL(Op); static const Intrinsic::ID VlsegInts[7] = { - Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3, - Intrinsic::riscv_vlseg4, Intrinsic::riscv_vlseg5, - Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7, - Intrinsic::riscv_vlseg8}; + Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, + Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, + Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, + Intrinsic::riscv_vlseg8_mask}; unsigned NF = Op->getNumValues() - 1; assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); MVT XLenVT = Subtarget.getXLenVT(); @@ -10585,7 +10610,19 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, ContainerVT.getScalarSizeInBits(); EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF); - SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); + // Masked: (pointer, mask, vl) + // Non-masked: (pointer, vl) + bool IsMasked = Op.getNumOperands() > 4; + SDValue VL = Op.getOperand(Op.getNumOperands() - 1); + SDValue Mask = + IsMasked ? Op.getOperand(3) : getAllOnesMask(ContainerVT, VL, DL, DAG); + MVT MaskVT = Mask.getSimpleValueType(); + if (MaskVT.isFixedLengthVector()) { + MVT MaskContainerVT = + ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget); + Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); + } + SDValue IntID = DAG.getTargetConstant(VlsegInts[NF - 2], DL, XLenVT); auto *Load = cast(Op); @@ -10595,7 +10632,10 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, IntID, DAG.getUNDEF(VecTupTy), Op.getOperand(2), + Mask, VL, + DAG.getTargetConstant( + RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC, DL, XLenVT), DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)}; SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, @@ -10655,15 +10695,39 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, case Intrinsic::riscv_seg5_store: case Intrinsic::riscv_seg6_store: case Intrinsic::riscv_seg7_store: - case Intrinsic::riscv_seg8_store: { + case Intrinsic::riscv_seg8_store: + case Intrinsic::riscv_seg2_store_mask: + case Intrinsic::riscv_seg3_store_mask: + case Intrinsic::riscv_seg4_store_mask: + case Intrinsic::riscv_seg5_store_mask: + case Intrinsic::riscv_seg6_store_mask: + case Intrinsic::riscv_seg7_store_mask: + case Intrinsic::riscv_seg8_store_mask: { SDLoc DL(Op); static const Intrinsic::ID VssegInts[] = { - Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3, - Intrinsic::riscv_vsseg4, Intrinsic::riscv_vsseg5, - Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7, - Intrinsic::riscv_vsseg8}; - // Operands are (chain, int_id, vec*, ptr, vl) - unsigned NF = Op->getNumOperands() - 4; + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask}; + + bool IsMasked = false; + switch (IntNo) { + case Intrinsic::riscv_seg2_store_mask: + case Intrinsic::riscv_seg3_store_mask: + case Intrinsic::riscv_seg4_store_mask: + case Intrinsic::riscv_seg5_store_mask: + case Intrinsic::riscv_seg6_store_mask: + case Intrinsic::riscv_seg7_store_mask: + case Intrinsic::riscv_seg8_store_mask: + IsMasked = true; + break; + default: + break; + } + + // Non-masked: (chain, int_id, vec*, ptr, vl) 
+ // Masked: (chain, int_id, vec*, ptr, mask, vl) + unsigned NF = Op->getNumOperands() - (IsMasked ? 5 : 4); assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); MVT XLenVT = Subtarget.getXLenVT(); MVT VT = Op->getOperand(2).getSimpleValueType(); @@ -10672,7 +10736,16 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, ContainerVT.getScalarSizeInBits(); EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF); - SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); + SDValue VL = Op.getOperand(Op.getNumOperands() - 1); + SDValue Mask = IsMasked ? Op.getOperand(Op.getNumOperands() - 2) + : getAllOnesMask(ContainerVT, VL, DL, DAG); + MVT MaskVT = Mask.getSimpleValueType(); + if (MaskVT.isFixedLengthVector()) { + MVT MaskContainerVT = + ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget); + Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); + } + SDValue IntID = DAG.getTargetConstant(VssegInts[NF - 2], DL, XLenVT); SDValue Ptr = Op->getOperand(NF + 2); @@ -10691,6 +10764,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, IntID, StoredVal, Ptr, + Mask, VL, DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)}; @@ -23880,15 +23954,20 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { if (N == 1) return true; + using namespace PatternMatch; + // Right now we're only recognizing the simplest pattern. + uint64_t C; + if (match(V, m_CombineOr(m_ConstantInt(C), + m_c_Mul(m_Value(), m_ConstantInt(C)))) && + C && C % N == 0) + return true; + if (isPowerOf2_32(N)) { KnownBits KB = llvm::computeKnownBits(V, DL); return KB.countMinTrailingZeros() >= Log2_32(N); } - using namespace PatternMatch; - // Right now we're only recognizing the simplest pattern. - uint64_t C; - return match(V, m_c_Mul(m_Value(), m_ConstantInt(C))) && C && C % N == 0; + return false; } /// Lower an interleaved vp.load into a vlsegN intrinsic. @@ -23920,7 +23999,7 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { /// TODO: We probably can loosen the dependency on matching extractvalue when /// dealing with factor of 2 (extractvalue is still required for most of other /// factors though). -bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( +bool RISCVTargetLowering::lowerInterleavedVPLoad( VPIntrinsic *Load, Value *Mask, ArrayRef DeinterleaveResults) const { assert(Mask && "Expect a valid mask"); @@ -23929,27 +24008,21 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( const unsigned Factor = DeinterleaveResults.size(); - auto *WideVTy = dyn_cast(Load->getType()); - // TODO: Support fixed vectors. 
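The wide EVL counts scalar elements while a vlsegN/vssegN VL counts whole Factor-element groups, which is why the lowering below insists that the EVL be provably a multiple of the factor (a constant, a multiply by a suitable constant, or enough known trailing zero bits) before dividing it. A standalone sketch of the underlying arithmetic, for illustration only; the real check works symbolically on IR values via isMultipleOfN, not on runtime numbers, and the function name here is a placeholder:

// Illustrative only: converting a wide element count into a segment VL is
// safe only when it divides evenly; otherwise a partial trailing group
// would be silently dropped.
#include <cstdint>
#include <optional>

std::optional<uint64_t> segmentVL(uint64_t WideEVL, unsigned Factor) {
  if (WideEVL % Factor != 0)
    return std::nullopt; // bail out: not provably divisible
  return WideEVL / Factor;
}
// segmentVL(12, 3) == 4, matching the "vsetivli zero, 4" in vpload_factor3;
// segmentVL(10, 3) has no value, matching the invalid_vp_evl negative test,
// which falls back to the generic shuffle lowering.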
- if (!WideVTy) + auto *VTy = dyn_cast(DeinterleaveResults[0]->getType()); + if (!VTy) return false; - unsigned WideNumElements = WideVTy->getElementCount().getKnownMinValue(); - assert(WideNumElements % Factor == 0 && - "ElementCount of a wide load must be divisible by interleave factor"); - auto *VTy = - VectorType::get(WideVTy->getScalarType(), WideNumElements / Factor, - WideVTy->isScalableTy()); auto &DL = Load->getModule()->getDataLayout(); Align Alignment = Load->getParamAlign(0).value_or( - DL.getABITypeAlign(WideVTy->getElementType())); + DL.getABITypeAlign(VTy->getElementType())); if (!isLegalInterleavedAccessType( VTy, Factor, Alignment, Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL)) return false; IRBuilder<> Builder(Load); - Value *WideEVL = Load->getArgOperand(2); + + Value *WideEVL = Load->getVectorLengthParam(); // Conservatively check if EVL is a multiple of factor, otherwise some // (trailing) elements might be lost after the transformation. if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) @@ -23960,49 +24033,64 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), XLenTy); - static const Intrinsic::ID IntrMaskIds[] = { - Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, - Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, - Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, - Intrinsic::riscv_vlseg8_mask, - }; + Value *Return = nullptr; + if (auto *FVTy = dyn_cast(VTy)) { + static const Intrinsic::ID FixedMaskedVlsegIntrIds[] = { + Intrinsic::riscv_seg2_load_mask, Intrinsic::riscv_seg3_load_mask, + Intrinsic::riscv_seg4_load_mask, Intrinsic::riscv_seg5_load_mask, + Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask, + Intrinsic::riscv_seg8_load_mask}; + + Return = Builder.CreateIntrinsic(FixedMaskedVlsegIntrIds[Factor - 2], + {FVTy, XLenTy}, + {Load->getArgOperand(0), Mask, EVL}); + } else { + static const Intrinsic::ID IntrMaskIds[] = { + Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, + Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, + Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, + Intrinsic::riscv_vlseg8_mask, + }; - unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); - unsigned NumElts = VTy->getElementCount().getKnownMinValue(); - Type *VecTupTy = TargetExtType::get( - Load->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), - NumElts * SEW / 8), - Factor); + unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); + unsigned NumElts = VTy->getElementCount().getKnownMinValue(); + Type *VecTupTy = TargetExtType::get( + Load->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), + NumElts * SEW / 8), + Factor); - Value *PoisonVal = PoisonValue::get(VecTupTy); + Value *PoisonVal = PoisonValue::get(VecTupTy); - Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), IntrMaskIds[Factor - 2], - {VecTupTy, Mask->getType(), EVL->getType()}); + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( + Load->getModule(), IntrMaskIds[Factor - 2], + {VecTupTy, Mask->getType(), EVL->getType()}); - Value *Operands[] = {PoisonVal, - Load->getArgOperand(0), - Mask, - EVL, - ConstantInt::get(XLenTy, RISCVVType::TAIL_AGNOSTIC | - RISCVVType::MASK_AGNOSTIC), - ConstantInt::get(XLenTy, Log2_64(SEW))}; + Value *Operands[] = { + PoisonVal, + Load->getArgOperand(0), + 
Mask, + EVL, + ConstantInt::get(XLenTy, + RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC), + ConstantInt::get(XLenTy, Log2_64(SEW))}; - CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); + CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); - SmallVector AggrTypes{Factor, VTy}; - Value *Return = - PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); - Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy}); - for (unsigned i = 0; i < Factor; ++i) { - Value *VecExtract = - Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)}); - Return = Builder.CreateInsertValue(Return, VecExtract, i); + SmallVector AggrTypes{Factor, VTy}; + Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); + Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( + Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy}); + for (unsigned i = 0; i < Factor; ++i) { + Value *VecExtract = + Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)}); + Return = Builder.CreateInsertValue(Return, VecExtract, i); + } } for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) { + if (!DIO) + continue; // We have to create a brand new ExtractValue to replace each // of these old ExtractValue instructions. Value *NewEV = @@ -24033,7 +24121,7 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( /// %load2, ptr %ptr, /// %mask, /// i64 %rvl) -bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( +bool RISCVTargetLowering::lowerInterleavedVPStore( VPIntrinsic *Store, Value *Mask, ArrayRef InterleaveOperands) const { assert(Mask && "Expect a valid mask"); @@ -24042,8 +24130,7 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( const unsigned Factor = InterleaveOperands.size(); - auto *VTy = dyn_cast(InterleaveOperands[0]->getType()); - // TODO: Support fixed vectors. 
+ auto *VTy = dyn_cast(InterleaveOperands[0]->getType()); if (!VTy) return false; @@ -24067,6 +24154,20 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), XLenTy); + if (auto *FVTy = dyn_cast(VTy)) { + static const Intrinsic::ID FixedMaskedVssegIntrIds[] = { + Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, + Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, + Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, + Intrinsic::riscv_seg8_store_mask}; + + SmallVector Operands(InterleaveOperands); + Operands.append({Store->getArgOperand(1), Mask, EVL}); + Builder.CreateIntrinsic(FixedMaskedVssegIntrIds[Factor - 2], {FVTy, XLenTy}, + Operands); + return true; + } + static const Intrinsic::ID IntrMaskIds[] = { Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index ba24a0c324f51..7806724b263b3 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -931,13 +931,11 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleaveIntrinsicToStore( StoreInst *SI, ArrayRef InterleaveValues) const override; - bool lowerDeinterleavedIntrinsicToVPLoad( - VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveRes) const override; + bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask, + ArrayRef DeinterleaveRes) const override; - bool lowerInterleavedIntrinsicToVPStore( - VPIntrinsic *Store, Value *Mask, - ArrayRef InterleaveOps) const override; + bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask, + ArrayRef InterleaveOps) const override; bool supportKCFIBundles() const override { return true; } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index f6bdd45330384..8ac4c7447c7d4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -176,6 +176,241 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res7 } +define {<4 x i32>, <4 x i32>} @vpload_factor2(ptr %ptr) { +; CHECK-LABEL: vpload_factor2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg2e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 8) + %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> + %v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + + +define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3(ptr %ptr) { +; CHECK-LABEL: vpload_factor3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg3e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x 
i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +; We only extract some of the fields. +define {<4 x i32>, <4 x i32>} @vpload_factor3_partial(ptr %ptr) { +; CHECK-LABEL: vpload_factor3_partial: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg3e32.v v7, (a0) +; CHECK-NEXT: vmv1r.v v8, v7 +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} poison, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v2, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + +; Load a larger vector but only deinterleave a subset of the elements. +define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_v16i32(ptr %ptr) { +; CHECK-LABEL: vpload_factor3_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg3e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <16 x i32> @llvm.vp.load.v16i32.p0(ptr %ptr, <16 x i1> , i32 12) + %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +; Make sure the mask is propagated. +define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask(ptr %ptr) { +; CHECK-LABEL: vpload_factor3_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 10 +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +; Poison/undef in the shuffle mask shouldn't affect anything. 
+define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_poison_shufflemask(ptr %ptr) { +; CHECK-LABEL: vpload_factor3_poison_shufflemask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 10 +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) { +; CHECK-LABEL: vpload_factor4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg4e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <16 x i32> @llvm.vp.load.v16i32.p0(ptr %ptr, <16 x i1> splat (i1 true), i32 16) + %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3 + ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3 +} + +; TODO: Add more tests for vp.load/store + (de)interleave intrinsics with fixed vectors. 
+define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vpload_factor4_intrinsics(ptr %ptr) { +; CHECK-LABEL: vpload_factor4_intrinsics: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vlseg4e32.v v8, (a0) +; CHECK-NEXT: ret + %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 8) + %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load) + %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0 + %d0.1 = extractvalue { <4 x i32>, <4 x i32> } %d0, 1 + %d1 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.0) + %t0 = extractvalue { <2 x i32>, <2 x i32> } %d1, 0 + %t2 = extractvalue { <2 x i32>, <2 x i32> } %d1, 1 + %d2 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.1) + %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0 + %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1 + + %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> %t0, 0 + %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 + %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2 + %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3 + ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor5(ptr %ptr) { +; CHECK-LABEL: vpload_factor5: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg5e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <20 x i32> @llvm.vp.load.v20i32.p0(ptr %ptr, <20 x i1> splat (i1 true), i32 20) + %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v1 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v2 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v3 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v4 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3 + %res4 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3, <4 x i32> %v4, 4 + ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res4 +} + +define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vpload_factor6(ptr %ptr) { +; CHECK-LABEL: vpload_factor6: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vlseg6e16.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i16> @llvm.vp.load.v12i16.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12) + %v0 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v1 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v2 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v3 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v4 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v5 = shufflevector <12 x i16> 
%interleaved.vec, <12 x i16> poison, <2 x i32> + %res0 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} undef, <2 x i16> %v0, 0 + %res1 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res0, <2 x i16> %v1, 1 + %res2 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res1, <2 x i16> %v2, 2 + %res3 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res2, <2 x i16> %v3, 3 + %res4 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res3, <2 x i16> %v4, 4 + %res5 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res4, <2 x i16> %v5, 5 + ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res5 +} + +define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vpload_factor7(ptr %ptr) { +; CHECK-LABEL: vpload_factor7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vlseg7e16.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <14 x i16> @llvm.vp.load.v14i16.p0(ptr %ptr, <14 x i1> splat (i1 true), i32 14) + %v0 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v1 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v2 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v3 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v4 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v5 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v6 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %res0 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} undef, <2 x i16> %v0, 0 + %res1 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res0, <2 x i16> %v1, 1 + %res2 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res1, <2 x i16> %v2, 2 + %res3 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res2, <2 x i16> %v3, 3 + %res4 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res3, <2 x i16> %v4, 4 + %res5 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res4, <2 x i16> %v5, 5 + %res6 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res5, <2 x i16> %v6, 6 + ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res6 +} + +define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vpload_factor8(ptr %ptr) { +; CHECK-LABEL: vpload_factor8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vlseg8e16.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <16 x i16> @llvm.vp.load.v16i16.p0(ptr %ptr, <16 x i1> splat (i1 true), i32 16) + %v0 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v1 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v2 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v3 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v4 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v5 = shufflevector <16 x i16> 
%interleaved.vec, <16 x i16> poison, <2 x i32> + %v6 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v7 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %res0 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} undef, <2 x i16> %v0, 0 + %res1 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res0, <2 x i16> %v1, 1 + %res2 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res1, <2 x i16> %v2, 2 + %res3 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res2, <2 x i16> %v3, 3 + %res4 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res3, <2 x i16> %v4, 4 + %res5 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res4, <2 x i16> %v5, 5 + %res6 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res5, <2 x i16> %v6, 6 + %res7 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res6, <2 x i16> %v7, 7 + ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res7 +} + ; LMUL * NF is > 8 here and so shouldn't be lowered to a vlseg define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_factor6_too_big(ptr %ptr) { ; RV32-LABEL: load_factor6_too_big: @@ -192,8 +427,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a2, 32 ; RV32-NEXT: lui a3, 12 ; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI8_0) -; RV32-NEXT: addi a7, a7, %lo(.LCPI8_0) +; RV32-NEXT: lui a7, %hi(.LCPI20_0) +; RV32-NEXT: addi a7, a7, %lo(.LCPI20_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a5) ; RV32-NEXT: vmv.s.x v0, a3 @@ -278,12 +513,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: lui a7, 49164 -; RV32-NEXT: lui a1, %hi(.LCPI8_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_1) +; RV32-NEXT: lui a1, %hi(.LCPI20_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI20_1) ; RV32-NEXT: lui t2, 3 ; RV32-NEXT: lui t1, 196656 -; RV32-NEXT: lui a4, %hi(.LCPI8_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI8_3) +; RV32-NEXT: lui a4, %hi(.LCPI20_3) +; RV32-NEXT: addi a4, a4, %lo(.LCPI20_3) ; RV32-NEXT: lui t0, 786624 ; RV32-NEXT: li a5, 48 ; RV32-NEXT: lui a6, 768 @@ -462,8 +697,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v24, v8, v2 -; RV32-NEXT: lui a1, %hi(.LCPI8_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_2) +; RV32-NEXT: lui a1, %hi(.LCPI20_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI20_2) ; RV32-NEXT: lui a3, 3073 ; RV32-NEXT: addi a3, a3, -1024 ; RV32-NEXT: vmv.s.x v0, a3 @@ -527,16 +762,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v28, v8, v3 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v28, v24 -; RV32-NEXT: lui a1, %hi(.LCPI8_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_4) -; RV32-NEXT: lui a2, %hi(.LCPI8_5) -; RV32-NEXT: addi a2, a2, 
%lo(.LCPI8_5) +; RV32-NEXT: lui a1, %hi(.LCPI20_4) +; RV32-NEXT: addi a1, a1, %lo(.LCPI20_4) +; RV32-NEXT: lui a2, %hi(.LCPI20_5) +; RV32-NEXT: addi a2, a2, %lo(.LCPI20_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI8_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_7) +; RV32-NEXT: lui a1, %hi(.LCPI20_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI20_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb @@ -564,14 +799,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v16, v0, v10 -; RV32-NEXT: lui a1, %hi(.LCPI8_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_6) -; RV32-NEXT: lui a2, %hi(.LCPI8_8) -; RV32-NEXT: addi a2, a2, %lo(.LCPI8_8) +; RV32-NEXT: lui a1, %hi(.LCPI20_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI20_6) +; RV32-NEXT: lui a2, %hi(.LCPI20_8) +; RV32-NEXT: addi a2, a2, %lo(.LCPI20_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI8_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_9) +; RV32-NEXT: lui a1, %hi(.LCPI20_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI20_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v6, (a1) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma @@ -658,8 +893,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: li a4, 128 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: vle64.v v8, (a3) -; RV64-NEXT: lui a3, %hi(.LCPI8_0) -; RV64-NEXT: addi a3, a3, %lo(.LCPI8_0) +; RV64-NEXT: lui a3, %hi(.LCPI20_0) +; RV64-NEXT: addi a3, a3, %lo(.LCPI20_0) ; RV64-NEXT: vmv.s.x v0, a4 ; RV64-NEXT: csrr a4, vlenb ; RV64-NEXT: li a5, 61 @@ -847,8 +1082,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t -; RV64-NEXT: lui a2, %hi(.LCPI8_1) -; RV64-NEXT: addi a2, a2, %lo(.LCPI8_1) +; RV64-NEXT: lui a2, %hi(.LCPI20_1) +; RV64-NEXT: addi a2, a2, %lo(.LCPI20_1) ; RV64-NEXT: li a3, 192 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v6, (a2) @@ -882,8 +1117,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgatherei16.vv v24, v16, v6 ; RV64-NEXT: addi a2, sp, 16 ; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: lui a2, %hi(.LCPI8_2) -; RV64-NEXT: addi a2, a2, %lo(.LCPI8_2) +; RV64-NEXT: lui a2, %hi(.LCPI20_2) +; RV64-NEXT: addi a2, a2, %lo(.LCPI20_2) ; RV64-NEXT: li a3, 1040 ; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: addi a1, a1, -2016 @@ -967,12 +1202,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI8_3) -; RV64-NEXT: addi a1, a1, %lo(.LCPI8_3) +; RV64-NEXT: lui a1, %hi(.LCPI20_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI20_3) ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v20, (a1) -; RV64-NEXT: lui a1, %hi(.LCPI8_4) -; RV64-NEXT: addi a1, a1, %lo(.LCPI8_4) +; RV64-NEXT: lui a1, %hi(.LCPI20_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI20_4) ; RV64-NEXT: vle16.v 
v8, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 77 @@ -1023,8 +1258,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vrgatherei16.vv v0, v16, v8 -; RV64-NEXT: lui a1, %hi(.LCPI8_5) -; RV64-NEXT: addi a1, a1, %lo(.LCPI8_5) +; RV64-NEXT: lui a1, %hi(.LCPI20_5) +; RV64-NEXT: addi a1, a1, %lo(.LCPI20_5) ; RV64-NEXT: vle16.v v20, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 61 @@ -1196,6 +1431,154 @@ define void @store_factor6(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2 ret void } +define void @store_factor7(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6) { +; CHECK-LABEL: store_factor7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsseg7e16.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> + %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> + %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> + %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> + %s4 = shufflevector <2 x i16> %v6, <2 x i16> poison, <4 x i32> + %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> + %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <14 x i32> + store <14 x i16> %interleaved.vec, ptr %ptr + ret void +} + +define void @store_factor8(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6, <2 x i16> %v7) { +; CHECK-LABEL: store_factor8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsseg8e16.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> + %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> + %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> + %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> + %s4 = shufflevector <2 x i16> %v6, <2 x i16> %v7, <4 x i32> + %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> + %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <16 x i32> + store <16 x i16> %interleaved.vec, ptr %ptr + ret void +} + +define void @vpstore_factor2(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1) { +; CHECK-LABEL: vpstore_factor2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg2e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + tail call void @llvm.vp.store.v8i32.p0(<8 x i32> %interleaved.vec, ptr %ptr, <8 x i1> splat (i1 true), i32 8) + ret void +} + +define void @vpstore_factor3(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { +; CHECK-LABEL: vpstore_factor3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg3e32.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> + tail call void @llvm.vp.store.v12i32.p0(<12 x i32> %interleaved.vec, ptr %ptr, <12 x i1> splat (i1 true), i32 12) + ret void +} + +define void @vpstore_factor3_mask(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { +; CHECK-LABEL: vpstore_factor3_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 5 +; 
CHECK-NEXT: vsseg3e32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> + tail call void @llvm.vp.store.v12i32.p0(<12 x i32> %interleaved.vec, ptr %ptr, <12 x i1> , i32 12) + ret void +} + +define void @vpstore_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { +; CHECK-LABEL: vpstore_factor4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> + tail call void @llvm.vp.store.v16i32.p0(<16 x i32> %interleaved.vec, ptr %ptr, <16 x i1> splat (i1 true), i32 16) + ret void +} + +define void @vpstore_factor5(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) { +; CHECK-LABEL: vpstore_factor5: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg5e32.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> + %s2 = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> + %s3 = shufflevector <4 x i32> %v4, <4 x i32> poison, <16 x i32> + %interleaved.vec = shufflevector <16 x i32> %s2, <16 x i32> %s3, <20 x i32> + tail call void @llvm.vp.store.v20i32.p0(<20 x i32> %interleaved.vec, ptr %ptr, <20 x i1> splat (i1 true), i32 20) + ret void +} + +define void @vpstore_factor6(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5) { +; CHECK-LABEL: vpstore_factor6: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsseg6e16.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> + %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> + %s2 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> + %s3 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <8 x i32> + %interleaved.vec = shufflevector <8 x i16> %s2, <8 x i16> %s3, <12 x i32> + tail call void @llvm.vp.store.v12i16.p0(<12 x i16> %interleaved.vec, ptr %ptr, <12 x i1> splat (i1 true), i32 12) + ret void +} + +define void @vpstore_factor7(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6) { +; CHECK-LABEL: vpstore_factor7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsseg7e16.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> + %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> + %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> + %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> + %s4 = shufflevector <2 x i16> %v6, <2 x i16> poison, <4 x i32> + %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> + %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <14 x i32> + tail call void @llvm.vp.store.v14i16.p0(<14 x i16> %interleaved.vec, ptr %ptr, <14 x i1> splat (i1 true), i32 14) + ret void +} + +define void @vpstore_factor8(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6, <2 x i16> %v7) { +; CHECK-LABEL: vpstore_factor8: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsseg8e16.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> + %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> + %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> + %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> + %s4 = shufflevector <2 x i16> %v6, <2 x i16> %v7, <4 x i32> + %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> + %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <16 x i32> + tail call void @llvm.vp.store.v16i16.p0(<16 x i16> %interleaved.vec, ptr %ptr, <16 x i1> splat (i1 true), i32 16) + ret void +} define <4 x i32> @load_factor2_one_active(ptr %ptr) { ; CHECK-LABEL: load_factor2_one_active: @@ -1368,3 +1751,157 @@ define void @store_factor4_one_active_slidedown(ptr %ptr, <4 x i32> %v) { store <16 x i32> %v0, ptr %ptr ret void } + +; Negative tests + +define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { +; RV32-LABEL: invalid_vp_mask: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 73 +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv.s.x v11, a1 +; RV32-NEXT: lui a1, 1 +; RV32-NEXT: vmv.v.i v10, 8 +; RV32-NEXT: addi a1, a1, -43 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 146 +; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0), v0.t +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vmv.s.x v20, a1 +; RV32-NEXT: lui a1, %hi(.LCPI49_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI49_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle16.v v21, (a1) +; RV32-NEXT: vcompress.vm v8, v12, v11 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vcompress.vm v14, v12, v20 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v21 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: invalid_vp_mask: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 73 +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv.s.x v11, a1 +; RV64-NEXT: li a1, 146 +; RV64-NEXT: vmv.s.x v20, a1 +; RV64-NEXT: lui a1, 1 +; RV64-NEXT: vmv.v.i v10, 8 +; RV64-NEXT: addi a1, a1, -43 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: li a1, 36 +; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0), v0.t +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v11 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vcompress.vm v14, v12, v20 +; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) + %v0 = 
shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { +; RV32-LABEL: invalid_vp_evl: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 10, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0) +; RV32-NEXT: li a0, 73 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v0, 8 +; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: li a0, 146 +; RV32-NEXT: vmv.s.x v11, a0 +; RV32-NEXT: lui a0, %hi(.LCPI50_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI50_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle16.v v20, (a0) +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vcompress.vm v8, v12, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vcompress.vm v14, v12, v11 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v20 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: invalid_vp_evl: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 10, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0) +; RV64-NEXT: li a0, 73 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 8 +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: li a0, 146 +; RV64-NEXT: vmv.s.x v11, a0 +; RV64-NEXT: li a0, 36 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vcompress.vm v14, v12, v11 +; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a0 +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 10) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index d6e1af59e6341..d0f35aa8b85e9 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -616,59 +616,6 @@ define void @not_balanced_store_tree( %v0, ret void } -; We only support scalable vectors for now. -define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %ptr, i32 %evl) { -; RV32-LABEL: not_scalable_vectors: -; RV32: # %bb.0: -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wx v12, v8, a0 -; RV32-NEXT: vnsrl.wi v11, v8, 0 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vnsrl.wx v10, v11, a0 -; RV32-NEXT: vnsrl.wi v8, v11, 0 -; RV32-NEXT: vnsrl.wx v11, v12, a0 -; RV32-NEXT: vnsrl.wi v9, v12, 0 -; RV32-NEXT: ret -; -; RV64-LABEL: not_scalable_vectors: -; RV64: # %bb.0: -; RV64-NEXT: slli a1, a1, 34 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wx v12, v8, a0 -; RV64-NEXT: vnsrl.wi v11, v8, 0 -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vnsrl.wx v10, v11, a0 -; RV64-NEXT: vnsrl.wi v8, v11, 0 -; RV64-NEXT: vnsrl.wx v11, v12, a0 -; RV64-NEXT: vnsrl.wi v9, v12, 0 -; RV64-NEXT: ret - %rvl = mul i32 %evl, 4 - %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %rvl) - %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load) - %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0 - %d0.1 = extractvalue { <4 x i32>, <4 x i32> } %d0, 1 - %d1 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.0) - %t0 = extractvalue { <2 x i32>, <2 x i32> } %d1, 0 - %t2 = extractvalue { <2 x i32>, <2 x i32> } %d1, 1 - %d2 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.1) - %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0 - %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1 - - %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> %t0, 0 - %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 - %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2 - %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3 - ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3 -} - define {, } @not_same_mask( %mask0, %mask1, ptr %ptr, i32 %evl) { ; RV32-LABEL: not_same_mask: ; RV32: # %bb.0: