diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 31aec77db63c1..d36173211e7a8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -727,6 +727,31 @@ static SmallBitVector isUndefVector(const Value *V,
   return Res;
 }
 
+struct ExtractFromVector {
+  Value *IndexOperand;
+  Value *VectorOperand;
+
+  std::optional<unsigned> getConstantIndex() const {
+    if (auto *CI = dyn_cast<ConstantInt>(IndexOperand))
+      return CI->getZExtValue();
+    return {};
+  }
+
+  VectorType *getVectorOperandType() const {
+    return cast<VectorType>(VectorOperand->getType());
+  }
+};
+
+/// Match ExtractElementInst or Intrinsic::vector_extract.
+static std::optional<ExtractFromVector> matchExtractFromVector(Value *V) {
+  if (auto *EI = dyn_cast<ExtractElementInst>(V))
+    return ExtractFromVector{EI->getIndexOperand(), EI->getVectorOperand()};
+  if (auto *IntrI = dyn_cast<IntrinsicInst>(V);
+      IntrI && IntrI->getIntrinsicID() == Intrinsic::vector_extract)
+    return ExtractFromVector{IntrI->getOperand(1), IntrI->getOperand(0)};
+  return {};
+}
+
 /// Checks if the vector of instructions can be represented as a shuffle, like:
 /// %x0 = extractelement <4 x i8> %x, i32 0
 /// %x3 = extractelement <4 x i8> %x, i32 3
@@ -752,42 +777,47 @@ static SmallBitVector isUndefVector(const Value *V,
 static std::optional<TargetTransformInfo::ShuffleKind>
 isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
                      AssumptionCache *AC) {
-  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
-  if (It == VL.end())
-    return std::nullopt;
-  unsigned Size =
+  unsigned ShuffleSrcSize =
       std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
-        auto *EI = dyn_cast<ExtractElementInst>(V);
-        if (!EI)
+        std::optional<ExtractFromVector> EFV = matchExtractFromVector(V);
+        if (!EFV)
           return S;
-        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
+        auto *VTy = dyn_cast<FixedVectorType>(EFV->getVectorOperandType());
         if (!VTy)
           return S;
         return std::max(S, VTy->getNumElements());
       });
+  if (ShuffleSrcSize == 0)
+    return std::nullopt;
   Value *Vec1 = nullptr;
   Value *Vec2 = nullptr;
   bool HasNonUndefVec = any_of(VL, [&](Value *V) {
-    auto *EE = dyn_cast<ExtractElementInst>(V);
-    if (!EE)
+    std::optional<ExtractFromVector> EFV = matchExtractFromVector(V);
+    if (!EFV)
       return false;
-    Value *Vec = EE->getVectorOperand();
+    Value *Vec = EFV->VectorOperand;
     if (isa<UndefValue>(Vec))
       return false;
     return isGuaranteedNotToBePoison(Vec, AC);
   });
   enum ShuffleMode { Unknown, Select, Permute };
   ShuffleMode CommonShuffleMode = Unknown;
+  Type *ExtractedTy = VL[0]->getType();
+  unsigned EltsPerExtractInst = getNumElements(ExtractedTy);
+
+  // Note: Mask is for values of VL, which can be of vector type.
   Mask.assign(VL.size(), PoisonMaskElem);
+
   for (unsigned I = 0, E = VL.size(); I < E; ++I) {
     // Undef can be represented as an undef element in a vector.
     if (isa<UndefValue>(VL[I]))
       continue;
-    auto *EI = cast<ExtractElementInst>(VL[I]);
-    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
+    std::optional<ExtractFromVector> EFV = matchExtractFromVector(VL[I]);
+    assert(EFV.has_value() && "Unexpected shuffle source.");
+    if (isa<ScalableVectorType>(EFV->getVectorOperandType()))
       return std::nullopt;
-    auto *Vec = EI->getVectorOperand();
+    auto *Vec = EFV->VectorOperand;
     // We can extractelement from undef or poison vector.
     if (isUndefVector(Vec).all())
       continue;
@@ -795,16 +825,15 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
     if (isa<UndefValue>(Vec)) {
      Mask[I] = I;
    } else {
-      if (isa<UndefValue>(EI->getIndexOperand()))
+      if (isa<UndefValue>(EFV->IndexOperand))
         continue;
-      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
-      if (!Idx)
+      std::optional<unsigned> Idx = EFV->getConstantIndex();
+      if (!Idx || *Idx % EltsPerExtractInst != 0)
         return std::nullopt;
       // Undefined behavior if Idx is negative or >= Size.
-      if (Idx->getValue().uge(Size))
+      if (*Idx >= ShuffleSrcSize)
         continue;
-      unsigned IntIdx = Idx->getValue().getZExtValue();
-      Mask[I] = IntIdx;
+      Mask[I] = *Idx / EltsPerExtractInst;
     }
     if (isUndefVector(Vec).all() && HasNonUndefVec)
       continue;
@@ -814,7 +843,7 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
       Vec1 = Vec;
     } else if (!Vec2 || Vec2 == Vec) {
       Vec2 = Vec;
-      Mask[I] += Size;
+      Mask[I] += (ShuffleSrcSize / EltsPerExtractInst);
     } else {
       return std::nullopt;
     }
@@ -822,7 +851,7 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
       continue;
     // If the extract index is not the same as the operation number, it is a
     // permutation.
-    if (Mask[I] % Size != I) {
+    if ((Mask[I] * EltsPerExtractInst) % ShuffleSrcSize != I) {
       CommonShuffleMode = Permute;
       continue;
     }
@@ -12068,22 +12097,30 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
   /// Compute the cost of creating a vector containing the extracted values
   /// from \p VL.
   InstructionCost
-  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
+  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> MaskRef,
                      ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                      unsigned NumParts) {
     assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
     unsigned NumElts =
         std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
-          auto *EE = dyn_cast<ExtractElementInst>(V);
-          if (!EE)
+          std::optional<ExtractFromVector> EFV = matchExtractFromVector(V);
+          if (!EFV)
             return Sz;
-          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
+          auto *VecTy = dyn_cast<FixedVectorType>(EFV->getVectorOperandType());
           if (!VecTy)
             return Sz;
           return std::max(Sz, VecTy->getNumElements());
         });
     // FIXME: this must be moved to TTI for better estimation.
-    unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
+    unsigned EltsPerExtractInst = getNumElements(VL[0]->getType());
+    unsigned EltsPerVector =
+        getPartNumElems(VL.size() * EltsPerExtractInst, NumParts);
+
+    // Make sure we get a proper shuffle mask if the elements being extracted
+    // are subvectors.
+    SmallVector<int> Mask(MaskRef.begin(), MaskRef.end());
+    transformScalarShuffleIndiciesToVector(EltsPerExtractInst, Mask);
+
     auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                         SmallVectorImpl<unsigned> &Indices,
                                         SmallVectorImpl<unsigned> &SubVecSizes)
@@ -12150,7 +12187,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     for (unsigned Part : seq<unsigned>(NumParts)) {
       if (!ShuffleKinds[Part])
         continue;
-      ArrayRef<int> MaskSlice = Mask.slice(
+      ArrayRef<int> MaskSlice = ArrayRef(Mask).slice(
           Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
       SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
       copy(MaskSlice, SubMask.begin());
@@ -12555,6 +12592,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         });
     SmallPtrSet<Value *, 4> UniqueBases;
     unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
+    Type *ExtractedTy = VL[0]->getType();
+    const unsigned EltsPerExtractInst = getNumElements(ExtractedTy);
     SmallDenseMap<Value *, APInt> VectorOpsToExtracts;
     for (unsigned Part : seq<unsigned>(NumParts)) {
       unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
@@ -12571,13 +12610,14 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         // vectorized tree.
         // Also, avoid adjusting the cost for extractelements with multiple uses
         // in different graph entries.
-        auto *EE = cast<ExtractElementInst>(V);
-        VecBase = EE->getVectorOperand();
+        ExtractFromVector EFV = matchExtractFromVector(V).value();
+        Instruction *ExtractInst = cast<Instruction>(V);
+        VecBase = EFV.VectorOperand;
         UniqueBases.insert(VecBase);
         ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
         if (!CheckedExtracts.insert(V).second ||
             !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
-            any_of(EE->users(),
+            any_of(V->users(),
                    [&](User *U) {
                      return isa<GetElementPtrInst>(U) &&
                             !R.areAllUsersVectorized(cast<Instruction>(U),
                                                      &VectorizedVals);
@@ -12585,39 +12625,45 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                    }) ||
             (!VEs.empty() && !is_contained(VEs, E)))
           continue;
-        std::optional<unsigned> EEIdx = getExtractIndex(EE);
+        std::optional<unsigned> EEIdx = EFV.getConstantIndex();
         if (!EEIdx)
           continue;
         unsigned Idx = *EEIdx;
         // Take credit for instruction that will become dead.
-        if (EE->hasOneUse() || !PrevNodeFound) {
-          Instruction *Ext = EE->user_back();
-          if (isa<SExtInst, ZExtInst>(Ext) &&
+        if (V->hasOneUse() || !PrevNodeFound) {
+          Instruction *Ext = ExtractInst->user_back();
+          if (isa<SExtInst, ZExtInst>(Ext) && !V->getType()->isVectorTy() &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to calculate the cost of
            // extractelement/ext pair.
            Cost -= TTI.getExtractWithExtendCost(
-                Ext->getOpcode(), Ext->getType(), EE->getVectorOperandType(),
+                Ext->getOpcode(), Ext->getType(), EFV.getVectorOperandType(),
                Idx, CostKind);
            // Add back the cost of s|zext which is subtracted separately.
            Cost += TTI.getCastInstrCost(
-                Ext->getOpcode(), Ext->getType(), EE->getType(),
+                Ext->getOpcode(), Ext->getType(), V->getType(),
                TTI::getCastContextHint(Ext), CostKind, Ext);
            continue;
          }
        }
+
+        // Note: If extracting a subvector, EltsPerExtractInst will be >1, and
+        // the bit set in DemandedElts will correspond to a sub-vector index.
        APInt &DemandedElts =
            VectorOpsToExtracts
                .try_emplace(VecBase,
-                             APInt::getZero(getNumElements(VecBase->getType())))
+                             APInt::getZero(getNumElements(VecBase->getType()) /
+                                            EltsPerExtractInst))
                .first->getSecond();
-        DemandedElts.setBit(Idx);
+        DemandedElts.setBit(Idx / EltsPerExtractInst);
      }
    }
+
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
-      Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
-                                           DemandedElts, /*Insert=*/false,
-                                           /*Extract=*/true, CostKind);
+      Cost -= getScalarizationOverhead(TTI, VL[0]->getType(),
+                                       cast<VectorType>(Vec->getType()),
+                                       DemandedElts, /*Insert=*/false,
+                                       /*Extract=*/true, CostKind);
    // Check that gather of extractelements can be represented as just a
    // shuffle of a single/two vectors the scalars are extracted from.
    // Found the bunch of extractelement instructions that must be gathered
@@ -12704,11 +12750,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
-                   auto *EI = cast<ExtractElementInst>(
-                       cast<const TreeEntry *>(InVectors.front())
-                           ->getOrdered(P.index()));
-                   return EI->getVectorOperand() == V1 ||
-                          EI->getVectorOperand() == V2;
+                   ExtractFromVector EFV =
+                       matchExtractFromVector(
+                           cast<const TreeEntry *>(InVectors.front())
+                               ->getOrdered(P.index()))
+                           .value();
+                   return EFV.VectorOperand == V1 || EFV.VectorOperand == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
@@ -12734,8 +12781,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                     isa<UndefValue>(Scalar);
            if (isa<Instruction>(V1))
              return true;
-            auto *EI = cast<ExtractElementInst>(Scalar);
-            return EI->getVectorOperand() == V1;
+            ExtractFromVector EFV =
+                matchExtractFromVector(Scalar).value();
+            return EFV.VectorOperand == V1;
          }) &&
          "Expected only tree entry for extractelement vectors.");
    return;
@@ -15117,19 +15165,20 @@ BoUpSLP::tryToGatherSingleRegisterExtractElements(
     MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
   // Scan list of gathered scalars for extractelements that can be represented
   // as shuffles.
+  // This maps the vectors we gather from to all the lanes they populate in VL.
   MapVector<Value *, SmallVector<int>> VectorOpToIdx;
   SmallVector<int> UndefVectorExtracts;
   for (int I = 0, E = VL.size(); I < E; ++I) {
-    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
-    if (!EI) {
+    std::optional<ExtractFromVector> Match = matchExtractFromVector(VL[I]);
+    if (!Match) {
       if (isa<UndefValue>(VL[I]))
         UndefVectorExtracts.push_back(I);
       continue;
     }
-    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
-    if (!VecTy || !isa<ConstantInt>(EI->getIndexOperand()))
+    auto *VecTy = dyn_cast<FixedVectorType>(Match->getVectorOperandType());
+    if (!VecTy || !isa<ConstantInt>(Match->IndexOperand))
       continue;
-    std::optional<unsigned> Idx = getExtractIndex(EI);
+    std::optional<unsigned> Idx = Match->getConstantIndex();
     // Undefined index.
     if (!Idx) {
       UndefVectorExtracts.push_back(I);
@@ -15141,12 +15190,13 @@ BoUpSLP::tryToGatherSingleRegisterExtractElements(
     }
     SmallBitVector ExtractMask(VecTy->getNumElements(), true);
     ExtractMask.reset(*Idx);
-    if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
+    if (isUndefVector(Match->VectorOperand, ExtractMask).all()) {
       UndefVectorExtracts.push_back(I);
       continue;
     }
-    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
+    VectorOpToIdx[Match->VectorOperand].push_back(I);
   }
+
   // Sort the vector operands by the maximum number of uses in extractelements.
   SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
       VectorOpToIdx.takeVector();
@@ -15160,12 +15210,12 @@ BoUpSLP::tryToGatherSingleRegisterExtractElements(
   if (!Vectors.empty()) {
     SingleMax = Vectors.front().second.size() + UndefSz;
     if (Vectors.size() > 1) {
-      auto *ItNext = std::next(Vectors.begin());
-      PairMax = SingleMax + ItNext->second.size();
+      PairMax = SingleMax + Vectors[1].second.size();
     }
   }
   if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
     return std::nullopt;
+
   // Check if better to perform a shuffle of 2 vectors or just of a single
   // vector.
   SmallVector<Value *> SavedVL(VL.begin(), VL.end());
@@ -15182,6 +15232,7 @@ BoUpSLP::tryToGatherSingleRegisterExtractElements(
   // Add extracts from undefs too.
   for (int Idx : UndefVectorExtracts)
     std::swap(GatheredExtracts[Idx], VL[Idx]);
+
   // Check that gather of extractelements can be represented as just a
   // shuffle of a single/two vectors the scalars are extracted from.
   std::optional<TTI::ShuffleKind> Res =
@@ -15198,13 +15249,7 @@ BoUpSLP::tryToGatherSingleRegisterExtractElements(
     if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
         isa<UndefValue>(GatheredExtracts[I])) {
       std::swap(VL[I], GatheredExtracts[I]);
-      continue;
     }
-    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
-    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
-        !isa<ConstantInt>(EI->getIndexOperand()) ||
-        is_contained(UndefVectorExtracts, I))
-      continue;
   }
   return Res;
 }
@@ -16556,8 +16601,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
       int Idx = Mask[I];
       if (Idx == PoisonMaskElem)
         continue;
-      auto *EI = cast<ExtractElementInst>(VL[I]);
-      VecBase = EI->getVectorOperand();
+      ExtractFromVector EFV = matchExtractFromVector(VL[I]).value();
+      Instruction *EI = cast<Instruction>(VL[I]);
+      VecBase = EFV.VectorOperand;
       if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecBase); !TEs.empty())
         VecBase = TEs.front()->VectorizedValue;
       assert(VecBase && "Expected vectorized value.");
@@ -16611,7 +16657,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
           if (std::get<1>(D) == PoisonMaskElem)
             return S;
           Value *VecOp =
-              cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
+              matchExtractFromVector(std::get<0>(D))->VectorOperand;
           if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp);
              !TEs.empty())
            VecOp = TEs.front()->VectorizedValue;
@@ -16623,7 +16669,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
     for (const auto [V, I] : VLMask) {
       if (I == PoisonMaskElem)
         continue;
-      Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
+      Value *VecOp = matchExtractFromVector(V)->VectorOperand;
       if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(VecOp); !TEs.empty())
         VecOp = TEs.front()->VectorizedValue;
       assert(VecOp && "Expected vectorized value.");
@@ -17073,7 +17119,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
         if (I == PoisonMaskElem)
           continue;
         if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
-                cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand());
+                matchExtractFromVector(StoredGS[Idx])->VectorOperand);
             !TEs.empty())
           ExtractEntries.append(TEs.begin(), TEs.end());
       }
@@ -17280,8 +17326,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
         continue;
       if (isa<UndefValue>(StoredGS[I]))
         continue;
-      auto *EI = cast<ExtractElementInst>(StoredGS[I]);
-      Value *VecOp = EI->getVectorOperand();
+      Value *VecOp = matchExtractFromVector(StoredGS[I])->VectorOperand;
       if (ArrayRef<TreeEntry *> TEs = getTreeEntries(VecOp);
           !TEs.empty() && TEs.front()->VectorizedValue)
         VecOp = TEs.front()->VectorizedValue;
diff --git a/llvm/test/Transforms/SLPVectorizer/revec-extractvector.ll b/llvm/test/Transforms/SLPVectorizer/revec-extractvector.ll
new file mode 100644
index 0000000000000..423ea26dd1545
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/revec-extractvector.ll
@@ -0,0 +1,293 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=256 %s | FileCheck %s
+
+define i32 @test_32xi8_2parts(ptr %in, ptr %out) {
+; CHECK-LABEL: @test_32xi8_2parts(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i8>, ptr [[IN:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = add <32 x i8> [[TMP0]], [[TMP0]]
+; CHECK-NEXT:    store <32 x i8> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %0 = load <32 x i8>, ptr %in, align 1
+  %1 = call <16 x i8> @llvm.vector.extract.v16i8.v32i8(<32 x i8> %0, i64 0)
+  %2 = call <16 x i8> @llvm.vector.extract.v16i8.v32i8(<32 x i8> %0, i64 16)
+
+  %add.1 = add <16 x i8> %1, %1
+  %add.2 = add <16 x i8> %2, %2
+
+  %out.2 = getelementptr inbounds i8, ptr %out, i64 16
+  store <16 x i8> %add.1, ptr %out, align 16
+  store <16 x i8> %add.2, ptr %out.2, align 16
+  ret i32 0
+}
+
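+; Extract all four quarters of a <32 x i8> register.
+; We expect a single wide load, add and store, with no shuffles.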
+define i32 @test_32xi8_4_parts(ptr %in, ptr %out) {
+; CHECK-LABEL: @test_32xi8_4_parts(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i8>, ptr [[IN:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = add <32 x i8> [[TMP0]], [[TMP0]]
+; CHECK-NEXT:    store <32 x i8> [[TMP1]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %0 = load <32 x i8>, ptr %in, align 1
+  %1 = call <8 x i8> @llvm.vector.extract.v8i8.v32i8(<32 x i8> %0, i64 0)
+  %2 = call <8 x i8> @llvm.vector.extract.v8i8.v32i8(<32 x i8> %0, i64 8)
+  %3 = call <8 x i8> @llvm.vector.extract.v8i8.v32i8(<32 x i8> %0, i64 16)
+  %4 = call <8 x i8> @llvm.vector.extract.v8i8.v32i8(<32 x i8> %0, i64 24)
+
+  %add.1 = add <8 x i8> %1, %1
+  %add.2 = add <8 x i8> %2, %2
+  %add.3 = add <8 x i8> %3, %3
+  %add.4 = add <8 x i8> %4, %4
+
+  store <8 x i8> %add.1, ptr %out, align 8
+  %out.2 = getelementptr inbounds i8, ptr %out, i64 8
+  store <8 x i8> %add.2, ptr %out.2, align 8
+  %out.3 = getelementptr inbounds i8, ptr %out, i64 16
+  store <8 x i8> %add.3, ptr %out.3, align 8
+  %out.4 = getelementptr inbounds i8, ptr %out, i64 24
+  store <8 x i8> %add.4, ptr %out.4, align 8
+  ret i32 0
+}
+
+; Extract the 1st, 2nd and 4th quarters of a <32 x i8> register.
+; The 3rd quarter comes from a %vec argument.
+; We are expecting a wide load followed by a vector.insert.
+define i32 @test_32xi8_4_parts_sparse(ptr %in, ptr %out, <8 x i8> %vec) {
+; CHECK-LABEL: @test_32xi8_4_parts_sparse(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i8>, ptr [[IN:%.*]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i8> [[VEC:%.*]], <8 x i8> poison, <32 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[TMP0]], <32 x i8> [[TMP3]], <32 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <32 x i8> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    store <32 x i8> [[TMP2]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %0 = load <32 x i8>, ptr %in, align 1
+  %1 = call <8 x i8> @llvm.vector.extract.v8i8.v32i8(<32 x i8> %0, i64 0)
+  %2 = call <8 x i8> @llvm.vector.extract.v8i8.v32i8(<32 x i8> %0, i64 8)
+  %4 = call <8 x i8> @llvm.vector.extract.v8i8.v32i8(<32 x i8> %0, i64 24)
+
+  %add.1 = add <8 x i8> %1, %1
+  %add.2 = add <8 x i8> %2, %2
+  %add.3 = add <8 x i8> %vec, %vec
+  %add.4 = add <8 x i8> %4, %4
+
+  store <8 x i8> %add.1, ptr %out, align 8
+  %out.2 = getelementptr inbounds i8, ptr %out, i64 8
+  store <8 x i8> %add.2, ptr %out.2, align 8
+  %out.3 = getelementptr inbounds i8, ptr %out, i64 16
+  store <8 x i8> %add.3, ptr %out.3, align 8
+  %out.4 = getelementptr inbounds i8, ptr %out, i64 24
+  store <8 x i8> %add.4, ptr %out.4, align 8
+  ret i32 0
+}
+
+; Extract the 1st and 3rd quarters of a <32 x i8> register.
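+; The two used quarters should be combined into a single <16 x i8> shuffle,
+; add and store.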
+define i32 @test_32xi8_4_parts_sparse2(ptr %in, ptr %out) {
+; CHECK-LABEL: @test_32xi8_4_parts_sparse2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i8>, ptr [[IN:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[TMP0]], <32 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <16 x i8> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    store <16 x i8> [[TMP2]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %0 = load <32 x i8>, ptr %in, align 1
+  %1 = call <8 x i8> @llvm.vector.extract.v8i8.v32i8(<32 x i8> %0, i64 0)
+  %2 = call <8 x i8> @llvm.vector.extract.v8i8.v32i8(<32 x i8> %0, i64 16)
+
+  %add.1 = add <8 x i8> %1, %1
+  %add.2 = add <8 x i8> %2, %2
+
+  store <8 x i8> %add.1, ptr %out, align 8
+  %out.2 = getelementptr inbounds i8, ptr %out, i64 8
+  store <8 x i8> %add.2, ptr %out.2, align 8
+  ret i32 0
+}
+
+; Vector size is 256-bit, so we should generate 2 <16 x i16> stores.
+define i32 @test_32xi16_4_parts(ptr %in, ptr %out) {
+; CHECK-LABEL: @test_32xi16_4_parts(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i16>, ptr [[IN:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i16> [[TMP0]], <32 x i16> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <16 x i16> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    store <16 x i16> [[TMP2]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    [[OUT_3:%.*]] = getelementptr inbounds i16, ptr [[OUT]], i64 16
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP0]], <32 x i16> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <16 x i16> [[TMP3]], [[TMP3]]
+; CHECK-NEXT:    store <16 x i16> [[TMP4]], ptr [[OUT_3]], align 8
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %0 = load <32 x i16>, ptr %in, align 1
+  %1 = call <8 x i16> @llvm.vector.extract.v8i16.v32i16(<32 x i16> %0, i64 0)
+  %2 = call <8 x i16> @llvm.vector.extract.v8i16.v32i16(<32 x i16> %0, i64 8)
+  %3 = call <8 x i16> @llvm.vector.extract.v8i16.v32i16(<32 x i16> %0, i64 16)
+  %4 = call <8 x i16> @llvm.vector.extract.v8i16.v32i16(<32 x i16> %0, i64 24)
+
+  %add.1 = add <8 x i16> %1, %1
+  %add.2 = add <8 x i16> %2, %2
+  %add.3 = add <8 x i16> %3, %3
+  %add.4 = add <8 x i16> %4, %4
+
+  store <8 x i16> %add.1, ptr %out, align 8
+  %out.2 = getelementptr inbounds i16, ptr %out, i64 8
+  store <8 x i16> %add.2, ptr %out.2, align 8
+  %out.3 = getelementptr inbounds i16, ptr %out, i64 16
+  store <8 x i16> %add.3, ptr %out.3, align 8
+  %out.4 = getelementptr inbounds i16, ptr %out, i64 24
+  store <8 x i16> %add.4, ptr %out.4, align 8
+  ret i32 0
+}
+
+; Extract lo/hi halves from two different source vectors.
+define i32 @test_2x_16xi8(ptr %in0, ptr %in1, ptr %out) {
+; CHECK-LABEL: @test_2x_16xi8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BASE0:%.*]] = load <16 x i8>, ptr [[IN0:%.*]], align 1
+; CHECK-NEXT:    [[BASE1:%.*]] = load <16 x i8>, ptr [[IN1:%.*]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[BASE0]], <16 x i8> [[BASE1]], <32 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <32 x i8> [[TMP0]], [[TMP0]]
+; CHECK-NEXT:    store <32 x i8> [[TMP1]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %base0 = load <16 x i8>, ptr %in0, align 1
+  %1 = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %base0, i64 0)
+  %2 = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %base0, i64 8)
+
+  %base1 = load <16 x i8>, ptr %in1, align 1
+  %3 = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %base1, i64 0)
+  %4 = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %base1, i64 8)
+
+  %add.1 = add <8 x i8> %1, %1
+  %add.2 = add <8 x i8> %2, %2
+  %add.3 = add <8 x i8> %3, %3
+  %add.4 = add <8 x i8> %4, %4
+
+  store <8 x i8> %add.1, ptr %out, align 8
+  %out.2 = getelementptr inbounds i8, ptr %out, i64 8
+  store <8 x i8> %add.2, ptr %out.2, align 8
+  %out.3 = getelementptr inbounds i8, ptr %out, i64 16
+  store <8 x i8> %add.3, ptr %out.3, align 8
+  %out.4 = getelementptr inbounds i8, ptr %out, i64 24
+  store <8 x i8> %add.4, ptr %out.4, align 8
+  ret i32 0
+}
+
+; Extract lo/hi halves from one vector, and hi from another one.
+; The 3rd quarter comes from a function argument.
+; We are expecting a shuffle of two sources, followed by a vector.insert.
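+; The %vec argument has no extract feeding it, so it has to be inserted into
+; the wide vector as a whole subvector.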
+define i32 @test_2x_16xi8_sparse(ptr %in0, ptr %in1, ptr %out, <8 x i8> %vec) {
+; CHECK-LABEL: @test_2x_16xi8_sparse(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BASE1:%.*]] = load <16 x i8>, ptr [[IN1:%.*]], align 1
+; CHECK-NEXT:    [[BASE2:%.*]] = load <16 x i8>, ptr [[IN2:%.*]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[BASE1]], <16 x i8> [[BASE2]], <32 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i8> [[VEC:%.*]], <8 x i8> poison, <32 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i8> [[TMP0]], <32 x i8> [[TMP3]], <32 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <32 x i8> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    store <32 x i8> [[TMP2]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %base0 = load <16 x i8>, ptr %in0, align 1
+  %1 = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %base0, i64 0)
+  %2 = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %base0, i64 8)
+
+  %base1 = load <16 x i8>, ptr %in1, align 1
+  %4 = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %base1, i64 8)
+
+  %add.1 = add <8 x i8> %1, %1
+  %add.2 = add <8 x i8> %2, %2
+  %add.3 = add <8 x i8> %vec, %vec
+  %add.4 = add <8 x i8> %4, %4
+
+  store <8 x i8> %add.1, ptr %out, align 8
+  %out.2 = getelementptr inbounds i8, ptr %out, i64 8
+  store <8 x i8> %add.2, ptr %out.2, align 8
+  %out.3 = getelementptr inbounds i8, ptr %out, i64 16
+  store <8 x i8> %add.3, ptr %out.3, align 8
+  %out.4 = getelementptr inbounds i8, ptr %out, i64 24
+  store <8 x i8> %add.4, ptr %out.4, align 8
+  ret i32 0
+}
+
+; Extract lo/hi halves from four different source vectors.
+; This is not supported, as we are looking for a Gather TreeEntry of
+; vector.extract instructions with a maximum of two unique inputs.
+; There are four here.
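+; The adds still vectorize to <32 x i8>, but the operand is gathered through a
+; chain of shufflevectors and leftover vector.extract calls.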
+define i32 @test_4x_8xi8(ptr %in0, ptr %in1, ptr %in2, ptr %in3, ptr %out) {
+; CHECK-LABEL: @test_4x_8xi8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BASE0:%.*]] = load <8 x i8>, ptr [[IN0:%.*]], align 1
+; CHECK-NEXT:    [[BASE2:%.*]] = load <8 x i8>, ptr [[IN2:%.*]], align 1
+; CHECK-NEXT:    [[BASE4:%.*]] = load <8 x i8>, ptr [[IN4:%.*]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> [[BASE4]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> [[BASE4]], i64 4)
+; CHECK-NEXT:    [[BASE3:%.*]] = load <8 x i8>, ptr [[IN3:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> [[BASE3]], i64 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> [[BASE3]], i64 4)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i8> [[BASE0]], <8 x i8> [[BASE2]], <32 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <32 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <32 x i8> [[TMP4]], <32 x i8> [[TMP5]], <32 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <32 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> [[TMP7]], <32 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <32 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> [[TMP13]], <32 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <32 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <32 x i8> [[TMP10]], <32 x i8> [[TMP11]], <32 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = add <32 x i8> [[TMP8]], [[TMP8]]
+; CHECK-NEXT:    store <32 x i8> [[TMP9]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %base0 = load <8 x i8>, ptr %in0, align 1
+  %1 = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %base0, i64 0)
+  %2 = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %base0, i64 4)
+
+  %base1 = load <8 x i8>, ptr %in1, align 1
+  %3 = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %base1, i64 0)
+  %4 = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %base1, i64 4)
+
+  %base2 = load <8 x i8>, ptr %in2, align 1
+  %5 = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %base2, i64 0)
+  %6 = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %base2, i64 4)
+
+  %base3 = load <8 x i8>, ptr %in3, align 1
+  %7 = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %base3, i64 0)
+  %8 = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %base3, i64 4)
+
+  %add.1 = add <4 x i8> %1, %1
+  %add.2 = add <4 x i8> %2, %2
+  %add.3 = add <4 x i8> %3, %3
+  %add.4 = add <4 x i8> %4, %4
+  %add.5 = add <4 x i8> %5, %5
+  %add.6 = add <4 x i8> %6, %6
+  %add.7 = add <4 x i8> %7, %7
+  %add.8 = add <4 x i8> %8, %8
+
+  store <4 x i8> %add.1, ptr %out, align 8
+  %out.2 = getelementptr inbounds i8, ptr %out, i64 4
+  store <4 x i8> %add.2, ptr %out.2, align 8
+  %out.3 = getelementptr inbounds i8, ptr %out, i64 8
+  store <4 x i8> %add.3, ptr %out.3, align 8
+  %out.4 = getelementptr inbounds i8, ptr %out, i64 12
+  store <4 x i8> %add.4, ptr %out.4, align 8
+  %out.5 = getelementptr inbounds i8, ptr %out, i64 16
+  store <4 x i8> %add.5, ptr %out.5, align 8
+  %out.6 = getelementptr inbounds i8, ptr %out, i64 20
+  store <4 x i8> %add.6, ptr %out.6, align 8
+  %out.7 = getelementptr inbounds i8, ptr %out, i64 24
+  store <4 x i8> %add.7, ptr %out.7, align 8
+  %out.8 = getelementptr inbounds i8, ptr %out, i64 28
+  store <4 x i8> %add.8, ptr %out.8, align 8
+  ret i32 0
+}
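+
+; For reference: on fixed-width vectors, llvm.vector.extract with a constant,
+; subvector-aligned index is the subvector analogue of extractelement, e.g.
+;   %hi = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %v, i64 4)
+; reads lanes 4..7 of %v. Indices that are not a multiple of the extracted
+; width are rejected by the matcher (see the EltsPerExtractInst checks in
+; isFixedVectorShuffle).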