diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index e2dd4976f3906..2a419560be303 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -18,6 +18,7 @@ #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/IR/Module.h" #include "llvm/IR/VFABIDemangler.h" +#include "llvm/IR/VectorUtils.h" #include "llvm/Support/CheckedArithmetic.h" namespace llvm { @@ -127,18 +128,8 @@ namespace Intrinsic { typedef unsigned ID; } -/// A helper function for converting Scalar types to vector types. If -/// the incoming type is void, we return void. If the EC represents a -/// scalar, we return the scalar type. -inline Type *ToVectorTy(Type *Scalar, ElementCount EC) { - if (Scalar->isVoidTy() || Scalar->isMetadataTy() || EC.isScalar()) - return Scalar; - return VectorType::get(Scalar, EC); -} - -inline Type *ToVectorTy(Type *Scalar, unsigned VF) { - return ToVectorTy(Scalar, ElementCount::getFixed(VF)); -} +/// Returns true if `Ty` can be widened by the loop vectorizer. +bool canWidenType(Type *Ty); /// Identify if the intrinsic is trivially vectorizable. /// This method returns true if the intrinsic's argument types are all scalars diff --git a/llvm/include/llvm/IR/VectorUtils.h b/llvm/include/llvm/IR/VectorUtils.h new file mode 100644 index 0000000000000..e8e838d8287c4 --- /dev/null +++ b/llvm/include/llvm/IR/VectorUtils.h @@ -0,0 +1,53 @@ +//===----------- VectorUtils.h - Vector type utility functions -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/DerivedTypes.h" + +namespace llvm { + +/// A helper function for converting Scalar types to vector types. If +/// the incoming type is void, we return void. If the EC represents a +/// scalar, we return the scalar type. +inline Type *ToVectorTy(Type *Scalar, ElementCount EC) { + if (Scalar->isVoidTy() || Scalar->isMetadataTy() || EC.isScalar()) + return Scalar; + return VectorType::get(Scalar, EC); +} + +inline Type *ToVectorTy(Type *Scalar, unsigned VF) { + return ToVectorTy(Scalar, ElementCount::getFixed(VF)); +} + +/// A helper for converting to wider (vector) types. For scalar types, this is +/// equivalent to calling `ToVectorTy`. For struct types, this returns a new +/// struct where each element type has been widened to a vector type. Note: Only +/// unpacked literal struct types are supported. +Type *ToWideTy(Type *Ty, ElementCount EC); + +/// A helper for converting wide types to narrow (non-vector) types. For vector +/// types, this is equivalent to calling .getScalarType(). For struct types, +/// this returns a new struct where each element type has been converted to a +/// scalar type. Note: Only unpacked literal struct types are supported. +Type *ToNarrowTy(Type *Ty); + +/// Returns the types contained in `Ty`. For struct types, it returns the +/// elements, all other types are returned directly. +SmallVector getContainedTypes(Type *Ty); + +/// Returns true if `Ty` is a vector type or a struct of vector types where all +/// vector types share the same VF. +bool isWideTy(Type *Ty); + +/// Returns the vectorization factor for a widened type. 
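+/// For example, both <4 x float> and { <4 x float>, <4 x float> } report a
+/// VF of 4. It is only valid to call this on types for which isWideTy()
+/// returns true.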
+inline ElementCount getWideTypeVF(Type *Ty) {
+  assert(isWideTy(Ty) && "expected widened type!");
+  return cast<VectorType>(getContainedTypes(Ty).front())->getElementCount();
+}
+
+} // namespace llvm
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 6b5251e0ad34e..0e99abd48991e 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -39,6 +39,20 @@ static cl::opt<unsigned> MaxInterleaveGroupFactor(
     cl::desc("Maximum factor for an interleaved access group (default = 8)"),
     cl::init(8));
 
+/// Returns true if `Ty` can be widened by the loop vectorizer.
+bool llvm::canWidenType(Type *Ty) {
+  Type *ElTy = Ty;
+  // For now, only allow widening non-packed literal structs where all
+  // element types are the same. This simplifies the cost model and
+  // conversion between scalar and wide types.
+  if (auto *StructTy = dyn_cast<StructType>(Ty);
+      StructTy && !StructTy->isPacked() && StructTy->isLiteral() &&
+      StructTy->containsHomogeneousTypes()) {
+    ElTy = StructTy->elements().front();
+  }
+  return VectorType::isValidElementType(ElTy);
+}
+
 /// Return true if all of the intrinsic's arguments and return type are scalars
 /// for the scalar form of the intrinsic, and vectors for the vector form of the
 /// intrinsic (except operands that are marked as always being scalar by
diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt
index 544f4ea9223d0..7eaf35e10ebc6 100644
--- a/llvm/lib/IR/CMakeLists.txt
+++ b/llvm/lib/IR/CMakeLists.txt
@@ -73,6 +73,7 @@ add_llvm_component_library(LLVMCore
   Value.cpp
   ValueSymbolTable.cpp
   VectorBuilder.cpp
+  VectorUtils.cpp
   Verifier.cpp
   VFABIDemangler.cpp
   RuntimeLibcalls.cpp
diff --git a/llvm/lib/IR/VFABIDemangler.cpp b/llvm/lib/IR/VFABIDemangler.cpp
index cdfb9fbfaa084..6ccd77fd23793 100644
--- a/llvm/lib/IR/VFABIDemangler.cpp
+++ b/llvm/lib/IR/VFABIDemangler.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/VectorUtils.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <limits>
@@ -346,12 +347,15 @@ getScalableECFromSignature(const FunctionType *Signature, const VFISAKind ISA,
   // Also check the return type if not void.
   Type *RetTy = Signature->getReturnType();
   if (!RetTy->isVoidTy()) {
-    std::optional<ElementCount> ReturnEC = getElementCountForTy(ISA, RetTy);
-    // If we have an unknown scalar element type we can't find a reasonable VF.
-    if (!ReturnEC)
-      return std::nullopt;
-    if (ElementCount::isKnownLT(*ReturnEC, MinEC))
-      MinEC = *ReturnEC;
+    for (Type *RetTy : getContainedTypes(RetTy)) {
+      std::optional<ElementCount> ReturnEC = getElementCountForTy(ISA, RetTy);
+      // If we have an unknown scalar element type we can't find a reasonable
+      // VF.
+      if (!ReturnEC)
+        return std::nullopt;
+      if (ElementCount::isKnownLT(*ReturnEC, MinEC))
+        MinEC = *ReturnEC;
+    }
   }
 
   // The SVE Vector function call ABI bases the VF on the widest element types
@@ -566,7 +570,7 @@ FunctionType *VFABI::createFunctionType(const VFInfo &Info,
 
   auto *RetTy = ScalarFTy->getReturnType();
   if (!RetTy->isVoidTy())
-    RetTy = VectorType::get(RetTy, VF);
+    RetTy = ToWideTy(RetTy, VF);
 
   return FunctionType::get(RetTy, VecTypes, false);
 }
diff --git a/llvm/lib/IR/VectorUtils.cpp b/llvm/lib/IR/VectorUtils.cpp
new file mode 100644
index 0000000000000..c89a8eaf2ad1e
--- /dev/null
+++ b/llvm/lib/IR/VectorUtils.cpp
@@ -0,0 +1,69 @@
+//===----------- VectorUtils.cpp - Vector type utility functions ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/VectorUtils.h"
+#include "llvm/ADT/SmallVectorExtras.h"
+
+using namespace llvm;
+
+/// A helper for converting to wider (vector) types. For scalar types, this is
+/// equivalent to calling `ToVectorTy`. For struct types, this returns a new
+/// struct where each element type has been widened to a vector type. Note:
+/// Only unpacked literal struct types are supported.
+Type *llvm::ToWideTy(Type *Ty, ElementCount EC) {
+  if (EC.isScalar())
+    return Ty;
+  auto *StructTy = dyn_cast<StructType>(Ty);
+  if (!StructTy)
+    return ToVectorTy(Ty, EC);
+  assert(StructTy->isLiteral() && !StructTy->isPacked() &&
+         "expected unpacked struct literal");
+  return StructType::get(
+      Ty->getContext(),
+      map_to_vector(StructTy->elements(), [&](Type *ElTy) -> Type * {
+        return VectorType::get(ElTy, EC);
+      }));
+}
+
+/// A helper for converting wide types to narrow (non-vector) types. For vector
+/// types, this is equivalent to calling .getScalarType(). For struct types,
+/// this returns a new struct where each element type has been converted to a
+/// scalar type. Note: Only unpacked literal struct types are supported.
+Type *llvm::ToNarrowTy(Type *Ty) {
+  auto *StructTy = dyn_cast<StructType>(Ty);
+  if (!StructTy)
+    return Ty->getScalarType();
+  assert(StructTy->isLiteral() && !StructTy->isPacked() &&
+         "expected unpacked struct literal");
+  return StructType::get(
+      Ty->getContext(),
+      map_to_vector(StructTy->elements(), [](Type *ElTy) -> Type * {
+        return ElTy->getScalarType();
+      }));
+}
+
+/// Returns the types contained in `Ty`. For struct types, it returns the
+/// elements, all other types are returned directly.
+SmallVector<Type *, 2> llvm::getContainedTypes(Type *Ty) {
+  auto *StructTy = dyn_cast<StructType>(Ty);
+  if (StructTy)
+    return to_vector<2>(StructTy->elements());
+  return {Ty};
+}
+
+/// Returns true if `Ty` is a vector type or a struct of vector types where all
+/// vector types share the same VF.
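+/// For example, { <2 x float>, <2 x float> } is wide, while
+/// { <2 x float>, <4 x float> }, { float, float } and plain float are not.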
+bool llvm::isWideTy(Type *Ty) {
+  auto ContainedTys = getContainedTypes(Ty);
+  if (ContainedTys.empty() || !ContainedTys.front()->isVectorTy())
+    return false;
+  ElementCount VF = cast<VectorType>(ContainedTys.front())->getElementCount();
+  return all_of(ContainedTys, [&](Type *Ty) {
+    return Ty->isVectorTy() && cast<VectorType>(Ty)->getElementCount() == VF;
+  });
+}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 43be72f0f34d4..05d1de9032a6e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -946,11 +946,26 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       if (CI && !VFDatabase::getMappings(*CI).empty())
         VecCallVariantsFound = true;
 
+      // TODO: Tidy up these checks.
+      auto canWidenInst = [](Instruction &I) {
+        Type *InstTy = I.getType();
+        if (isa<CallInst>(I) && isa<StructType>(InstTy) &&
+            canWidenType(InstTy)) {
+          // We can only widen struct calls where the users are extractvalues.
+          for (auto &U : I.uses()) {
+            if (!isa<ExtractValueInst>(U.getUser()))
+              return false;
+          }
+          return true;
+        }
+        return VectorType::isValidElementType(InstTy) || InstTy->isVoidTy();
+      };
+
       // Check that the instruction return type is vectorizable.
       // We can't vectorize casts from vector type to scalar type.
       // Also, we can't vectorize extractelement instructions.
-      if ((!VectorType::isValidElementType(I.getType()) &&
-           !I.getType()->isVoidTy()) ||
+      // TODO: Tidy up these checks.
+      if (!canWidenInst(I) ||
          (isa<CastInst>(I) &&
           !VectorType::isValidElementType(I.getOperand(0)->getType())) ||
          isa<ExtractElementInst>(I)) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 1c8d541ef2c51..e382e73013b8a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -168,7 +168,7 @@ class VPBuilder {
 
   VPInstruction *createOverflowingOp(unsigned Opcode,
                                      std::initializer_list<VPValue *> Operands,
-                                     VPRecipeWithIRFlags::WrapFlagsTy WrapFlags,
+                                     VPRecipeIRFlags::WrapFlagsTy WrapFlags,
                                      DebugLoc DL = {}, const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
@@ -187,9 +187,9 @@ class VPBuilder {
 
   VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL = {},
                     const Twine &Name = "") {
-    return tryInsertInstruction(new VPInstruction(
-        Instruction::BinaryOps::Or, {LHS, RHS},
-        VPRecipeWithIRFlags::DisjointFlagsTy(false), DL, Name));
+    return tryInsertInstruction(
+        new VPInstruction(Instruction::BinaryOps::Or, {LHS, RHS},
+                          VPRecipeIRFlags::DisjointFlagsTy(false), DL, Name));
   }
 
   VPValue *createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = {},
@@ -223,12 +223,12 @@ class VPBuilder {
   VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
                               const Twine &Name = "") {
     return tryInsertInstruction(new VPInstruction(
-        Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(false), DL, Name));
+        Ptr, Offset, VPRecipeIRFlags::GEPFlagsTy(false), DL, Name));
   }
   VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
                                 const Twine &Name = "") {
     return tryInsertInstruction(new VPInstruction(
-        Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(true), DL, Name));
+        Ptr, Offset, VPRecipeIRFlags::GEPFlagsTy(true), DL, Name));
   }
 
   VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8bf92f3480620..fe335c45e5b53 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2861,10 +2861,10 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, return ScalarCallCost; } -static Type *maybeVectorizeType(Type *Elt, ElementCount VF) { - if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) - return Elt; - return VectorType::get(Elt, VF); +static Type *maybeVectorizeType(Type *Ty, ElementCount VF) { + if (VF.isScalar() || !canWidenType(Ty)) + return Ty; + return ToWideTy(Ty, VF); } InstructionCost @@ -3635,9 +3635,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // ExtractValue instructions must be uniform, because the operands are // known to be loop-invariant. - if (auto *EVI = dyn_cast(&I)) { - assert(IsOutOfScope(EVI->getAggregateOperand()) && - "Expected aggregate value to be loop invariant"); + if (auto *EVI = dyn_cast(&I); + EVI && IsOutOfScope(EVI->getAggregateOperand())) { AddToWorklistIfAllowed(EVI); continue; } @@ -5461,10 +5460,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( // and phi nodes. TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { - ScalarCost += TTI.getScalarizationOverhead( - cast(ToVectorTy(I->getType(), VF)), - APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, - /*Extract*/ false, CostKind); + Type *WideTy = ToWideTy(I->getType(), VF); + for (Type *VectorTy : getContainedTypes(WideTy)) { + ScalarCost += TTI.getScalarizationOverhead( + cast(VectorTy), APInt::getAllOnes(VF.getFixedValue()), + /*Insert*/ true, + /*Extract*/ false, CostKind); + } ScalarCost += VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); } @@ -5953,13 +5955,17 @@ InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( return 0; InstructionCost Cost = 0; - Type *RetTy = ToVectorTy(I->getType(), VF); + Type *RetTy = ToWideTy(I->getType(), VF); if (!RetTy->isVoidTy() && - (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) - Cost += TTI.getScalarizationOverhead( - cast(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), - /*Insert*/ true, - /*Extract*/ false, CostKind); + (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) { + + for (Type *VectorTy : getContainedTypes(RetTy)) { + Cost += TTI.getScalarizationOverhead( + cast(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()), + /*Insert*/ true, + /*Extract*/ false, CostKind); + } + } // Some targets keep addresses scalar. if (isa(I) && !TTI.prefersVectorizedAddressing()) @@ -6219,9 +6225,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { bool MaskRequired = Legal->isMaskRequired(CI); // Compute corresponding vector type for return value and arguments. - Type *RetTy = ToVectorTy(ScalarRetTy, VF); + Type *RetTy = ToWideTy(ScalarRetTy, VF); for (Type *ScalarTy : ScalarTys) - Tys.push_back(ToVectorTy(ScalarTy, VF)); + Tys.push_back(ToWideTy(ScalarTy, VF)); // An in-loop reduction using an fmuladd intrinsic is a special case; // we don't want the normal cost for that intrinsic. 
@@ -6398,7 +6404,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
            HasSingleCopyAfterVectorization(I, VF));
     VectorTy = RetTy;
   } else
-    VectorTy = ToVectorTy(RetTy, VF);
+    VectorTy = ToWideTy(RetTy, VF);
 
   if (VF.isVector() && VectorTy->isVectorTy() &&
       !TTI.getNumberOfParts(VectorTy))
@@ -7356,6 +7362,8 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
       return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
     if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
       return &WidenMem->getIngredient();
+    if (auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R))
+      return WidenCall->getUnderlyingCallInstruction();
     return nullptr;
   };
 
@@ -8331,9 +8339,9 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
   return new VPBlendRecipe(Phi, OperandsWithMask);
 }
 
-VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
-                                                   ArrayRef<VPValue *> Operands,
-                                                   VFRange &Range) {
+VPRecipeBase *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
+                                              ArrayRef<VPValue *> Operands,
+                                              VFRange &Range) {
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [this, CI](ElementCount VF) {
         return CM.isScalarWithPredication(CI, VF);
@@ -9043,6 +9051,20 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
       // TODO: Model and preserve debug intrinsics in VPlan.
       for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
         Instruction *Instr = &I;
+
+        // Special case: Handle extractvalues from struct ret calls.
+        if (auto *ExtractValue = dyn_cast<ExtractValueInst>(Instr)) {
+          if (auto *CI =
+                  dyn_cast<CallInst>(ExtractValue->getAggregateOperand())) {
+            auto *R = RecipeBuilder.getRecipe(cast<Instruction>(CI));
+            assert(R->getNumDefinedValues() ==
+                   cast<StructType>(CI->getType())->getNumElements());
+            unsigned Idx = ExtractValue->getIndices()[0];
+            RecipeBuilder.setRecipe(Instr, R->getVPValue(Idx));
+            continue;
+          }
+        }
+
         SmallVector<VPValue *, 4> Operands;
         auto *Phi = dyn_cast<PHINode>(Instr);
         if (Phi && Phi->getParent() == HeaderBB) {
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 5d4a3b555981c..1e37386714d2e 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -54,9 +54,11 @@ class VPRecipeBuilder {
   EdgeMaskCacheTy EdgeMaskCache;
   BlockMaskCacheTy BlockMaskCache;
 
+  using RecipeOrResult = PointerUnion<VPRecipeBase *, VPValue *>;
+
   // VPlan construction support: Hold a mapping from ingredients to
   // their recipe.
-  DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe;
+  DenseMap<Instruction *, RecipeOrResult> Ingredient2Recipe;
 
   /// Cross-iteration reduction & first-order recurrence phis for which we need
   /// to add the incoming value from the backedge after all recipes have been
@@ -95,8 +97,8 @@ class VPRecipeBuilder {
   /// Handle call instructions. If \p CI can be widened for \p Range.Start,
   /// return a new VPWidenCallRecipe or VPWidenIntrinsicRecipe. Range.End may be
   /// decreased to ensure same decision from \p Range.Start to \p Range.End.
-  VPSingleDefRecipe *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands,
-                                    VFRange &Range);
+  VPRecipeBase *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands,
+                               VFRange &Range);
 
   /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
   /// if it can. The function should only be called if the cost-model indicates
@@ -132,6 +134,15 @@ class VPRecipeBuilder {
     Ingredient2Recipe[I] = R;
   }
 
+  // Set the recipe value for a given ingredient.
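+  // This is used for instructions (such as extractvalues of a widened
+  // struct-returning call) whose result maps directly to one of the VPValues
+  // defined by an already-created recipe rather than to a recipe of its own.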
+ void setRecipe(Instruction *I, VPValue *RecipeResult) { + assert(!Ingredient2Recipe.contains(I) && + "Cannot reset recipe for instruction."); + assert(RecipeResult->getDefiningRecipe() && + "Value must be defined by a recipe."); + Ingredient2Recipe[I] = RecipeResult; + } + /// Create the mask for the vector loop header block. void createHeaderMask(); @@ -158,9 +169,11 @@ class VPRecipeBuilder { VPRecipeBase *getRecipe(Instruction *I) { assert(Ingredient2Recipe.count(I) && "Recording this ingredients recipe was not requested"); - assert(Ingredient2Recipe[I] != nullptr && - "Ingredient doesn't have a recipe"); - return Ingredient2Recipe[I]; + assert(Ingredient2Recipe[I] && "Ingredient doesn't have a recipe"); + auto RecipeInfo = Ingredient2Recipe[I]; + if (auto *R = dyn_cast(RecipeInfo)) + return R; + return cast(RecipeInfo)->getDefiningRecipe(); } /// Build a VPReplicationRecipe for \p I. If it is predicated, add the mask as @@ -179,8 +192,11 @@ class VPRecipeBuilder { VPValue *getVPValueOrAddLiveIn(Value *V) { if (auto *I = dyn_cast(V)) { - if (auto *R = Ingredient2Recipe.lookup(I)) - return R->getVPSingleValue(); + if (auto RecipeInfo = Ingredient2Recipe.lookup(I)) { + if (auto *R = dyn_cast(RecipeInfo)) + return R->getVPSingleValue(); + return cast(RecipeInfo); + } } return Plan.getOrAddLiveIn(V); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 4cef47e69f0e3..9aa895b91d1e4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -740,6 +740,8 @@ struct VPCostContext { bool skipCostComputation(Instruction *UI, bool IsVector) const; }; +class VPRecipeIRFlags; + /// VPRecipeBase is a base class modeling a sequence of one or more output IR /// instructions. VPRecipeBase owns the VPValues it defines through VPDef /// and is responsible for deleting its defined values. Single-value @@ -845,6 +847,9 @@ class VPRecipeBase : public ilist_node_with_parent, /// Returns the debug location of the recipe. DebugLoc getDebugLoc() const { return DL; } + /// Returns the IR flags for the recipe. + virtual VPRecipeIRFlags *getIRFlags() { return nullptr; } + protected: /// Compute the cost of this recipe either using a recipe's specialized /// implementation or using the legacy cost model and the underlying @@ -902,7 +907,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPReplicateSC: case VPRecipeBase::VPScalarIVStepsSC: case VPRecipeBase::VPVectorPointerSC: - case VPRecipeBase::VPWidenCallSC: case VPRecipeBase::VPWidenCanonicalIVSC: case VPRecipeBase::VPWidenCastSC: case VPRecipeBase::VPWidenGEPSC: @@ -924,6 +928,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveSC: case VPRecipeBase::VPIRInstructionSC: + case VPRecipeBase::VPWidenCallSC: case VPRecipeBase::VPWidenLoadEVLSC: case VPRecipeBase::VPWidenLoadSC: case VPRecipeBase::VPWidenStoreEVLSC: @@ -956,8 +961,8 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { VPCostContext &Ctx) const override; }; -/// Class to record LLVM IR flag for a recipe along with it. -class VPRecipeWithIRFlags : public VPSingleDefRecipe { +/// Class to record LLVM IR flag for a recipe. 
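+/// Keeping the flags separate from VPSingleDefRecipe allows recipes that
+/// define multiple VPValues (such as VPWidenCallRecipe for calls returning a
+/// struct) to carry IR flags as well.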
+class VPRecipeIRFlags { enum class OperationType : unsigned char { Cmp, OverflowingBinOp, @@ -1019,23 +1024,10 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { unsigned AllFlags; }; -protected: - void transferFlags(VPRecipeWithIRFlags &Other) { - OpType = Other.OpType; - AllFlags = Other.AllFlags; - } - public: - template - VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, DebugLoc DL = {}) - : VPSingleDefRecipe(SC, Operands, DL) { - OpType = OperationType::Other; - AllFlags = 0; - } + VPRecipeIRFlags() : OpType(OperationType::Other), AllFlags(0) {} - template - VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, Instruction &I) - : VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()) { + VPRecipeIRFlags(Instruction &I) { if (auto *Op = dyn_cast(&I)) { OpType = OperationType::Cmp; CmpPredicate = Op->getPredicate(); @@ -1063,53 +1055,22 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { } } - template - VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, - CmpInst::Predicate Pred, DebugLoc DL = {}) - : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::Cmp), - CmpPredicate(Pred) {} + VPRecipeIRFlags(CmpInst::Predicate Pred) + : OpType(OperationType::Cmp), CmpPredicate(Pred) {} - template - VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, - WrapFlagsTy WrapFlags, DebugLoc DL = {}) - : VPSingleDefRecipe(SC, Operands, DL), - OpType(OperationType::OverflowingBinOp), WrapFlags(WrapFlags) {} + VPRecipeIRFlags(WrapFlagsTy WrapFlags) + : OpType(OperationType::OverflowingBinOp), WrapFlags(WrapFlags) {} - template - VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, - FastMathFlags FMFs, DebugLoc DL = {}) - : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::FPMathOp), - FMFs(FMFs) {} + VPRecipeIRFlags(FastMathFlags FMFs) + : OpType(OperationType::FPMathOp), FMFs(FMFs) {} - template - VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, - DisjointFlagsTy DisjointFlags, DebugLoc DL = {}) - : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::DisjointOp), - DisjointFlags(DisjointFlags) {} + VPRecipeIRFlags(DisjointFlagsTy DisjointFlags) + : OpType(OperationType::DisjointOp), DisjointFlags(DisjointFlags) {} -protected: - template - VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, - GEPFlagsTy GEPFlags, DebugLoc DL = {}) - : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::GEPOp), - GEPFlags(GEPFlags) {} + VPRecipeIRFlags(GEPFlagsTy GEPFlags) + : OpType(OperationType::GEPOp), GEPFlags(GEPFlags) {} public: - static inline bool classof(const VPRecipeBase *R) { - return R->getVPDefID() == VPRecipeBase::VPInstructionSC || - R->getVPDefID() == VPRecipeBase::VPWidenSC || - R->getVPDefID() == VPRecipeBase::VPWidenEVLSC || - R->getVPDefID() == VPRecipeBase::VPWidenGEPSC || - R->getVPDefID() == VPRecipeBase::VPWidenCastSC || - R->getVPDefID() == VPRecipeBase::VPReplicateSC || - R->getVPDefID() == VPRecipeBase::VPVectorPointerSC; - } - - static inline bool classof(const VPUser *U) { - auto *R = dyn_cast(U); - return R && classof(R); - } - /// Drop all poison-generating flags. 
void dropPoisonGeneratingFlags() { // NOTE: This needs to be kept in-sync with @@ -1218,6 +1179,53 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { #endif }; +class VPSingleDefRecipeWithIRFlags : public VPSingleDefRecipe, + public VPRecipeIRFlags { +public: + template + VPSingleDefRecipeWithIRFlags(const unsigned char SC, IterT Operands, + DebugLoc DL = {}) + : VPSingleDefRecipe(SC, Operands, DL), VPRecipeIRFlags() {} + + template + VPSingleDefRecipeWithIRFlags(const unsigned char SC, IterT Operands, + Instruction &I) + : VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()), + VPRecipeIRFlags(I) {} + + template + VPSingleDefRecipeWithIRFlags(const unsigned char SC, IterT Operands, + CmpInst::Predicate Pred, DebugLoc DL = {}) + : VPSingleDefRecipe(SC, Operands, DL), VPRecipeIRFlags(Pred) {} + template + VPSingleDefRecipeWithIRFlags(const unsigned char SC, IterT Operands, + VPRecipeIRFlags::WrapFlagsTy WrapFlags, + DebugLoc DL = {}) + : VPSingleDefRecipe(SC, Operands, DL), VPRecipeIRFlags(WrapFlags) {} + + template + VPSingleDefRecipeWithIRFlags(const unsigned char SC, IterT Operands, + FastMathFlags FMFs, DebugLoc DL = {}) + : VPSingleDefRecipe(SC, Operands, DL), VPRecipeIRFlags(FMFs) {} + + template + VPSingleDefRecipeWithIRFlags(const unsigned char SC, IterT Operands, + VPRecipeIRFlags::DisjointFlagsTy DisjointFlags, + DebugLoc DL = {}) + : VPSingleDefRecipe(SC, Operands, DL), VPRecipeIRFlags(DisjointFlags) {} + + virtual VPRecipeIRFlags *getIRFlags() override { + return static_cast(this); + } + +protected: + template + VPSingleDefRecipeWithIRFlags(const unsigned char SC, IterT Operands, + VPRecipeIRFlags::GEPFlagsTy GEPFlags, + DebugLoc DL = {}) + : VPSingleDefRecipe(SC, Operands, DL), VPRecipeIRFlags(GEPFlags) {} +}; + /// Helper to access the operand that contains the unroll part for this recipe /// after unrolling. template class VPUnrollPartAccessor { @@ -1234,7 +1242,7 @@ template class VPUnrollPartAccessor { /// While as any Recipe it may generate a sequence of IR instructions when /// executed, these instructions would always form a single-def expression as /// the VPInstruction is also a single def-use vertex. 
-class VPInstruction : public VPRecipeWithIRFlags, +class VPInstruction : public VPSingleDefRecipeWithIRFlags, public VPUnrollPartAccessor<1> { friend class VPlanSlp; @@ -1309,7 +1317,7 @@ class VPInstruction : public VPRecipeWithIRFlags, public: VPInstruction(unsigned Opcode, ArrayRef Operands, DebugLoc DL, const Twine &Name = "") - : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL), + : VPSingleDefRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL), Opcode(Opcode), Name(Name.str()) {} VPInstruction(unsigned Opcode, std::initializer_list Operands, @@ -1320,22 +1328,27 @@ class VPInstruction : public VPRecipeWithIRFlags, VPValue *B, DebugLoc DL = {}, const Twine &Name = ""); VPInstruction(unsigned Opcode, std::initializer_list Operands, - WrapFlagsTy WrapFlags, DebugLoc DL = {}, const Twine &Name = "") - : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, WrapFlags, DL), + VPRecipeIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL = {}, + const Twine &Name = "") + : VPSingleDefRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, + WrapFlags, DL), Opcode(Opcode), Name(Name.str()) {} VPInstruction(unsigned Opcode, std::initializer_list Operands, - DisjointFlagsTy DisjointFlag, DebugLoc DL = {}, + VPRecipeIRFlags::DisjointFlagsTy DisjointFlag, DebugLoc DL = {}, const Twine &Name = "") - : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DisjointFlag, DL), + : VPSingleDefRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, + DisjointFlag, DL), Opcode(Opcode), Name(Name.str()) { assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint"); } - VPInstruction(VPValue *Ptr, VPValue *Offset, GEPFlagsTy Flags, - DebugLoc DL = {}, const Twine &Name = "") - : VPRecipeWithIRFlags(VPDef::VPInstructionSC, - ArrayRef({Ptr, Offset}), Flags, DL), + VPInstruction(VPValue *Ptr, VPValue *Offset, + VPRecipeIRFlags::GEPFlagsTy Flags, DebugLoc DL = {}, + const Twine &Name = "") + : VPSingleDefRecipeWithIRFlags(VPDef::VPInstructionSC, + ArrayRef({Ptr, Offset}), Flags, + DL), Opcode(VPInstruction::PtrAdd), Name(Name.str()) {} VPInstruction(unsigned Opcode, std::initializer_list Operands, @@ -1346,7 +1359,7 @@ class VPInstruction : public VPRecipeWithIRFlags, VPInstruction *clone() override { SmallVector Operands(operands()); auto *New = new VPInstruction(Opcode, Operands, getDebugLoc(), Name); - New->transferFlags(*this); + *New->getIRFlags() = *getIRFlags(); return New; } @@ -1458,14 +1471,15 @@ class VPIRInstruction : public VPRecipeBase { /// opcode and operands of the recipe. This recipe covers most of the /// traditional vectorization cases where each recipe transforms into a /// vectorized version of itself. -class VPWidenRecipe : public VPRecipeWithIRFlags { +class VPWidenRecipe : public VPSingleDefRecipeWithIRFlags { unsigned Opcode; protected: template VPWidenRecipe(unsigned VPDefOpcode, Instruction &I, iterator_range Operands) - : VPRecipeWithIRFlags(VPDefOpcode, Operands, I), Opcode(I.getOpcode()) {} + : VPSingleDefRecipeWithIRFlags(VPDefOpcode, Operands, I), + Opcode(I.getOpcode()) {} public: template @@ -1476,7 +1490,7 @@ class VPWidenRecipe : public VPRecipeWithIRFlags { VPWidenRecipe *clone() override { auto *R = new VPWidenRecipe(*getUnderlyingInstr(), operands()); - R->transferFlags(*this); + *R->getIRFlags() = *getIRFlags(); return R; } @@ -1510,8 +1524,6 @@ class VPWidenRecipe : public VPRecipeWithIRFlags { /// A recipe for widening operations with vector-predication intrinsics with /// explicit vector length (EVL). 
class VPWidenEVLRecipe : public VPWidenRecipe { - using VPRecipeWithIRFlags::transferFlags; - public: template VPWidenEVLRecipe(Instruction &I, iterator_range Operands, VPValue &EVL) @@ -1520,7 +1532,7 @@ class VPWidenEVLRecipe : public VPWidenRecipe { } VPWidenEVLRecipe(VPWidenRecipe &W, VPValue &EVL) : VPWidenEVLRecipe(*W.getUnderlyingInstr(), W.operands(), EVL) { - transferFlags(W); + *getIRFlags() = *W.getIRFlags(); } ~VPWidenEVLRecipe() override = default; @@ -1556,7 +1568,7 @@ class VPWidenEVLRecipe : public VPWidenRecipe { }; /// VPWidenCastRecipe is a recipe to create vector cast instructions. -class VPWidenCastRecipe : public VPRecipeWithIRFlags { +class VPWidenCastRecipe : public VPSingleDefRecipeWithIRFlags { /// Cast instruction opcode. Instruction::CastOps Opcode; @@ -1566,14 +1578,14 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags { public: VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, CastInst &UI) - : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), Opcode(Opcode), - ResultTy(ResultTy) { + : VPSingleDefRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), + Opcode(Opcode), ResultTy(ResultTy) { assert(UI.getOpcode() == Opcode && "opcode of underlying cast doesn't match"); } VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy) - : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), Opcode(Opcode), + : VPSingleDefRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), Opcode(Opcode), ResultTy(ResultTy) {} ~VPWidenCastRecipe() override = default; @@ -1643,7 +1655,7 @@ class VPScalarCastRecipe : public VPSingleDefRecipe { }; /// A recipe for widening vector intrinsics. -class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags { +class VPWidenIntrinsicRecipe : public VPSingleDefRecipeWithIRFlags { /// ID of the vector intrinsic to widen. Intrinsic::ID VectorIntrinsicID; @@ -1663,7 +1675,8 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags { VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID, ArrayRef CallArguments, Type *Ty, DebugLoc DL = {}) - : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI), + : VPSingleDefRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, + CI), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty), MayReadFromMemory(CI.mayReadFromMemory()), MayWriteToMemory(CI.mayWriteToMemory()), @@ -1673,7 +1686,7 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags { ArrayRef CallArguments, Type *Ty, bool MayReadFromMemory, bool MayWriteToMemory, bool MayHaveSideEffects, DebugLoc DL = {}) - : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments), + : VPSingleDefRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty), MayReadFromMemory(MayReadFromMemory), MayWriteToMemory(MayWriteToMemory), @@ -1721,28 +1734,35 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags { }; /// A recipe for widening Call instructions using library calls. -class VPWidenCallRecipe : public VPRecipeWithIRFlags { +class VPWidenCallRecipe : public VPRecipeBase, public VPRecipeIRFlags { /// Variant stores a pointer to the chosen function. There is a 1:1 mapping /// between a given VF and the chosen vectorized variant, so there will be a /// different VPlan for each VF with a valid variant. 
Function *Variant; + CallInst *CI; + public: - VPWidenCallRecipe(Value *UV, Function *Variant, + VPWidenCallRecipe(CallInst *CI, Function *Variant, ArrayRef CallArguments, DebugLoc DL = {}) - : VPRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments, - *cast(UV)), - Variant(Variant) { + : VPRecipeBase(VPDef::VPWidenCallSC, CallArguments, DL), + VPRecipeIRFlags(*CI), Variant(Variant), CI(CI) { assert( isa(getOperand(getNumOperands() - 1)->getLiveInIRValue()) && "last operand must be the called function"); + for (Type *Ty : getContainedTypes(CI->getType())) { + (void)Ty; + new VPValue(CI, this); + } } + CallInst *getUnderlyingCallInstruction() const { return CI; } + ~VPWidenCallRecipe() override = default; VPWidenCallRecipe *clone() override { - return new VPWidenCallRecipe(getUnderlyingValue(), Variant, - {op_begin(), op_end()}, getDebugLoc()); + return new VPWidenCallRecipe(CI, Variant, {op_begin(), op_end()}, + getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenCallSC) @@ -1852,7 +1872,7 @@ struct VPWidenSelectRecipe : public VPSingleDefRecipe { }; /// A recipe for handling GEP instructions. -class VPWidenGEPRecipe : public VPRecipeWithIRFlags { +class VPWidenGEPRecipe : public VPSingleDefRecipeWithIRFlags { bool isPointerLoopInvariant() const { return getOperand(0)->isDefinedOutsideLoopRegions(); } @@ -1870,7 +1890,7 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags { public: template VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range Operands) - : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP) {} + : VPSingleDefRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP) {} ~VPWidenGEPRecipe() override = default; @@ -1894,7 +1914,7 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags { /// A recipe to compute the pointers for widened memory accesses of IndexTy for /// all parts. If IsReverse is true, compute pointers for accessing the input in /// reverse order per part. -class VPVectorPointerRecipe : public VPRecipeWithIRFlags, +class VPVectorPointerRecipe : public VPSingleDefRecipeWithIRFlags, public VPUnrollPartAccessor<1> { Type *IndexedTy; bool IsReverse; @@ -1902,8 +1922,9 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, public: VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool IsReverse, bool IsInBounds, DebugLoc DL) - : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef(Ptr), - GEPFlagsTy(IsInBounds), DL), + : VPSingleDefRecipeWithIRFlags( + VPDef::VPVectorPointerSC, ArrayRef(Ptr), + VPRecipeIRFlags::GEPFlagsTy(IsInBounds), DL), IndexedTy(IndexedTy), IsReverse(IsReverse) {} VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC) @@ -2581,7 +2602,7 @@ class VPReductionEVLRecipe : public VPReductionRecipe { /// copies of the original scalar type, one per lane, instead of producing a /// single copy of widened type for all lanes. If the instruction is known to be /// uniform only one copy, per lane zero, will be generated. -class VPReplicateRecipe : public VPRecipeWithIRFlags { +class VPReplicateRecipe : public VPSingleDefRecipeWithIRFlags { /// Indicator if only a single replica per lane is needed. 
bool IsUniform; @@ -2592,7 +2613,7 @@ class VPReplicateRecipe : public VPRecipeWithIRFlags { template VPReplicateRecipe(Instruction *I, iterator_range Operands, bool IsUniform, VPValue *Mask = nullptr) - : VPRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, *I), + : VPSingleDefRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, *I), IsUniform(IsUniform), IsPredicated(Mask) { if (Mask) addOperand(Mask); @@ -2604,7 +2625,7 @@ class VPReplicateRecipe : public VPRecipeWithIRFlags { auto *Copy = new VPReplicateRecipe(getUnderlyingInstr(), operands(), IsUniform, isPredicated() ? getMask() : nullptr); - Copy->transferFlags(*this); + *Copy->getIRFlags() = *getIRFlags(); return Copy; } @@ -3233,15 +3254,15 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe { /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their scalar values. -class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags, +class VPScalarIVStepsRecipe : public VPSingleDefRecipeWithIRFlags, public VPUnrollPartAccessor<2> { Instruction::BinaryOps InductionOpcode; public: VPScalarIVStepsRecipe(VPValue *IV, VPValue *Step, Instruction::BinaryOps Opcode, FastMathFlags FMFs) - : VPRecipeWithIRFlags(VPDef::VPScalarIVStepsSC, - ArrayRef({IV, Step}), FMFs), + : VPSingleDefRecipeWithIRFlags(VPDef::VPScalarIVStepsSC, + ArrayRef({IV, Step}), FMFs), InductionOpcode(Opcode) {} VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV, diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 3eb5f3f40f842..c2f5649c3622d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -136,9 +136,14 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) { llvm_unreachable("Unhandled opcode!"); } -Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) { - auto &CI = *cast(R->getUnderlyingInstr()); - return CI.getType(); +Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R, + const VPValue *V) { + auto &CI = *cast(R->getUnderlyingCallInstruction()); + for (auto [I, Ty] : enumerate(getContainedTypes(CI.getType()))) { + if (R->getVPValue(I) == V) + return Ty; + } + llvm_unreachable("Unexpected call value!"); } Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) { @@ -267,12 +272,13 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { return inferScalarType(R->getOperand(0)); }) .Case( + VPReplicateRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>( [this](const auto *R) { return inferScalarTypeForRecipe(R); }) .Case([](const VPWidenIntrinsicRecipe *R) { return R->getResultType(); }) + .Case( + [&](const auto *R) { return inferScalarTypeForRecipe(R, V); }) .Case([V](const VPInterleaveRecipe *R) { // TODO: Use info from interleave group. 
return V->getUnderlyingValue()->getType(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h index cc21870bee2e3..140e5ac3359b6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h @@ -47,7 +47,7 @@ class VPTypeAnalysis { Type *inferScalarTypeForRecipe(const VPBlendRecipe *R); Type *inferScalarTypeForRecipe(const VPInstruction *R); - Type *inferScalarTypeForRecipe(const VPWidenCallRecipe *R); + Type *inferScalarTypeForRecipe(const VPWidenCallRecipe *R, const VPValue *V); Type *inferScalarTypeForRecipe(const VPWidenRecipe *R); Type *inferScalarTypeForRecipe(const VPWidenIntOrFpInductionRecipe *R); Type *inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 6fe30356e8c91..80bcc91192cde 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -294,6 +294,8 @@ InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { UI = IG->getInsertPos(); else if (auto *WidenMem = dyn_cast(this)) UI = &WidenMem->getIngredient(); + else if (auto *WidenCall = dyn_cast(this)) + UI = WidenCall->getUnderlyingCallInstruction(); InstructionCost RecipeCost; if (UI && Ctx.skipCostComputation(UI, VF.isVector())) { @@ -329,7 +331,7 @@ InstructionCost VPSingleDefRecipe::computeCost(ElementCount VF, return UI ? Ctx.getLegacyCost(UI, VF) : 0; } -FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { +FastMathFlags VPRecipeIRFlags::getFastMathFlags() const { assert(OpType == OperationType::FPMathOp && "recipe doesn't have fast math flags"); FastMathFlags Res; @@ -361,8 +363,8 @@ unsigned VPUnrollPartAccessor::getUnrollPart(VPUser &U) const { VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL, const Twine &Name) - : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef({A, B}), - Pred, DL), + : VPSingleDefRecipeWithIRFlags(VPDef::VPInstructionSC, + ArrayRef({A, B}), Pred, DL), Opcode(Opcode), Name(Name.str()) { assert(Opcode == Instruction::ICmp && "only ICmp predicates supported at the moment"); @@ -371,7 +373,7 @@ VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, VPInstruction::VPInstruction(unsigned Opcode, std::initializer_list Operands, FastMathFlags FMFs, DebugLoc DL, const Twine &Name) - : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL), + : VPSingleDefRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL), Opcode(Opcode), Name(Name.str()) { // Make sure the VPInstruction is a floating-point operation. 
assert(isFPMathOp() && "this op can't take fast-math flags"); @@ -838,7 +840,10 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, } printFlags(O); - printOperands(O, SlotTracker); + if (getNumOperands() > 0) { + O << " "; + printOperands(O, SlotTracker); + } if (auto DL = getDebugLoc()) { O << ", !dbg "; @@ -912,7 +917,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { assert(Variant != nullptr && "Can't create vector function."); - auto *CI = cast_or_null(getUnderlyingValue()); + auto *CI = getUnderlyingCallInstruction(); SmallVector OpBundles; if (CI) CI->getOperandBundlesAsDefs(OpBundles); @@ -920,8 +925,16 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { CallInst *V = State.Builder.CreateCall(Variant, Args, OpBundles); setFlags(V); - if (!V->getType()->isVoidTy()) - State.set(this, V); + if (!V->getType()->isVoidTy()) { + if (getNumDefinedValues() > 1) { + for (auto [I, Def] : enumerate(definedValues())) { + Value *AggV = State.Builder.CreateExtractValue(V, I); + State.set(Def, AggV); + } + } else { + State.set(getVPSingleValue(), V); + } + } State.addMetadata(V, CI); } @@ -942,7 +955,9 @@ void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, if (CalledFn->getReturnType()->isVoidTy()) O << "void "; else { - printAsOperand(O, SlotTracker); + interleaveComma(definedValues(), O, [&O, &SlotTracker](VPValue *Def) { + Def->printAsOperand(O, SlotTracker); + }); O << " = "; } @@ -1026,7 +1041,7 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, Arguments.push_back(V); } - Type *RetTy = ToVectorTy(Ctx.Types.inferScalarType(this), VF); + Type *RetTy = ToWideTy(Ctx.Types.inferScalarType(this), VF); SmallVector ParamTys; for (unsigned I = 0; I != getNumOperands(); ++I) ParamTys.push_back( @@ -1065,7 +1080,7 @@ void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent, O << "call"; printFlags(O); - O << getIntrinsicName() << "("; + O << " " << getIntrinsicName() << "("; interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) { Op->printAsOperand(O, SlotTracker); @@ -1193,8 +1208,7 @@ void VPWidenSelectRecipe::execute(VPTransformState &State) { State.addMetadata(Sel, dyn_cast_or_null(getUnderlyingValue())); } -VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy( - const FastMathFlags &FMF) { +VPRecipeIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) { AllowReassoc = FMF.allowReassoc(); NoNaNs = FMF.noNaNs(); NoInfs = FMF.noInfs(); @@ -1205,7 +1219,7 @@ VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy( } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { +void VPRecipeIRFlags::printFlags(raw_ostream &O) const { switch (OpType) { case OperationType::Cmp: O << " " << CmpInst::getPredicateName(getPredicate()); @@ -1238,8 +1252,6 @@ void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { case OperationType::Other: break; } - if (getNumOperands() > 0) - O << " "; } #endif @@ -1446,7 +1458,10 @@ void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, printAsOperand(O, SlotTracker); O << " = " << Instruction::getOpcodeName(Opcode); printFlags(O); - printOperands(O, SlotTracker); + if (getNumOperands() > 0) { + O << " "; + printOperands(O, SlotTracker); + } } void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent, @@ -1455,7 +1470,10 @@ void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent, printAsOperand(O, SlotTracker); O << " = vp." 
<< Instruction::getOpcodeName(getOpcode()); printFlags(O); - printOperands(O, SlotTracker); + if (getNumOperands() > 0) { + O << " "; + printOperands(O, SlotTracker); + } } #endif @@ -1477,9 +1495,12 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-CAST "; printAsOperand(O, SlotTracker); - O << " = " << Instruction::getOpcodeName(Opcode) << " "; + O << " = " << Instruction::getOpcodeName(Opcode); printFlags(O); - printOperands(O, SlotTracker); + if (getNumOperands() > 0) { + O << " "; + printOperands(O, SlotTracker); + } O << " to " << *getResultType(); } #endif @@ -1863,6 +1884,7 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, printAsOperand(O, SlotTracker); O << " = getelementptr"; printFlags(O); + O << " "; printOperands(O, SlotTracker); } #endif @@ -2186,7 +2208,7 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, if (auto *CB = dyn_cast(getUnderlyingInstr())) { O << "call"; printFlags(O); - O << "@" << CB->getCalledFunction()->getName() << "("; + O << " @" << CB->getCalledFunction()->getName() << "("; interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)), O, [&O, &SlotTracker](VPValue *Op) { Op->printAsOperand(O, SlotTracker); @@ -2195,7 +2217,10 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, } else { O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()); printFlags(O); - printOperands(O, SlotTracker); + if (getNumOperands() > 0) { + O << " "; + printOperands(O, SlotTracker); + } } if (shouldPack()) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index faec08cac1875..b11d058c9e82e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -843,8 +843,9 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { continue; for (VPUser *U : collectUsersRecursively(PhiR)) - if (auto *RecWithFlags = dyn_cast(U)) { - RecWithFlags->dropPoisonGeneratingFlags(); + if (auto *R = dyn_cast(U)) { + if (auto *IRFlags = R->getIRFlags()) + IRFlags->dropPoisonGeneratingFlags(); } } } @@ -1092,8 +1093,8 @@ void VPlanTransforms::truncateToMinimalBitwidths( // Any wrapping introduced by shrinking this operation shouldn't be // considered undefined behavior. So, we can't unconditionally copy // arithmetic wrapping flags to VPW. - if (auto *VPW = dyn_cast(&R)) - VPW->dropPoisonGeneratingFlags(); + if (auto *Flags = R.getIRFlags()) + Flags->dropPoisonGeneratingFlags(); using namespace llvm::VPlanPatternMatch; if (OldResSizeInBits != NewResSizeInBits && @@ -1529,7 +1530,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( // This recipe contributes to the address computation of a widen // load/store. If the underlying instruction has poison-generating flags, // drop them directly. - if (auto *RecWithFlags = dyn_cast(CurRec)) { + if (auto *Flags = CurRec->getIRFlags()) { VPValue *A, *B; using namespace llvm::VPlanPatternMatch; // Dropping disjoint from an OR may yield incorrect results, as some @@ -1537,25 +1538,25 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( // for dependence analysis). Instead, replace it with an equivalent Add. // This is possible as all users of the disjoint OR only access lanes // where the operands are disjoint or poison otherwise. 
- if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) && - RecWithFlags->isDisjoint()) { - VPBuilder Builder(RecWithFlags); + if (match(CurRec, m_BinaryOr(m_VPValue(A), m_VPValue(B))) && + Flags->isDisjoint()) { + VPValue *OldValue = CurRec->getVPSingleValue(); + VPBuilder Builder(CurRec); VPInstruction *New = Builder.createOverflowingOp( - Instruction::Add, {A, B}, {false, false}, - RecWithFlags->getDebugLoc()); - New->setUnderlyingValue(RecWithFlags->getUnderlyingValue()); - RecWithFlags->replaceAllUsesWith(New); - RecWithFlags->eraseFromParent(); + Instruction::Add, {A, B}, {false, false}, CurRec->getDebugLoc()); + New->setUnderlyingValue(OldValue->getUnderlyingValue()); + OldValue->replaceAllUsesWith(New); + CurRec->eraseFromParent(); CurRec = New; } else - RecWithFlags->dropPoisonGeneratingFlags(); + Flags->dropPoisonGeneratingFlags(); } else { Instruction *Instr = dyn_cast_or_null( CurRec->getVPSingleValue()->getUnderlyingValue()); (void)Instr; assert((!Instr || !Instr->hasPoisonGeneratingFlags()) && "found instruction with poison generating flags not covered by " - "VPRecipeWithIRFlags"); + "without VPRecipeIRFlags"); } // Add new definitions to the worklist. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll index 9be068ce880ea..f6f42338959e3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-vplan.ll @@ -64,7 +64,7 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: CLONE [[GEP_IDX:.*]] = getelementptr inbounds ir<%indices>, [[STEPS]] ; CHECK-NEXT: [[VECP_IDX:vp.*]] = vector-pointer [[GEP_IDX]] ; CHECK-NEXT: WIDEN [[IDX:.*]] = load [[VECP_IDX]] -; CHECK-NEXT: WIDEN-CAST [[EXT_IDX:.*]] = zext [[IDX]] to i64 +; CHECK-NEXT: WIDEN-CAST [[EXT_IDX:.*]] = zext [[IDX]] to i64 ; CHECK-NEXT: WIDEN-GEP Inv[Var] [[GEP_BUCKET:.*]] = getelementptr inbounds ir<%buckets>, [[EXT_IDX]] ; CHECK-NEXT: WIDEN-HISTOGRAM buckets: [[GEP_BUCKET]], inc: ir<1> ; CHECK-NEXT: EMIT [[IV_NEXT]] = add nuw [[IV]], [[VFxUF]] diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll new file mode 100644 index 0000000000000..cf87ad01fcfc8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll @@ -0,0 +1,202 @@ +; RUN: opt < %s -passes=loop-vectorize,dce,instcombine -force-vector-width=2 -force-vector-interleave=1 -S | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize,dce,instcombine -force-vector-width=2 -force-vector-interleave=1 -pass-remarks='loop-vectorize' -disable-output -S 2>&1 | FileCheck %s --check-prefix=CHECK-REMARKS + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + +; Tests basic vectorization of homogeneous struct literal returns. 
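+; Each scalar call returning a homogeneous struct such as { float, float } is
+; expected to become a single call of the mapped vector variant returning
+; { <2 x float>, <2 x float> }, with the lanes split out again via extractvalue.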
+ +; CHECK-REMARKS-COUNT-3: remark: {{.*}} vectorized loop +; CHECK-REMARKS-COUNT-2: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized +; CHECK-REMARKS: remark: {{.*}} loop not vectorized: call instruction cannot be vectorized + +define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: define void @struct_return_f32_widen +; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; CHECK: vector.body: +; CHECK: [[WIDE_CALL:%.*]] = call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD:%.*]]) +; CHECK: [[WIDE_A:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_CALL]], 0 +; CHECK: [[WIDE_B:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_CALL]], 1 +; CHECK: store <2 x float> [[WIDE_A]], ptr {{%.*}}, align 4 +; CHECK: store <2 x float> [[WIDE_B]], ptr {{%.*}}, align 4 +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv + %in_val = load float, ptr %arrayidx, align 4 + %call = tail call { float, float } @foo(float %in_val) #0 + %extract_a = extractvalue { float, float } %call, 0 + %extract_b = extractvalue { float, float } %call, 1 + %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv + store float %extract_a, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv + store float %extract_b, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: define void @struct_return_f64_widen +; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; CHECK: vector.body: +; CHECK: [[WIDE_CALL:%.*]] = call { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double> [[WIDE_LOAD:%.*]]) +; CHECK: [[WIDE_A:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 0 +; CHECK: [[WIDE_B:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 1 +; CHECK: store <2 x double> [[WIDE_A]], ptr {{%.*}}, align 8 +; CHECK: store <2 x double> [[WIDE_B]], ptr {{%.*}}, align 8 +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %in, i64 %iv + %in_val = load double, ptr %arrayidx, align 8 + %call = tail call { double, double } @bar(double %in_val) #1 + %extract_a = extractvalue { double, double } %call, 0 + %extract_b = extractvalue { double, double } %call, 1 + %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %iv + store double %extract_a, ptr %arrayidx2, align 8 + %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %iv + store double %extract_b, ptr %arrayidx4, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) { +; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks +; CHECK-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]]) +; CHECK: entry: +; CHECK: br i1 false, label %scalar.ph, label 
%vector.memcheck +; CHECK: vector.memcheck: +; CHECK: vector.body: +; CHECK: call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD:%.*]]) +; CHECK: for.body: +; CHECK call { float, float } @foo(float [[LOAD:%.*]]) +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv + %in_val = load float, ptr %arrayidx, align 4 + %call = tail call { float, float } @foo(float %in_val) #0 + %extract_a = extractvalue { float, float } %call, 0 + %extract_b = extractvalue { float, float } %call, 1 + %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv + store float %extract_a, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv + store float %extract_b, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +; Negative test. Widening structs with mixed element types is not supported. +define void @negative_mixed_element_type_struct_return(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: define void @negative_mixed_element_type_struct_return +; CHECK-NOT: vector.body: +; CHECK-NOT: call {{.*}} @fixed_vec_baz +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv + %in_val = load float, ptr %arrayidx, align 4 + %call = tail call { float, i32 } @baz(float %in_val) #2 + %extract_a = extractvalue { float, i32 } %call, 0 + %extract_b = extractvalue { float, i32 } %call, 1 + %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv + store float %extract_a, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds i32, ptr %out_b, i64 %iv + store i32 %extract_b, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +%named_struct = type { double, double } + +; Negative test. Widening non-literal structs is not supported. +define void @test_named_struct_return(ptr noalias readonly %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: define void @test_named_struct_return +; CHECK-NOT: vector.body: +; CHECK-NOT: call {{.*}} @fixed_vec_bar +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %in, i64 %iv + %in_val = load double, ptr %arrayidx, align 8 + %call = tail call %named_struct @bar_named(double %in_val) #3 + %extract_a = extractvalue %named_struct %call, 0 + %extract_b = extractvalue %named_struct %call, 1 + %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %iv + store double %extract_a, ptr %arrayidx2, align 8 + %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %iv + store double %extract_b, ptr %arrayidx4, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +; TODO: Allow mixed-struct type vectorization and mark overflow intrinsics as trivially vectorizable. 
+define void @test_overflow_intrinsic(ptr noalias readonly %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: define void @test_overflow_intrinsic +; CHECK-NOT: vector.body: +; CHECK-NOT: @llvm.sadd.with.overflow.v{{.+}}i32 +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv + %in_val = load i32, ptr %arrayidx, align 4 + %call = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %in_val, i32 %in_val) + %extract_ret = extractvalue { i32, i1 } %call, 0 + %extract_overflow = extractvalue { i32, i1 } %call, 1 + %zext_overflow = zext i1 %extract_overflow to i8 + %arrayidx2 = getelementptr inbounds i32, ptr %out_a, i64 %iv + store i32 %extract_ret, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds i8, ptr %out_b, i64 %iv + store i8 %zext_overflow, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare { float, float } @foo(float) +declare { double, double } @bar(double) +declare { float, i32 } @baz(float) +declare %named_struct @bar_named(double) + +declare { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float>) +declare { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double>) +declare { <2 x float>, <2 x i32> } @fixed_vec_baz(<2 x float>) + +attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_foo(fixed_vec_foo)" } +attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_bar(fixed_vec_bar)" } +attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_baz(fixed_vec_baz)" } +attributes #3 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_bar_named(fixed_vec_bar)" } diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll new file mode 100644 index 0000000000000..7e6d341fd5569 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-struct-return.ll @@ -0,0 +1,58 @@ +; REQUIRES: asserts +; RUN: opt < %s -passes=loop-vectorize,dce,instcombine -force-vector-width=2 -force-vector-interleave=1 -debug-only=loop-vectorize -disable-output -S 2>&1 | FileCheck %s + +define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; CHECK-LABEL: LV: Checking a loop in 'struct_return_f32_widen' +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF * UF +; CHECK-NEXT: Live-in vp<%1> = vector-trip-count +; CHECK-NEXT: Live-in ir<1024> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%7> +; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%in>, vp<%3> +; CHECK-NEXT: vp<%4> = vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%in_val> = load vp<%4> +; CHECK-NEXT: WIDEN-CALL ir<%call>, ir<%call>.1 = call @foo(ir<%in_val>) (using library function: fixed_vec_foo) +; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%out_a>, vp<%3> +; CHECK-NEXT: vp<%5> = vector-pointer ir<%arrayidx2> +; CHECK-NEXT: WIDEN store vp<%5>, ir<%call> +; CHECK-NEXT: CLONE ir<%arrayidx4> = getelementptr inbounds ir<%out_b>, vp<%3> +; CHECK-NEXT: vp<%6> = vector-pointer 
ir<%arrayidx4> +; CHECK-NEXT: WIDEN store vp<%6>, ir<%call>.1 +; CHECK-NEXT: EMIT vp<%7> = add nuw vp<%2>, vp<%0> +; CHECK-NEXT: EMIT branch-on-count vp<%7>, vp<%1> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %iv + %in_val = load float, ptr %arrayidx, align 4 + %call = tail call { float, float } @foo(float %in_val) #0 + %extract_a = extractvalue { float, float } %call, 0 + %extract_b = extractvalue { float, float } %call, 1 + %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %iv + store float %extract_a, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %iv + store float %extract_b, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare { float, float } @foo(float) + +declare { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float>) + +attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_foo(fixed_vec_foo)" } diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 376b00224eb57..02126bdd3e5c7 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -907,7 +907,7 @@ TEST(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) { EXPECT_TRUE(isa(BaseR)); EXPECT_EQ(&Recipe, BaseR); - VPValue *VPV = &Recipe; + VPValue *VPV = Recipe.getVPSingleValue(); EXPECT_TRUE(VPV->getDefiningRecipe()); EXPECT_EQ(&Recipe, VPV->getDefiningRecipe());