Skip to content

Commit cc6fcd3

Browse files
committed
[LV] Add initial support for vectorizing literal struct return values
This patch adds initial support for vectorizing literal struct return values. Currently, this is limited to the case where the struct is homogeneous (all elements have the same type) and not packed. The users of the call also must all be `extractvalue` instructions. The intended use case for this is vectorizing intrinsics such as: ``` declare { float, float } @llvm.sincos.f32(float %x) ``` Mapping them to structure-returning library calls such as: ``` declare { <4 x float>, <4 x float> } @Sleef_sincosf4_u10advsimd(<4 x float>) ``` Or their widened form (such as `@llvm.sincos.v4f32` in this case). Implementing this required two main changes: 1. Supporting widening `extractvalue` 2. Adding support for vectorized struct types in LV * This is mostly limited to parts of the cost model and scalarization Since the supported use case is narrow, the required changes are relatively small.
1 parent b2769c0 commit cc6fcd3

File tree

14 files changed

+580
-103
lines changed

14 files changed

+580
-103
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1473,6 +1473,12 @@ class TargetTransformInfo {
14731473
TTI::TargetCostKind CostKind,
14741474
unsigned Index = -1) const;
14751475

1476+
/// \return The expected cost of aggregate inserts and extracts. This is
1477+
/// used when the instruction is not available; a typical use case is to
1478+
/// provision the cost of vectorization/scalarization in vectorizer passes.
1479+
InstructionCost getInsertExtractValueCost(unsigned Opcode,
1480+
TTI::TargetCostKind CostKind) const;
1481+
14761482
/// \return The cost of replication shuffle of \p VF elements typed \p EltTy
14771483
/// \p ReplicationFactor times.
14781484
///
@@ -2205,6 +2211,9 @@ class TargetTransformInfo::Concept {
22052211
const APInt &DemandedDstElts,
22062212
TTI::TargetCostKind CostKind) = 0;
22072213

2214+
virtual InstructionCost
2215+
getInsertExtractValueCost(unsigned Opcode, TTI::TargetCostKind CostKind) = 0;
2216+
22082217
virtual InstructionCost
22092218
getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
22102219
unsigned AddressSpace, TTI::TargetCostKind CostKind,
@@ -2926,6 +2935,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
29262935
return Impl.getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
29272936
DemandedDstElts, CostKind);
29282937
}
2938+
/// Type-erasure shim: forwards the insertvalue/extractvalue cost query to the
/// wrapped implementation object (`Impl`) and returns its result unchanged.
InstructionCost
2939+
getInsertExtractValueCost(unsigned Opcode,
2940+
TTI::TargetCostKind CostKind) override {
2941+
return Impl.getInsertExtractValueCost(Opcode, CostKind);
2942+
}
29292943
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
29302944
unsigned AddressSpace,
29312945
TTI::TargetCostKind CostKind,

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -745,6 +745,17 @@ class TargetTransformInfoImplBase {
745745
return 1;
746746
}
747747

748+
/// Default cost of an aggregate `insertvalue` / `extractvalue`.
///
/// \p Opcode is expected to be Instruction::InsertValue or
/// Instruction::ExtractValue. `extractvalue` is always modelled as free.
/// `insertvalue` costs TTI::TCC_Basic for every cost kind except
/// TTI::TCK_RecipThroughput, for which it is modelled as free.
///
/// \return A non-negative cost.
InstructionCost
getInsertExtractValueCost(unsigned Opcode,
                          TTI::TargetCostKind CostKind) const {
  // Note: The non-throughput `insertvalue` cost here is chosen to match the
  // default case of getInstructionCost() -- as prior to adding this helper
  // `insertvalue` was not handled.
  //
  // The throughput cost must not be reported as -1 ("unknown"):
  // TargetTransformInfo::getInsertExtractValueCost() asserts that the cost
  // returned from here is non-negative, so a -1 would abort assert-enabled
  // builds whenever an `insertvalue` is costed for throughput.
  if (Opcode == Instruction::InsertValue &&
      CostKind != TTI::TCK_RecipThroughput)
    return TTI::TCC_Basic;
  return TTI::TCC_Free;
}
758+
748759
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
749760
unsigned AddressSpace,
750761
TTI::TargetCostKind CostKind,
@@ -1296,9 +1307,11 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
12961307
case Instruction::PHI:
12971308
case Instruction::Switch:
12981309
return TargetTTI->getCFInstrCost(Opcode, CostKind, I);
1299-
case Instruction::ExtractValue:
13001310
case Instruction::Freeze:
13011311
return TTI::TCC_Free;
1312+
case Instruction::ExtractValue:
1313+
case Instruction::InsertValue:
1314+
return TargetTTI->getInsertExtractValueCost(Opcode, CostKind);
13021315
case Instruction::Alloca:
13031316
if (cast<AllocaInst>(U)->isStaticAlloca())
13041317
return TTI::TCC_Free;

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -416,10 +416,6 @@ class LoopVectorizationLegality {
416416
/// has a vectorized variant available.
417417
bool hasVectorCallVariants() const { return VecCallVariantsFound; }
418418

419-
/// Returns true if there is at least one function call in the loop which
420-
/// returns a struct type and needs to be vectorized.
421-
bool hasStructVectorCall() const { return StructVecCallFound; }
422-
423419
unsigned getNumStores() const { return LAI->getNumStores(); }
424420
unsigned getNumLoads() const { return LAI->getNumLoads(); }
425421

@@ -639,12 +635,6 @@ class LoopVectorizationLegality {
639635
/// the use of those function variants.
640636
bool VecCallVariantsFound = false;
641637

642-
/// If we find a call (to be vectorized) that returns a struct type, record
643-
/// that so we can bail out until this is supported.
644-
/// TODO: Remove this flag once vectorizing calls with struct returns is
645-
/// supported.
646-
bool StructVecCallFound = false;
647-
648638
/// Keep track of all the countable and uncountable exiting blocks if
649639
/// the exact backedge taken count is not computable.
650640
SmallVector<BasicBlock *, 4> CountableExitingBlocks;

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1113,6 +1113,16 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
11131113
return Cost;
11141114
}
11151115

1116+
// Public entry point for the cost of an aggregate insert/extract.
// Checks (via assert) that Opcode is InsertValue or ExtractValue, delegates to
// the implementation object (TTIImpl), and checks (via assert) that the
// implementation did not report a negative cost before returning it.
InstructionCost TargetTransformInfo::getInsertExtractValueCost(
1117+
unsigned Opcode, TTI::TargetCostKind CostKind) const {
1118+
assert((Opcode == Instruction::InsertValue ||
1119+
Opcode == Instruction::ExtractValue) &&
1120+
"Expecting Opcode to be insertvalue/extractvalue.");
1121+
InstructionCost Cost = TTIImpl->getInsertExtractValueCost(Opcode, CostKind);
1122+
// Implementations must never hand back a negative ("unknown") cost here.
assert(Cost >= 0 && "TTI should not produce negative costs!");
1123+
return Cost;
1124+
}
1125+
11161126
InstructionCost TargetTransformInfo::getReplicationShuffleCost(
11171127
Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts,
11181128
TTI::TargetCostKind CostKind) const {

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -954,23 +954,16 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
954954
if (CI && !VFDatabase::getMappings(*CI).empty())
955955
VecCallVariantsFound = true;
956956

957-
auto CanWidenInstructionTy = [this](Instruction const &Inst) {
957+
auto CanWidenInstructionTy = [](Instruction const &Inst) {
958958
Type *InstTy = Inst.getType();
959959
if (!isa<StructType>(InstTy))
960960
return canVectorizeTy(InstTy);
961961

962962
// For now, we only recognize struct values returned from calls where
963963
// all users are extractvalue as vectorizable. All element types of the
964964
// struct must be types that can be widened.
965-
if (isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) &&
966-
all_of(Inst.users(), IsaPred<ExtractValueInst>)) {
967-
// TODO: Remove the `StructVecCallFound` flag once vectorizing calls
968-
// with struct returns is supported.
969-
StructVecCallFound = true;
970-
return true;
971-
}
972-
973-
return false;
965+
return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) &&
966+
all_of(Inst.users(), IsaPred<ExtractValueInst>);
974967
};
975968

976969
// Check that the instruction return type is vectorizable.

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 59 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -2350,7 +2350,9 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
23502350
VPReplicateRecipe *RepRecipe,
23512351
const VPLane &Lane,
23522352
VPTransformState &State) {
2353-
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2353+
assert((!Instr->getType()->isAggregateType() ||
2354+
canVectorizeTy(Instr->getType())) &&
2355+
"Expected vectorizable or non-aggregate type.");
23542356

23552357
// Does this instruction return a value ?
23562358
bool IsVoidRetTy = Instr->getType()->isVoidTy();
@@ -2855,10 +2857,10 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
28552857
return ScalarCallCost;
28562858
}
28572859

2858-
static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
2859-
if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2860-
return Elt;
2861-
return VectorType::get(Elt, VF);
2860+
static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
2861+
if (VF.isScalar() || !canVectorizeTy(Ty))
2862+
return Ty;
2863+
return toVectorizedTy(Ty, VF);
28622864
}
28632865

28642866
InstructionCost
@@ -3605,13 +3607,15 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
36053607
}
36063608
}
36073609

3608-
// ExtractValue instructions must be uniform, because the operands are
3609-
// known to be loop-invariant.
36103610
if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3611-
assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3612-
"Expected aggregate value to be loop invariant");
3613-
AddToWorklistIfAllowed(EVI);
3614-
continue;
3611+
if (IsOutOfScope(EVI->getAggregateOperand())) {
3612+
AddToWorklistIfAllowed(EVI);
3613+
continue;
3614+
}
3615+
// Only ExtractValue instructions where the aggregate value comes from a
3616+
// call are allowed to be non-uniform.
3617+
assert(isa<CallInst>(EVI->getAggregateOperand()) &&
3618+
"Expected aggregate value to be call return value");
36153619
}
36163620

36173621
// If there's no pointer operand, there's nothing to do.
@@ -4492,8 +4496,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
44924496
llvm_unreachable("unhandled recipe");
44934497
}
44944498

4495-
auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4496-
Type *VectorTy = toVectorTy(ScalarTy, VF);
4499+
auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
44974500
unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
44984501
if (!NumLegalParts)
44994502
return false;
@@ -4505,7 +4508,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
45054508
// explicitly ask TTI about the register class uses for each part.
45064509
return NumLegalParts <= VF.getKnownMinValue();
45074510
}
4508-
// Two or more parts that share a register - are vectorized.
4511+
// Two or more elements that share a register - are vectorized.
45094512
return NumLegalParts < VF.getKnownMinValue();
45104513
};
45114514

@@ -4524,7 +4527,8 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
45244527
Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
45254528
if (!Visited.insert({ScalarTy}).second)
45264529
continue;
4527-
if (WillWiden(ScalarTy))
4530+
Type *WideTy = toVectorizedTy(ScalarTy, VF);
4531+
if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
45284532
return true;
45294533
}
45304534
}
@@ -5481,10 +5485,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
54815485
// Compute the scalarization overhead of needed insertelement instructions
54825486
// and phi nodes.
54835487
if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5484-
ScalarCost += TTI.getScalarizationOverhead(
5485-
cast<VectorType>(toVectorTy(I->getType(), VF)),
5486-
APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5487-
/*Extract*/ false, CostKind);
5488+
Type *WideTy = toVectorizedTy(I->getType(), VF);
5489+
for (Type *VectorTy : getContainedTypes(WideTy)) {
5490+
ScalarCost += TTI.getScalarizationOverhead(
5491+
cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
5492+
/*Insert=*/true,
5493+
/*Extract=*/false, CostKind);
5494+
}
54885495
ScalarCost +=
54895496
VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
54905497
}
@@ -5495,15 +5502,18 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
54955502
// overhead.
54965503
for (Use &U : I->operands())
54975504
if (auto *J = dyn_cast<Instruction>(U.get())) {
5498-
assert(VectorType::isValidElementType(J->getType()) &&
5505+
assert(canVectorizeTy(J->getType()) &&
54995506
"Instruction has non-scalar type");
55005507
if (CanBeScalarized(J))
55015508
Worklist.push_back(J);
55025509
else if (needsExtract(J, VF)) {
5503-
ScalarCost += TTI.getScalarizationOverhead(
5504-
cast<VectorType>(toVectorTy(J->getType(), VF)),
5505-
APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5506-
/*Extract*/ true, CostKind);
5510+
Type *WideTy = toVectorizedTy(J->getType(), VF);
5511+
for (Type *VectorTy : getContainedTypes(WideTy)) {
5512+
ScalarCost += TTI.getScalarizationOverhead(
5513+
cast<VectorType>(VectorTy),
5514+
APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5515+
/*Extract*/ true, CostKind);
5516+
}
55075517
}
55085518
}
55095519

@@ -5982,13 +5992,17 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
59825992
return 0;
59835993

59845994
InstructionCost Cost = 0;
5985-
Type *RetTy = toVectorTy(I->getType(), VF);
5995+
Type *RetTy = toVectorizedTy(I->getType(), VF);
59865996
if (!RetTy->isVoidTy() &&
5987-
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5988-
Cost += TTI.getScalarizationOverhead(
5989-
cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
5990-
/*Insert*/ true,
5991-
/*Extract*/ false, CostKind);
5997+
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) {
5998+
5999+
for (Type *VectorTy : getContainedTypes(RetTy)) {
6000+
Cost += TTI.getScalarizationOverhead(
6001+
cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()),
6002+
/*Insert=*/true,
6003+
/*Extract=*/false, CostKind);
6004+
}
6005+
}
59926006

59936007
// Some targets keep addresses scalar.
59946008
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6246,9 +6260,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
62466260

62476261
bool MaskRequired = Legal->isMaskRequired(CI);
62486262
// Compute corresponding vector type for return value and arguments.
6249-
Type *RetTy = toVectorTy(ScalarRetTy, VF);
6263+
Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
62506264
for (Type *ScalarTy : ScalarTys)
6251-
Tys.push_back(toVectorTy(ScalarTy, VF));
6265+
Tys.push_back(toVectorizedTy(ScalarTy, VF));
62526266

62536267
// An in-loop reduction using an fmuladd intrinsic is a special case;
62546268
// we don't want the normal cost for that intrinsic.
@@ -6438,7 +6452,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
64386452
HasSingleCopyAfterVectorization(I, VF));
64396453
VectorTy = RetTy;
64406454
} else
6441-
VectorTy = toVectorTy(RetTy, VF);
6455+
VectorTy = toVectorizedTy(RetTy, VF);
64426456

64436457
if (VF.isVector() && VectorTy->isVectorTy() &&
64446458
!TTI.getNumberOfParts(VectorTy))
@@ -8560,7 +8574,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
85608574
case Instruction::Shl:
85618575
case Instruction::Sub:
85628576
case Instruction::Xor:
8563-
case Instruction::Freeze:
8577+
case Instruction::Freeze: {
85648578
SmallVector<VPValue *> NewOps(Operands);
85658579
if (Instruction::isBinaryOp(I->getOpcode())) {
85668580
// The legacy cost model uses SCEV to check if some of the operands are
@@ -8585,6 +8599,16 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
85858599
NewOps[1] = GetConstantViaSCEV(NewOps[1]);
85868600
}
85878601
return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8602+
}
8603+
case Instruction::ExtractValue: {
8604+
SmallVector<VPValue *> NewOps(Operands);
8605+
Type *I32Ty = IntegerType::getInt32Ty(I->getContext());
8606+
auto *EVI = cast<ExtractValueInst>(I);
8607+
assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
8608+
unsigned Idx = EVI->getIndices()[0];
8609+
NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false)));
8610+
return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8611+
}
85888612
};
85898613
}
85908614

@@ -9865,7 +9889,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
98659889
VectorType::get(UI->getType(), State.VF));
98669890
State.set(this, Poison);
98679891
}
9868-
State.packScalarIntoVectorValue(this, *State.Lane);
9892+
State.packScalarIntoVectorizedValue(this, *State.Lane);
98699893
}
98709894
return;
98719895
}
@@ -10382,13 +10406,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1038210406
return false;
1038310407
}
1038410408

10385-
if (LVL.hasStructVectorCall()) {
10386-
reportVectorizationFailure("Auto-vectorization of calls that return struct "
10387-
"types is not yet supported",
10388-
"StructCallVectorizationUnsupported", ORE, L);
10389-
return false;
10390-
}
10391-
1039210409
// Entrance to the VPlan-native vectorization path. Outer loops are processed
1039310410
// here. They may require CFG and instruction level transformations before
1039410411
// even evaluating whether vectorization is profitable. Since we cannot modify

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -334,10 +334,10 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
334334
} else {
335335
// Initialize packing with insertelements to start from undef.
336336
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
337-
Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
337+
Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
338338
set(Def, Undef);
339339
for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
340-
packScalarIntoVectorValue(Def, Lane);
340+
packScalarIntoVectorizedValue(Def, Lane);
341341
VectorValue = get(Def);
342342
}
343343
Builder.restoreIP(OldIP);
@@ -390,13 +390,24 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) {
390390
Builder.SetCurrentDebugLocation(DIL);
391391
}
392392

393-
void VPTransformState::packScalarIntoVectorValue(VPValue *Def,
394-
const VPLane &Lane) {
393+
void VPTransformState::packScalarIntoVectorizedValue(VPValue *Def,
394+
const VPLane &Lane) {
395395
Value *ScalarInst = get(Def, Lane);
396-
Value *VectorValue = get(Def);
397-
VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
398-
Lane.getAsRuntimeExpr(Builder, VF));
399-
set(Def, VectorValue);
396+
Value *WideValue = get(Def);
397+
Value *LaneExpr = Lane.getAsRuntimeExpr(Builder, VF);
398+
if (auto *StructTy = dyn_cast<StructType>(WideValue->getType())) {
399+
// We must handle each element of a vectorized struct type.
400+
for (unsigned I = 0, E = StructTy->getNumElements(); I != E; I++) {
401+
Value *ScalarValue = Builder.CreateExtractValue(ScalarInst, I);
402+
Value *VectorValue = Builder.CreateExtractValue(WideValue, I);
403+
VectorValue =
404+
Builder.CreateInsertElement(VectorValue, ScalarValue, LaneExpr);
405+
WideValue = Builder.CreateInsertValue(WideValue, VectorValue, I);
406+
}
407+
} else {
408+
WideValue = Builder.CreateInsertElement(WideValue, ScalarInst, LaneExpr);
409+
}
410+
set(Def, WideValue);
400411
}
401412

402413
BasicBlock *

0 commit comments

Comments
 (0)