104 changes: 57 additions & 47 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7992,9 +7992,9 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
});
}

VPWidenMemoryRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
VFRange &Range) {
VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
VFRange &Range) {
Instruction *I = VPI->getUnderlyingInstr();
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
"Must be called with either a load or store");

@@ -8016,7 +8016,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,

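// The mask, if one is required, has already been attached to the
// VPInstruction as its trailing operand and is retrieved via VPI->getMask().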
VPValue *Mask = nullptr;
if (Legal->isMaskRequired(I))
Mask = getBlockInMask(Builder.getInsertBlock());
Mask = VPI->getMask();

// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
@@ -8026,7 +8026,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
bool Consecutive =
Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
VPValue *Ptr = isa<LoadInst>(I) ? VPI->getOperand(0) : VPI->getOperand(1);
if (Consecutive) {
auto *GEP = dyn_cast<GetElementPtrInst>(
Ptr->getUnderlyingValue()->stripPointerCasts());
@@ -8055,9 +8055,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
VPIRMetadata(*Load, LVer), I->getDebugLoc());

StoreInst *Store = cast<StoreInst>(I);
return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
Reverse, VPIRMetadata(*Store, LVer),
I->getDebugLoc());
return new VPWidenStoreRecipe(*Store, Ptr, VPI->getOperand(0), Mask,
Consecutive, Reverse,
VPIRMetadata(*Store, LVer), I->getDebugLoc());
}

/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
@@ -8136,9 +8136,9 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
return nullptr;
}

VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
ArrayRef<VPValue *> Operands,
VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
VFRange &Range) {
CallInst *CI = cast<CallInst>(VPI->getUnderlyingInstr());
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[this, CI](ElementCount VF) {
return CM.isScalarWithPredication(CI, VF);
@@ -8155,7 +8155,8 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
ID == Intrinsic::experimental_noalias_scope_decl))
return nullptr;

SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
SmallVector<VPValue *> Operands(VPI->operands());
SmallVector<VPValue *, 4> Ops(ArrayRef(Operands).take_front(CI->arg_size()));

// Is it beneficial to perform intrinsic call compared to lib call?
bool ShouldUseVectorIntrinsic =
@@ -8201,6 +8202,9 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
},
Range);
if (ShouldUseVectorCall) {
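// Peel off the trailing mask operand, if present; it is either used at the
// variant's mask position below or replaced by an all-true mask.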
VPValue *Mask = nullptr;
if (VPI->isMasked())
Mask = Operands.pop_back_val();
if (MaskPos.has_value()) {
// We have 2 cases that would require a mask:
// 1) The block needs to be predicated, either due to a conditional
@@ -8209,10 +8213,7 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
// 2) No mask is required for the block, but the only available
// vector variant at this VF requires a mask, so we synthesize an
// all-true mask.
VPValue *Mask = nullptr;
if (Legal->isMaskRequired(CI))
Mask = getBlockInMask(Builder.getInsertBlock());
else
if (!Legal->isMaskRequired(CI))
Mask = Plan.getOrAddLiveIn(
ConstantInt::getTrue(IntegerType::getInt1Ty(CI->getContext())));

Expand Down Expand Up @@ -8240,20 +8241,22 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
Range);
}

VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
ArrayRef<VPValue *> Operands) {
switch (I->getOpcode()) {
VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPInstruction *VPI) {
ArrayRef<VPValue *> Operands(VPI->operands());
switch (VPI->getOpcode()) {
default:
return nullptr;
case Instruction::SDiv:
case Instruction::UDiv:
case Instruction::SRem:
case Instruction::URem: {
VPValue *Mask = nullptr;
if (VPI->isMasked()) {
Mask = Operands.back();
Operands = Operands.drop_back();
}
// If not provably safe, use a select to form a safe divisor before widening the
// div/rem operation itself. Otherwise fall through to general handling below.
if (CM.isPredicatedInst(I)) {
SmallVector<VPValue *> Ops(Operands);
VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
VPValue *One =
Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
@@ -8318,24 +8321,23 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
};
}

VPHistogramRecipe *
VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
ArrayRef<VPValue *> Operands) {
VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
VPInstruction *VPI) {
// FIXME: Support other operations.
unsigned Opcode = HI->Update->getOpcode();
assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
"Histogram update operation must be an Add or Sub");

SmallVector<VPValue *, 3> HGramOps;
// Bucket address.
HGramOps.push_back(Operands[1]);
HGramOps.push_back(VPI->getOperand(1));
// Increment value.
HGramOps.push_back(getVPValueOrAddLiveIn(HI->Update->getOperand(1)));

// In case of predicated execution (due to tail-folding, or conditional
// execution, or both), pass the relevant mask.
if (Legal->isMaskRequired(HI->Store))
HGramOps.push_back(getBlockInMask(Builder.getInsertBlock()));
HGramOps.push_back(VPI->getMask());

return new VPHistogramRecipe(Opcode, HGramOps, HI->Store->getDebugLoc());
}
@@ -8567,6 +8569,10 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
return PhiRecipe;
}

auto *VPI = cast<VPInstruction>(R);
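// Strip the optional trailing mask from the operand list; recipes that need
// it retrieve it directly via VPI->getMask().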
if (VPI->isMasked())
Operands.pop_back();

if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
cast<TruncInst>(Instr), Operands, Range)))
return Recipe;
@@ -8576,18 +8582,19 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
[&](ElementCount VF) { return VF.isScalar(); }, Range))
return nullptr;

if (auto *CI = dyn_cast<CallInst>(Instr))
return tryToWidenCall(CI, Operands, Range);
if (VPI->getOpcode() == Instruction::Call)
return tryToWidenCall(VPI, Range);

if (StoreInst *SI = dyn_cast<StoreInst>(Instr))
if (auto HistInfo = Legal->getHistogramInfo(SI))
return tryToWidenHistogram(*HistInfo, Operands);
return tryToWidenHistogram(*HistInfo, VPI);

if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range);
if (VPI->getOpcode() == Instruction::Load ||
VPI->getOpcode() == Instruction::Store)
return tryToWidenMemory(VPI, Range);

if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
return tryToCreatePartialReduction(VPI, ScaleFactor.value());

if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8600,51 +8607,48 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
}

if (auto *CI = dyn_cast<CastInst>(Instr)) {
return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
*CI);
return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0),
CI->getType(), *CI);
}

return tryToWiden(Instr, Operands);
return tryToWiden(Instr, VPI);
}

VPRecipeBase *
VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
ArrayRef<VPValue *> Operands,
VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction,
unsigned ScaleFactor) {
assert(Operands.size() == 2 &&
"Unexpected number of operands for partial reduction");

VPValue *BinOp = Operands[0];
VPValue *Accumulator = Operands[1];
VPValue *BinOp = Reduction->getOperand(0);
VPValue *Accumulator = Reduction->getOperand(1);
VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
isa<VPPartialReductionRecipe>(BinOpRecipe))
std::swap(BinOp, Accumulator);

unsigned ReductionOpcode = Reduction->getOpcode();
auto *ReductionI = Reduction->getUnderlyingInstr();
if (ReductionOpcode == Instruction::Sub) {
auto *const Zero = ConstantInt::get(Reduction->getType(), 0);
auto *const Zero = ConstantInt::get(ReductionI->getType(), 0);
SmallVector<VPValue *, 2> Ops;
Ops.push_back(Plan.getOrAddLiveIn(Zero));
Ops.push_back(BinOp);
BinOp = new VPWidenRecipe(*Reduction, Ops);
BinOp = new VPWidenRecipe(*ReductionI, Ops);
Builder.insert(BinOp->getDefiningRecipe());
ReductionOpcode = Instruction::Add;
}

VPValue *Cond = nullptr;
if (CM.blockNeedsPredicationForAnyReason(Reduction->getParent())) {
if (Reduction->isMasked()) {
assert((ReductionOpcode == Instruction::Add ||
ReductionOpcode == Instruction::Sub) &&
"Expected an ADD or SUB operation for predicated partial "
"reductions (because the neutral element in the mask is zero)!");
Cond = getBlockInMask(Builder.getInsertBlock());
Cond = Reduction->getMask();
VPValue *Zero =
Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
Plan.getOrAddLiveIn(ConstantInt::get(ReductionI->getType(), 0));
BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
}
return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
ScaleFactor, Reduction);
ScaleFactor, ReductionI);
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -9067,8 +9071,11 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
// Only create recipe for the final invariant store of the reduction.
if (Legal->isInvariantStoreOfReduction(SI)) {
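// Drop the optional trailing mask; the final store of the reduction
// result is emitted unconditionally in the middle block.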
auto Ops = R.operands();
if (cast<VPInstruction>(R).isMasked())
Ops = drop_end(Ops);
auto *Recipe =
new VPReplicateRecipe(SI, R.operands(), true /* IsUniform */,
new VPReplicateRecipe(SI, Ops, true /* IsUniform */,
nullptr /*Mask*/, VPIRMetadata(*SI, LVer));
Recipe->insertBefore(*MiddleVPBB, MBIP);
}
Expand All @@ -9080,6 +9087,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
RecipeBuilder.tryToCreateWidenRecipe(SingleDef, Range);
if (!Recipe) {
SmallVector<VPValue *, 4> Operands(R.operands());
if (cast<VPInstruction>(R).isMasked())
Operands.pop_back();
Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range);
}

16 changes: 6 additions & 10 deletions llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -92,12 +92,10 @@ class VPRecipeBuilder {
/// Range. The function should not be called for memory instructions or calls.
bool shouldWiden(Instruction *I, VFRange &Range) const;

/// Check if the load or store instruction \p I should widened for \p
/// Check if the load or store instruction \p VPI should be widened for \p
/// Range.Start and potentially masked. Such instructions are handled by a
/// recipe that takes an additional VPInstruction for the mask.
VPWidenMemoryRecipe *tryToWidenMemory(Instruction *I,
ArrayRef<VPValue *> Operands,
VFRange &Range);
VPWidenMemoryRecipe *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);

/// Check if an induction recipe should be constructed for \p Phi. If so build
/// and return it. If not, return null.
@@ -114,20 +112,19 @@ class VPRecipeBuilder {
/// Handle call instructions. If the call underlying \p VPI can be widened
/// for \p Range.Start, return a new VPWidenCallRecipe or
/// VPWidenIntrinsicRecipe. Range.End may be decreased to ensure same decision
/// from \p Range.Start to \p Range.End.
VPSingleDefRecipe *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands,
VFRange &Range);
VPSingleDefRecipe *tryToWidenCall(VPInstruction *VPI, VFRange &Range);

/// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
/// if it can, using the operands (and optional mask) of \p VPI. The function
/// should only be called if the cost-model indicates that widening should be
/// performed.
VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef<VPValue *> Operands);
VPWidenRecipe *tryToWiden(Instruction *I, VPInstruction *VPI);

/// Makes Histogram count operations safe for vectorization, by emitting a
/// llvm.experimental.vector.histogram.add intrinsic in place of the
/// Load + Add|Sub + Store operations that perform the histogram in the
/// original scalar loop.
VPHistogramRecipe *tryToWidenHistogram(const HistogramInfo *HI,
ArrayRef<VPValue *> Operands);
VPInstruction *VPI);

/// Examines reduction operations to see if the target can use a cheaper
/// operation with a wider per-iteration input VF and narrower PHI VF.
@@ -170,8 +167,7 @@ class VPRecipeBuilder {

/// Create and return a partial reduction recipe for a reduction instruction
/// along with binary operation and reduction phi operands.
VPRecipeBase *tryToCreatePartialReduction(Instruction *Reduction,
ArrayRef<VPValue *> Operands,
VPRecipeBase *tryToCreatePartialReduction(VPInstruction *Reduction,
unsigned ScaleFactor);

/// Set the recipe created for given ingredient.
42 changes: 42 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -967,6 +967,13 @@ class VPInstruction : public VPRecipeWithIRFlags,
/// value for lane \p Lane.
Value *generatePerLane(VPTransformState &State, const VPLane &Lane);

/// Return the number of operands determined by the opcode of the
/// VPInstruction. Returns -1u if the number of operands cannot be determined
/// directly by the opcode. Used by isMasked() and needsMask(), so it must be
/// available in all build configurations.
unsigned getNumOperandsForOpcode() const;

public:
VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL = {},
const Twine &Name = "")
@@ -1029,6 +1036,41 @@
}
}

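/// Returns true if this VPInstruction carries an optional mask as its
/// trailing operand, i.e. it has one operand more than its opcode requires.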
bool isMasked() const {
return getNumOperandsForOpcode() + 1 == getNumOperands();
}

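/// Returns true if this VPInstruction's opcode can take an optional mask
/// operand: its operand count is fixed and it is either a div/rem (which may
/// need predication) or may read from or write to memory or have other side
/// effects.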
bool needsMask() const {
if (getNumOperandsForOpcode() == -1u)
return false;
if (Opcode == VPInstruction::BranchOnCond ||
Opcode == VPInstruction::BranchOnCount ||
Opcode == VPInstruction::Not || Opcode == Instruction::ExtractValue ||
Opcode == Instruction::FNeg)
return false;

switch (Opcode) {
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::UDiv:
case Instruction::URem:
return true;
default:
return mayReadFromMemory() || mayWriteToMemory() || mayHaveSideEffects();
}
}

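/// Add \p Mask as the trailing operand if this VPInstruction needs a mask
/// and has not been masked yet; otherwise this is a no-op.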
void addMask(VPValue *Mask) {
if (!needsMask())
return;
assert(!isMasked() && "recipe is already masked");
addOperand(Mask);
}

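/// Return the mask operand if this VPInstruction is masked, and nullptr
/// otherwise.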
VPValue *getMask() const {
return isMasked() ? getOperand(getNumOperands() - 1) : nullptr;
}
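
// Illustrative sketch of the convention (assuming getNumOperandsForOpcode()
// reports one operand for Instruction::Load; the textual VPlan syntax is
// abbreviated): a masked load modeled as
//   EMIT %l = load %addr, %mask
// has two operands, so isMasked() returns true and getMask() returns %mask,
// while an unmasked load has a single operand and getMask() returns nullptr.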

/// Returns true if the underlying opcode may read from or write to memory.
bool opcodeMayReadOrWriteFromMemory() const;

4 changes: 3 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -56,7 +56,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
// other operands match and cache them.
auto SetResultTyFromOp = [this, R]() {
Type *ResTy = inferScalarType(R->getOperand(0));
for (unsigned Op = 1; Op != R->getNumOperands(); ++Op) {
unsigned NumOperands =
R->isMasked() ? R->getNumOperands() - 1 : R->getNumOperands();
for (unsigned Op = 1; Op != NumOperands; ++Op) {
VPValue *OtherV = R->getOperand(Op);
assert(inferScalarType(OtherV) == ResTy &&
"different types inferred for different operands");