Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions llvm/lib/Analysis/VectorUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
case Intrinsic::is_fpclass:
case Intrinsic::vp_is_fpclass:
case Intrinsic::powi:
case Intrinsic::vector_extract:
return (ScalarOpdIdx == 1);
case Intrinsic::smul_fix:
case Intrinsic::smul_fix_sat:
Expand Down Expand Up @@ -200,6 +201,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
case Intrinsic::vp_llrint:
case Intrinsic::ucmp:
case Intrinsic::scmp:
case Intrinsic::vector_extract:
return OpdIdx == -1 || OpdIdx == 0;
case Intrinsic::modf:
case Intrinsic::sincos:
Expand Down
11 changes: 9 additions & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4214,9 +4214,16 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
}
}
}
[[fallthrough]];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I don't think this is right because previously when falling through it was hitting the VPInstruction::ExplicitVectorLength case below and adding on the cost of the instruction, i.e. C += VPI->cost(VF, CostCtx).

If you change this to

  C += VPI->cost(VF, CostCtx);
  break;

then it will be the same as before.

C += VPI->cost(VF, CostCtx);
break;
}
case VPInstruction::ActiveLaneMask: {
unsigned Multiplier =
cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue())
->getZExtValue();
C += VPI->cost(VF * Multiplier, CostCtx);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why multiply VF * Multiplier here? Looks like you also multiply it on computeCost?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is also multiplied in computeCost; I have removed these changes.

break;
}
case VPInstruction::ActiveLaneMask:
case VPInstruction::ExplicitVectorLength:
C += VPI->cost(VF, CostCtx);
break;
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -975,6 +975,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
Not,
SLPLoad,
SLPStore,
// Creates a mask where each lane is active (true) whilst the current
// counter (first operand + index) is less than the second operand. i.e.
// mask[i] = icmp ult (op0 + i), op1
// The size of the mask returned is VF * Multiplier (UF, third op).
ActiveLaneMask,
ExplicitVectorLength,
CalculateTripCountMinusVF,
Expand Down Expand Up @@ -1999,6 +2003,9 @@ class LLVM_ABI_FOR_TEST VPHeaderPHIRecipe : public VPSingleDefRecipe,
return getOperand(1);
}

/// Update the incoming value from the loop backedge.
void setBackedgeValue(VPValue *V) { setOperand(1, V); }

/// Returns the backedge value as a recipe. The backedge value is guaranteed
/// to be a recipe.
virtual VPRecipeBase &getBackedgeRecipe() {
Expand Down
8 changes: 4 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -316,10 +316,10 @@ m_ExtractLastElement(const Op0_t &Op0) {
return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
}

template <typename Op0_t, typename Op1_t>
inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t>
m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1);
/// Match a VPInstruction with opcode VPInstruction::ActiveLaneMask taking
/// exactly three operands (index, trip count, and the VF multiplier).
template <typename IdxTy, typename TCTy, typename MulTy>
inline VPInstruction_match<VPInstruction::ActiveLaneMask, IdxTy, TCTy, MulTy>
m_ActiveLaneMask(const IdxTy &Op0, const TCTy &Op1, const MulTy &Op2) {
  return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2);
}

template <typename Op0_t, typename Op1_t>
Expand Down
10 changes: 7 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,6 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case Instruction::ICmp:
case Instruction::FCmp:
case Instruction::Store:
case VPInstruction::ActiveLaneMask:
case VPInstruction::BranchOnCount:
case VPInstruction::ComputeReductionResult:
case VPInstruction::FirstOrderRecurrenceSplice:
Expand All @@ -481,6 +480,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::WideIVStep:
return 2;
case Instruction::Select:
case VPInstruction::ActiveLaneMask:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a comment to the ActiveLaneMask definition in VPlan.h to document the arguments

case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ReductionStartVector:
return 3;
Expand Down Expand Up @@ -620,7 +620,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
Name);

auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
auto *PredTy = VectorType::get(Int1Ty, State.VF);
auto PredTy = VectorType::get(
Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
->getZExtValue());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that given we're now potentially generating a different mask we should update the cost for VPInstruction::ActiveLaneMask in VPInstruction::computeCost if using a wider mask. Again, it's not going to make much difference because the wider mask is generated after the cost model anyway, but good to have it for completeness.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've included the extra operand where we calculate the cost of ActiveLaneMask, both in VPInstruction::computeCost and also in LoopVectorizationPlanner::selectVectorizationFactor. As you mentioned, at this point the multiplier is always 1 and so there were no changes to the existing cost model tests for ActiveLaneMask.

return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
{PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, Name);
Expand Down Expand Up @@ -1091,7 +1093,9 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
case VPInstruction::ActiveLaneMask: {
Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
unsigned Multiplier =
cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue();
Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
{ArgTy, ArgTy});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
Expand Down
129 changes: 117 additions & 12 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@
using namespace llvm;
using namespace VPlanPatternMatch;

cl::opt<bool> EnableWideActiveLaneMask(
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
cl::desc("Enable use of wide get active lane mask instructions"));

bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
VPlanPtr &Plan,
function_ref<const InductionDescriptor *(PHINode *)>
Expand Down Expand Up @@ -1467,6 +1471,102 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
}

/// Try to replace multiple active lane masks used for control flow with
/// a single, wide active lane mask instruction followed by multiple
/// extract subvector intrinsics. This applies to the active lane mask
/// instructions both in the loop and in the preheader.
/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
new extracts from the first active lane mask, which has its last
/// operand (multiplier) set to UF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
unsigned UF) {
// Only worthwhile when interleaving (UF > 1) with a vector VF, and currently
// gated behind the -enable-wide-lane-mask flag rather than being cost-driven.
if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
return false;

VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC this needs to be cost-driven, to only be done when the wider active-lane-mask is profitable?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, to enable this without passing the extra flag we will need to decide whether it's profitable based on the cost of the wider mask, taking into account the features available on the target.

VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
auto *Term = &ExitingVPBB->back();

// Bail out unless the latch terminator has the expected shape:
// BranchOnCond(Not(ActiveLaneMask(...))).
using namespace llvm::VPlanPatternMatch;
if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
m_VPValue(), m_VPValue(), m_VPValue())))))
return false;

auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
LLVMContext &Ctx = Plan.getContext();

// Emit one llvm.vector.extract per unroll part, each extracting a subvector
// of the wide mask \p ALM at element offset VF * Part. The extracts are
// inserted directly after \p ALM and returned via \p Extracts (indexed by
// part).
auto ExtractFromALM = [&](VPInstruction *ALM,
SmallVectorImpl<VPValue *> &Extracts) {
DebugLoc DL = ALM->getDebugLoc();
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<VPValue *> Ops;
Ops.append({ALM, Plan.getOrAddLiveIn(
ConstantInt::get(IntegerType::getInt64Ty(Ctx),
VF.getKnownMinValue() * Part))});
auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
IntegerType::getInt1Ty(Ctx), DL);
Extracts[Part] = Ext;
Ext->insertAfter(ALM);
}
};

// Create a list of each active lane mask phi, ordered by unroll part.
// The part is recovered from the CanonicalIVIncrementForPart feeding the
// phi's backedge ActiveLaneMask; part 0 has no such increment.
SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
for (VPRecipeBase &R : Header->phis()) {
auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
if (!Phi)
continue;
VPValue *Index = nullptr;
match(Phi->getBackedgeValue(),
m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
assert(Index && "Expected index from ActiveLaneMask instruction");

auto *II = dyn_cast<VPInstruction>(Index);
if (II && II->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) {
auto Part = cast<ConstantInt>(II->getOperand(1)->getLiveInIRValue());
Phis[Part->getZExtValue()] = Phi;
} else
// Anything other than a CanonicalIVIncrementForPart is part 0
Phis[0] = Phi;
}

assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
"Expected one VPActiveLaneMaskPHIRecipe for each unroll part");

// The part-0 phi's incoming values are the preheader and in-loop
// ActiveLaneMasks; these two are the ones widened below.
auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());

assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
"Expected incoming values of Phi to be ActiveLaneMasks");

// When using wide lane masks, the return type of the get.active.lane.mask
// intrinsic is VF x UF (last operand).
VPValue *ALMMultiplier =
Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
EntryALM->setOperand(2, ALMMultiplier);
LoopALM->setOperand(2, ALMMultiplier);

// Create UF x extract vectors and insert into preheader.
SmallVector<VPValue *> EntryExtracts(UF);
ExtractFromALM(EntryALM, EntryExtracts);

// Create UF x extract vectors and insert before the loop compare & branch,
// updating the compare to use the first extract.
SmallVector<VPValue *> LoopExtracts(UF);
ExtractFromALM(LoopALM, LoopExtracts);
VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
Not->setOperand(0, LoopExtracts[0]);

// Update the incoming values of active lane mask phis.
for (unsigned Part = 0; Part < UF; ++Part) {
Phis[Part]->setStartValue(EntryExtracts[Part]);
Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
}

return true;
}

/// Try to simplify the branch condition of \p Plan. This may restrict the
/// resulting plan to \p BestVF and \p BestUF.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
Expand All @@ -1478,8 +1578,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
VPValue *Cond;
ScalarEvolution &SE = *PSE.getSE();
if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
match(Term, m_BranchOnCond(
m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {
match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
m_VPValue(), m_VPValue(), m_VPValue()))))) {
// Try to simplify the branch condition if TC <= VF * UF when the latch
// terminator is BranchOnCount or BranchOnCond where the input is
// Not(ActiveLaneMask).
Expand Down Expand Up @@ -1558,8 +1658,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");

bool MadeChange =
simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);

if (MadeChange) {
Expand Down Expand Up @@ -2042,9 +2142,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
"index.part.next");

// Create the active lane mask instruction in the VPlan preheader.
auto *EntryALM =
Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
DL, "active.lane.mask.entry");
VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
{EntryIncrement, TC, ALMMultiplier}, DL,
"active.lane.mask.entry");

// Now create the ActiveLaneMaskPhi recipe in the main loop using the
// preheader ActiveLaneMask instruction.
Expand All @@ -2059,8 +2161,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
{IncrementValue}, {false, false}, DL);
auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
{InLoopIncrement, TripCount}, DL,
"active.lane.mask.next");
{InLoopIncrement, TripCount, ALMMultiplier},
DL, "active.lane.mask.next");
LaneMaskPhi->addOperand(ALM);

// Replace the original terminator with BranchOnCond. We have to invert the
Expand Down Expand Up @@ -2139,9 +2241,12 @@ void VPlanTransforms::addActiveLaneMask(
Plan, DataAndControlFlowWithoutRuntimeCheck);
} else {
VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
{WideCanonicalIV, Plan.getTripCount()}, nullptr,
"active.lane.mask");
VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
LaneMask =
B.createNaryOp(VPInstruction::ActiveLaneMask,
{WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
nullptr, "active.lane.mask");
}

// Walk users of WideCanonicalIV and replace the header mask of the form
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
VPValue *A, *B;
using namespace VPlanPatternMatch;

if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B))))
if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_SpecificInt(1))))
return B == Plan.getTripCount() &&
(match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()),
m_SpecificInt(1),
Expand Down
Loading