Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions llvm/include/llvm/Analysis/LoopAccessAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -491,11 +491,12 @@ struct PointerDiffInfo {
const SCEV *SinkStart;
unsigned AccessSize;
bool NeedsFreeze;
bool WriteAfterRead;

PointerDiffInfo(const SCEV *SrcStart, const SCEV *SinkStart,
unsigned AccessSize, bool NeedsFreeze)
unsigned AccessSize, bool NeedsFreeze, bool WriteAfterRead)
: SrcStart(SrcStart), SinkStart(SinkStart), AccessSize(AccessSize),
NeedsFreeze(NeedsFreeze) {}
NeedsFreeze(NeedsFreeze), WriteAfterRead(WriteAfterRead) {}
};

/// Holds information about the memory runtime legality checks to verify
Expand Down
12 changes: 12 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,13 @@ enum class TailFoldingStyle {
DataWithEVL,
};

/// The style of runtime check to emit for pointer pairs that could not be
/// proven non-aliasing at compile time.
enum class RTCheckStyle {
  /// Create runtime checks based on the difference between two pointers.
  ScalarDifference,
  /// Form a mask based on elements which won't be a WAR or RAW hazard.
  UseSafeEltsMask,
};

struct TailFoldingInfo {
TargetLibraryInfo *TLI;
LoopVectorizationLegality *LVL;
Expand Down Expand Up @@ -1357,6 +1364,11 @@ class TargetTransformInfo {
PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
TTI::TargetCostKind CostKind) const;

/// \return true if a mask should be formed that disables lanes that could
/// alias between two pointers. The mask is created by the
/// loop_dependence_{war,raw}_mask intrinsics.
LLVM_ABI bool useSafeEltsMask() const;

/// \return The maximum interleave factor that any transform should try to
/// perform for this target. This number depends on the level of parallelism
/// and the number of execution units in the CPU.
Expand Down
2 changes: 2 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,8 @@ class TargetTransformInfoImplBase {
return InstructionCost::getInvalid();
}

/// \return true if a mask should be formed that disables lanes that could
/// alias between two pointers. Defaults to false; targets override this to
/// opt in to mask-based alias checks.
virtual bool useSafeEltsMask() const { return false; }

virtual unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }

virtual InstructionCost getArithmeticInstrCost(
Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Analysis/LoopAccessAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -511,11 +511,14 @@ bool RuntimePointerChecking::tryToCreateDiffCheck(
}
}

bool WriteAfterRead = !Src->IsWritePtr && Sink->IsWritePtr;

LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
<< "SrcStart: " << *SrcStartInt << '\n'
<< "SinkStartInt: " << *SinkStartInt << '\n');
DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
Src->NeedsFreeze || Sink->NeedsFreeze);
Src->NeedsFreeze || Sink->NeedsFreeze,
WriteAfterRead);
return true;
}

Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 +878,10 @@ InstructionCost TargetTransformInfo::getPartialReductionCost(
BinOp, CostKind);
}

// Forward to the target-specific implementation (see
// TargetTransformInfoImplBase::useSafeEltsMask for the default).
bool TargetTransformInfo::useSafeEltsMask() const {
  return TTIImpl->useSafeEltsMask();
}

unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
return TTIImpl->getMaxInterleaveFactor(VF);
}
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Analysis/VectorUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,9 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
return (ScalarOpdIdx == 2);
case Intrinsic::experimental_vp_splice:
return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
case Intrinsic::loop_dependence_war_mask:
case Intrinsic::loop_dependence_raw_mask:
return true;
default:
return false;
}
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5880,6 +5880,11 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
return Cost + 2;
}

bool AArch64TTIImpl::useSafeEltsMask() const {
  // The whilewr/whilerw instructions used to form the mask require SVE2;
  // targets with SME are also accepted here.
  return ST->hasSVE2() || ST->hasSME();
}

InstructionCost
AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
VectorType *SrcTy, ArrayRef<int> Mask,
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,8 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
TTI::TargetCostKind CostKind) const override;

bool useSafeEltsMask() const override;

bool enableOrderedReductions() const override { return true; }

InstructionCost getInterleavedMemoryOpCost(
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Utils/LoopUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2149,7 +2149,7 @@ Value *llvm::addDiffRuntimeChecks(
// Map to keep track of created compares, The key is the pair of operands for
// the compare, to allow detecting and re-using redundant compares.
DenseMap<std::pair<Value *, Value *>, Value *> SeenCompares;
for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze] : Checks) {
for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze, _] : Checks) {
Type *Ty = SinkStart->getType();
// Compute VF * IC * AccessSize.
auto *VFTimesICTimesSize =
Expand Down
107 changes: 105 additions & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
STATISTIC(LoopsAliasMasked, "Number of loops predicated with an alias mask");

static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
Expand Down Expand Up @@ -1333,6 +1334,12 @@ class LoopVectorizationCostModel {
: ChosenTailFoldingStyle->second;
}

/// Pick the runtime-check style for potentially-aliasing pointers: a mask of
/// safe elements when the target opts in via \p TTI, otherwise scalar
/// pointer-difference checks.
RTCheckStyle getRTCheckStyle(const TargetTransformInfo &TTI) const {
  return TTI.useSafeEltsMask() ? RTCheckStyle::UseSafeEltsMask
                               : RTCheckStyle::ScalarDifference;
}

/// Selects and saves TailFoldingStyle for 2 options - if IV update may
/// overflow or not.
/// \param IsScalableVF true if scalable vector factors enabled.
Expand Down Expand Up @@ -8554,6 +8561,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
bool WithoutRuntimeCheck =
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;

VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
WithoutRuntimeCheck);
}
Expand Down Expand Up @@ -8974,11 +8982,104 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
assert((!CM.OptForSize ||
CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
"Cannot SCEV check stride or overflow when optimizing for size");
VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
VPlanTransforms::attachCheckBlock(Plan, Plan.getOrAddLiveIn(SCEVCheckCond),
Plan.createVPIRBasicBlock(SCEVCheckBlock),
HasBranchWeights);
}
const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) {
VPValue *MemCheckCondVPV = Plan.getOrAddLiveIn(MemCheckCond);
VPBasicBlock *MemCheckBlockVP = Plan.createVPIRBasicBlock(MemCheckBlock);
std::optional<ArrayRef<PointerDiffInfo>> ChecksOpt =
CM.Legal->getRuntimePointerChecking()->getDiffChecks();

// Create a mask enabling safe elements for each iteration.
if (CM.getRTCheckStyle(TTI) == RTCheckStyle::UseSafeEltsMask &&
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would be good to outline to a separate function + document the transform

ChecksOpt.has_value() && ChecksOpt->size() > 0) {
ArrayRef<PointerDiffInfo> Checks = *ChecksOpt;
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *LoopBody = LoopRegion->getEntryBasicBlock();
VPBuilder Builder(MemCheckBlockVP);

/// Create a mask for each possibly-aliasing pointer pair, ANDing them if
/// there's more than one pair.
VPValue *AliasMask = nullptr;
for (PointerDiffInfo Check : Checks) {
VPValue *Sink =
vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SinkStart);
VPValue *Src =
vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SrcStart);

Type *PtrType = PointerType::getUnqual(Plan.getContext());
Sink = Builder.createScalarCast(Instruction::CastOps::IntToPtr, Sink,
PtrType, DebugLoc());
Src = Builder.createScalarCast(Instruction::CastOps::IntToPtr, Src,
PtrType, DebugLoc());

SmallVector<VPValue *, 3> Ops{
Src, Sink,
Plan.getConstantInt(IntegerType::getInt64Ty(Plan.getContext()),
Check.AccessSize)};
VPWidenIntrinsicRecipe *M = new VPWidenIntrinsicRecipe(
Check.WriteAfterRead ? Intrinsic::loop_dependence_war_mask
: Intrinsic::loop_dependence_raw_mask,
Ops, IntegerType::getInt1Ty(Plan.getContext()));
MemCheckBlockVP->appendRecipe(M);
if (AliasMask)
AliasMask = Builder.createAnd(AliasMask, M);
else
AliasMask = M;
}
assert(AliasMask && "Expected an alias mask to have been created");

// Replace uses of the loop body's active lane mask phi with an AND of the
// phi and the alias mask.
for (VPRecipeBase &R : *LoopBody) {
auto *MaskPhi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe the transform is currently incorrect. When there is no active lane mask, it would create an unpredicated vector loop that handles e.g. VF=16 lanes in a loop, even when the result of the alias.mask would say that only 3 lanes could be safely handled, for example. It would then increment the IV by 3 elements, but that doesn't mean only 3 lanes are handled each iteration. Without predication, it still handles 16 lanes each iteration.

I think there are two options here:

  1. if there is no active lane mask in the loop, we could bail out to the scalar loop if the number of lanes < VF
  2. request the use of an active lane mask in the loop for data when there is an alias mask required and the target supports the use of an active lane mask.

I wouldn't mind taking approach 1 first, so that we can already use the whilerw instructions for the alias checks in the check block, rather than a bunch of scalar instructions, and then follow this up by option 2.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we necessarily need an active-lane-mask, as long as either all recipes that need predication (memory ops, ops that are immediate UB on poison, reduction/recurrences) are already predicated (could be due to tail-folding without active-lane-mask) or we could convert them to predicated variants using the alias mask.

Also, an active-lane-mask does not necessarily mean all required recipes are predicated and use the active-lane-mask (e.g. a transform may convert a masked memory access to an unmasked one, if it is guaranteed dereferenceable for the whole loop).

So it would probably be good to check that all required recipes are masked, and to make sure their masks include AliasMask after the transform.

if (!MaskPhi)
continue;
VPInstruction *And = new VPInstruction(Instruction::BinaryOps::And,
{MaskPhi, AliasMask});
MaskPhi->replaceUsesWithIf(And, [And](VPUser &U, unsigned) {
auto *UR = dyn_cast<VPRecipeBase>(&U);
// If this is the first user, insert the AND.
if (UR && !And->getParent())
And->insertBefore(UR);
bool Replace = UR != And;
return Replace;
});
}

// An empty mask would cause an infinite loop since the induction variable
// is updated with the number of set elements in the mask. Make sure we
// don't execute the vector loop when the mask is empty.
VPInstruction *PopCount =
new VPInstruction(VPInstruction::PopCount, {AliasMask});
PopCount->insertAfter(AliasMask->getDefiningRecipe());
VPValue *Cmp =
Builder.createICmp(CmpInst::Predicate::ICMP_EQ, PopCount,
Plan.getOrAddLiveIn(ConstantInt::get(
IntegerType::get(Plan.getContext(), 64), 0)));
MemCheckCondVPV = Cmp;

// Update the IV by the number of active lanes in the mask.
auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
auto *CanonicalIVIncrement =
cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());

// Increment phi by correct amount.
VPValue *IncrementBy = PopCount;
Type *IVType = CanonicalIVPHI->getScalarType();

if (IVType->getScalarSizeInBits() < 64) {
Builder.setInsertPoint(CanonicalIVIncrement);
IncrementBy =
Builder.createScalarCast(Instruction::Trunc, IncrementBy, IVType,
CanonicalIVIncrement->getDebugLoc());
}
CanonicalIVIncrement->setOperand(1, IncrementBy);
}

// VPlan-native path does not do any analysis for runtime checks
// currently.
assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
Expand All @@ -8999,7 +9100,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
"(e.g., adding 'restrict').";
});
}
VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
VPlanTransforms::attachCheckBlock(Plan, MemCheckCondVPV, MemCheckBlockVP,
HasBranchWeights);
}
}
Expand Down Expand Up @@ -9966,6 +10067,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Optimistically generate runtime checks if they are needed. Drop them if
// they turn out to not be profitable.
if (VF.Width.isVector() || SelectedIC > 1) {
if (CM.getRTCheckStyle(*TTI) == RTCheckStyle::UseSafeEltsMask)
LoopsAliasMasked++;
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);

// Bail out early if either the SCEV or memory runtime checks are known to
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -1030,6 +1030,7 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
// during unrolling.
ExtractPenultimateElement,
LogicalAnd, // Non-poison propagating logical And.
PopCount,
// Add an offset in bytes (second operand) to a base pointer (first
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnCount:
return Type::getVoidTy(Ctx);
case VPInstruction::PopCount:
return Type::getInt64Ty(Ctx);
default:
break;
}
Expand Down
6 changes: 2 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -640,11 +640,9 @@ void VPlanTransforms::createLoopRegions(VPlan &Plan) {
// including memory overlap checks block and wrapping/unit-stride checks block.
static constexpr uint32_t CheckBypassWeights[] = {1, 127};

void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
BasicBlock *CheckBlock,
void VPlanTransforms::attachCheckBlock(VPlan &Plan, VPValue *CondVPV,
VPBasicBlock *CheckBlockVPBB,
bool AddBranchWeights) {
VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock);
VPBlockBase *VectorPH = Plan.getVectorPreheader();
VPBlockBase *ScalarPH = Plan.getScalarPreheader();
VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
Expand Down
31 changes: 30 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
case VPInstruction::ExplicitVectorLength:
case VPInstruction::AnyOf:
case VPInstruction::Not:
case VPInstruction::PopCount:
return true;
default:
return false;
Expand Down Expand Up @@ -702,6 +703,29 @@ Value *VPInstruction::generate(VPTransformState &State) {
{PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, Name);
}
// Count the number of bits set in each lane and reduce the result to a scalar
case VPInstruction::PopCount: {
Value *Op = State.get(getOperand(0));
Type *VT = Op->getType();
Value *Cnt = Op;

// i1 vectors can just use the add reduction. Bigger elements need a ctpop
// first.
if (VT->getScalarSizeInBits() > 1)
Cnt = Builder.CreateIntrinsic(Intrinsic::ctpop, {VT}, {Cnt});

auto *VecVT = cast<VectorType>(VT);
// Extend to an i8 since i1 is too small to add with
if (VecVT->getElementType()->getScalarSizeInBits() < 8) {
Cnt = Builder.CreateCast(
Instruction::ZExt, Cnt,
VectorType::get(Builder.getInt8Ty(), VecVT->getElementCount()));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can't use a hard-coded i8, as this may not be sufficient to represent the number of bits from popcount.

}

Cnt = Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, Cnt);
Cnt = Builder.CreateCast(Instruction::ZExt, Cnt, Builder.getInt64Ty());
return Cnt;
}
case VPInstruction::FirstOrderRecurrenceSplice: {
// Generate code to combine the previous and current values in vector v3.
//
Expand Down Expand Up @@ -1214,7 +1238,7 @@ bool VPInstruction::isVectorToScalar() const {
getOpcode() == VPInstruction::ComputeAnyOfResult ||
getOpcode() == VPInstruction::ComputeFindIVResult ||
getOpcode() == VPInstruction::ComputeReductionResult ||
getOpcode() == VPInstruction::AnyOf;
getOpcode() == VPInstruction::AnyOf || getOpcode() == PopCount;
}

bool VPInstruction::isSingleScalar() const {
Expand Down Expand Up @@ -1389,6 +1413,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ActiveLaneMask:
O << "active lane mask";
break;
case VPInstruction::PopCount:
O << "popcount";
break;
case VPInstruction::ExplicitVectorLength:
O << "EXPLICIT-VECTOR-LENGTH";
break;
Expand Down Expand Up @@ -4316,7 +4343,9 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
getOperand(4)->printAsOperand(O, SlotTracker);
}
}
#endif

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EMIT ";
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2385,7 +2385,7 @@ void VPlanTransforms::optimize(VPlan &Plan) {
// %Negated = Not %ALM
// branch-on-cond %Negated
//
static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
static VPSingleDefRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
Expand Down Expand Up @@ -2508,6 +2508,7 @@ static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) {
void VPlanTransforms::addActiveLaneMask(
VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
bool DataAndControlFlowWithoutRuntimeCheck) {

assert((!DataAndControlFlowWithoutRuntimeCheck ||
UseActiveLaneMaskForControlFlow) &&
"DataAndControlFlowWithoutRuntimeCheck implies "
Expand Down
6 changes: 3 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,10 @@ struct VPlanTransforms {
/// flat CFG into a hierarchical CFG.
LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan);

/// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
/// VPValue and connect the block to \p Plan, using the VPValue as branch
/// Connect \p CheckBlockVPBB to \p Plan, using the \p CondVPV as branch
/// condition.
static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock,
static void attachCheckBlock(VPlan &Plan, VPValue *CondVPV,
VPBasicBlock *CheckBlockVPBB,
bool AddBranchWeights);

/// Replaces the VPInstructions in \p Plan with corresponding
Expand Down
Loading
Loading