-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[LV] Mask off possibly aliasing vector lanes #100579
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/SamTebbs33/loop-dependence-costmodel
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -169,6 +169,7 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized"); | |
| STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); | ||
| STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); | ||
| STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized"); | ||
| STATISTIC(LoopsAliasMasked, "Number of loops predicated with an alias mask"); | ||
|
|
||
| static cl::opt<bool> EnableEpilogueVectorization( | ||
| "enable-epilogue-vectorization", cl::init(true), cl::Hidden, | ||
|
|
@@ -1333,6 +1334,12 @@ class LoopVectorizationCostModel { | |
| : ChosenTailFoldingStyle->second; | ||
| } | ||
|
|
||
| RTCheckStyle getRTCheckStyle(const TargetTransformInfo &TTI) const { | ||
| if (TTI.useSafeEltsMask()) | ||
| return RTCheckStyle::UseSafeEltsMask; | ||
| return RTCheckStyle::ScalarDifference; | ||
| } | ||
|
|
||
| /// Selects and saves TailFoldingStyle for 2 options - if IV update may | ||
| /// overflow or not. | ||
| /// \param IsScalableVF true if scalable vector factors enabled. | ||
|
|
@@ -8554,6 +8561,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( | |
| bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); | ||
| bool WithoutRuntimeCheck = | ||
| Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; | ||
|
|
||
| VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, | ||
| WithoutRuntimeCheck); | ||
| } | ||
|
|
@@ -8974,11 +8982,104 @@ void LoopVectorizationPlanner::attachRuntimeChecks( | |
| assert((!CM.OptForSize || | ||
| CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) && | ||
| "Cannot SCEV check stride or overflow when optimizing for size"); | ||
| VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock, | ||
| VPlanTransforms::attachCheckBlock(Plan, Plan.getOrAddLiveIn(SCEVCheckCond), | ||
| Plan.createVPIRBasicBlock(SCEVCheckBlock), | ||
| HasBranchWeights); | ||
| } | ||
| const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks(); | ||
| if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) { | ||
| VPValue *MemCheckCondVPV = Plan.getOrAddLiveIn(MemCheckCond); | ||
| VPBasicBlock *MemCheckBlockVP = Plan.createVPIRBasicBlock(MemCheckBlock); | ||
| std::optional<ArrayRef<PointerDiffInfo>> ChecksOpt = | ||
| CM.Legal->getRuntimePointerChecking()->getDiffChecks(); | ||
|
|
||
| // Create a mask enabling safe elements for each iteration. | ||
| if (CM.getRTCheckStyle(TTI) == RTCheckStyle::UseSafeEltsMask && | ||
| ChecksOpt.has_value() && ChecksOpt->size() > 0) { | ||
| ArrayRef<PointerDiffInfo> Checks = *ChecksOpt; | ||
| VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); | ||
| VPBasicBlock *LoopBody = LoopRegion->getEntryBasicBlock(); | ||
| VPBuilder Builder(MemCheckBlockVP); | ||
|
|
||
| /// Create a mask for each possibly-aliasing pointer pair, ANDing them if | ||
| /// there's more than one pair. | ||
| VPValue *AliasMask = nullptr; | ||
| for (PointerDiffInfo Check : Checks) { | ||
| VPValue *Sink = | ||
| vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SinkStart); | ||
| VPValue *Src = | ||
| vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SrcStart); | ||
|
|
||
| Type *PtrType = PointerType::getUnqual(Plan.getContext()); | ||
| Sink = Builder.createScalarCast(Instruction::CastOps::IntToPtr, Sink, | ||
| PtrType, DebugLoc()); | ||
| Src = Builder.createScalarCast(Instruction::CastOps::IntToPtr, Src, | ||
| PtrType, DebugLoc()); | ||
|
|
||
| SmallVector<VPValue *, 3> Ops{ | ||
| Src, Sink, | ||
| Plan.getConstantInt(IntegerType::getInt64Ty(Plan.getContext()), | ||
| Check.AccessSize)}; | ||
| VPWidenIntrinsicRecipe *M = new VPWidenIntrinsicRecipe( | ||
| Check.WriteAfterRead ? Intrinsic::loop_dependence_war_mask | ||
| : Intrinsic::loop_dependence_raw_mask, | ||
| Ops, IntegerType::getInt1Ty(Plan.getContext())); | ||
| MemCheckBlockVP->appendRecipe(M); | ||
| if (AliasMask) | ||
| AliasMask = Builder.createAnd(AliasMask, M); | ||
| else | ||
| AliasMask = M; | ||
| } | ||
| assert(AliasMask && "Expected an alias mask to have been created"); | ||
|
|
||
| // Replace uses of the loop body's active lane mask phi with an AND of the | ||
| // phi and the alias mask. | ||
| for (VPRecipeBase &R : *LoopBody) { | ||
| auto *MaskPhi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I believe the transform is currently incorrect. When there is no active lane mask, it would create an unpredicated vector loop that handles e.g. VF=16 lanes in a loop, even when the result of the alias.mask would say that only 3 lanes could be safely handled, for example. It would then increment the IV by 3 elements, but that doesn't mean only 3 lanes are handled each iteration. Without predication, it still handles 16 lanes each iteration. I think there are two options here:
I wouldn't mind taking approach 1 first, so that we can already use the whilerw instructions for the alias checks in the check block, rather than a bunch of scalar instructions, and then follow this up by option 2.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think we necessarily need an active-lane-mask, as long as either all recipes that need predication (memory ops, ops that are immediate UB on poison, reduction/recurrences) are already predicated (could be due to tail-folding without active-lane-mask) or we could convert them to predicated variants using the alias mask. Also, an active-lane-mask also does not necessarily mean all required recipes are predicated and use the active-lane-mask (e.g. a transform may convert a masked memory access to an unmasked one, if it is guaranteed dereferenceable for the whole loop). So it would probably be good to check if all required recipes are masked and make sure their masks include AliasMask after the transform. |
||
| if (!MaskPhi) | ||
| continue; | ||
| VPInstruction *And = new VPInstruction(Instruction::BinaryOps::And, | ||
| {MaskPhi, AliasMask}); | ||
| MaskPhi->replaceUsesWithIf(And, [And](VPUser &U, unsigned) { | ||
| auto *UR = dyn_cast<VPRecipeBase>(&U); | ||
| // If this is the first user, insert the AND. | ||
| if (UR && !And->getParent()) | ||
| And->insertBefore(UR); | ||
| bool Replace = UR != And; | ||
| return Replace; | ||
| }); | ||
| } | ||
|
|
||
| // An empty mask would cause an infinite loop since the induction variable | ||
| // is updated with the number of set elements in the mask. Make sure we | ||
| // don't execute the vector loop when the mask is empty. | ||
| VPInstruction *PopCount = | ||
| new VPInstruction(VPInstruction::PopCount, {AliasMask}); | ||
| PopCount->insertAfter(AliasMask->getDefiningRecipe()); | ||
| VPValue *Cmp = | ||
| Builder.createICmp(CmpInst::Predicate::ICMP_EQ, PopCount, | ||
| Plan.getOrAddLiveIn(ConstantInt::get( | ||
| IntegerType::get(Plan.getContext(), 64), 0))); | ||
| MemCheckCondVPV = Cmp; | ||
|
|
||
| // Update the IV by the number of active lanes in the mask. | ||
| auto *CanonicalIVPHI = LoopRegion->getCanonicalIV(); | ||
| auto *CanonicalIVIncrement = | ||
| cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue()); | ||
|
|
||
| // Increment phi by correct amount. | ||
| VPValue *IncrementBy = PopCount; | ||
| Type *IVType = CanonicalIVPHI->getScalarType(); | ||
|
|
||
| if (IVType->getScalarSizeInBits() < 64) { | ||
| Builder.setInsertPoint(CanonicalIVIncrement); | ||
| IncrementBy = | ||
| Builder.createScalarCast(Instruction::Trunc, IncrementBy, IVType, | ||
| CanonicalIVIncrement->getDebugLoc()); | ||
| } | ||
| CanonicalIVIncrement->setOperand(1, IncrementBy); | ||
| } | ||
|
|
||
| // VPlan-native path does not do any analysis for runtime checks | ||
| // currently. | ||
| assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) && | ||
|
|
@@ -8999,7 +9100,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks( | |
| "(e.g., adding 'restrict')."; | ||
| }); | ||
| } | ||
| VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock, | ||
| VPlanTransforms::attachCheckBlock(Plan, MemCheckCondVPV, MemCheckBlockVP, | ||
| HasBranchWeights); | ||
| } | ||
| } | ||
|
|
@@ -9966,6 +10067,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { | |
| // Optimistically generate runtime checks if they are needed. Drop them if | ||
| // they turn out to not be profitable. | ||
| if (VF.Width.isVector() || SelectedIC > 1) { | ||
| if (CM.getRTCheckStyle(*TTI) == RTCheckStyle::UseSafeEltsMask) | ||
| LoopsAliasMasked++; | ||
| Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); | ||
|
|
||
| // Bail out early if either the SCEV or memory runtime checks are known to | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -602,6 +602,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { | |
| case VPInstruction::ExplicitVectorLength: | ||
| case VPInstruction::AnyOf: | ||
| case VPInstruction::Not: | ||
| case VPInstruction::PopCount: | ||
| return true; | ||
| default: | ||
| return false; | ||
|
|
@@ -702,6 +703,29 @@ Value *VPInstruction::generate(VPTransformState &State) { | |
| {PredTy, ScalarTC->getType()}, | ||
| {VIVElem0, ScalarTC}, nullptr, Name); | ||
| } | ||
| // Count the number of bits set in each lane and reduce the result to a scalar | ||
| case VPInstruction::PopCount: { | ||
| Value *Op = State.get(getOperand(0)); | ||
| Type *VT = Op->getType(); | ||
| Value *Cnt = Op; | ||
|
|
||
| // i1 vectors can just use the add reduction. Bigger elements need a ctpop | ||
| // first. | ||
| if (VT->getScalarSizeInBits() > 1) | ||
| Cnt = Builder.CreateIntrinsic(Intrinsic::ctpop, {VT}, {Cnt}); | ||
|
|
||
| auto *VecVT = cast<VectorType>(VT); | ||
| // Extend to an i8 since i1 is too small to add with | ||
| if (VecVT->getElementType()->getScalarSizeInBits() < 8) { | ||
| Cnt = Builder.CreateCast( | ||
| Instruction::ZExt, Cnt, | ||
| VectorType::get(Builder.getInt8Ty(), VecVT->getElementCount())); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can't use a hard-coded i8, as this may not be sufficient to represent the number of bits from popcount. |
||
| } | ||
|
|
||
| Cnt = Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, Cnt); | ||
| Cnt = Builder.CreateCast(Instruction::ZExt, Cnt, Builder.getInt64Ty()); | ||
| return Cnt; | ||
| } | ||
| case VPInstruction::FirstOrderRecurrenceSplice: { | ||
| // Generate code to combine the previous and current values in vector v3. | ||
| // | ||
|
|
@@ -1214,7 +1238,7 @@ bool VPInstruction::isVectorToScalar() const { | |
| getOpcode() == VPInstruction::ComputeAnyOfResult || | ||
| getOpcode() == VPInstruction::ComputeFindIVResult || | ||
| getOpcode() == VPInstruction::ComputeReductionResult || | ||
| getOpcode() == VPInstruction::AnyOf; | ||
| getOpcode() == VPInstruction::AnyOf || getOpcode() == PopCount; | ||
| } | ||
|
|
||
| bool VPInstruction::isSingleScalar() const { | ||
|
|
@@ -1389,6 +1413,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, | |
| case VPInstruction::ActiveLaneMask: | ||
| O << "active lane mask"; | ||
| break; | ||
| case VPInstruction::PopCount: | ||
| O << "popcount"; | ||
| break; | ||
| case VPInstruction::ExplicitVectorLength: | ||
| O << "EXPLICIT-VECTOR-LENGTH"; | ||
| break; | ||
|
|
@@ -4316,7 +4343,9 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, | |
| getOperand(4)->printAsOperand(O, SlotTracker); | ||
| } | ||
| } | ||
| #endif | ||
|
|
||
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) | ||
| void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, | ||
| VPSlotTracker &SlotTracker) const { | ||
| O << Indent << "EMIT "; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be good to outline this to a separate function and document the transform.