-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[LV][POC] Use umin to avoid second-to-last iteration problems with EVL #143434
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2166,9 +2166,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { | |
| VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); | ||
| VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); | ||
|
|
||
| // Create a scalar phi to track the previous EVL if fixed-order recurrence is | ||
| // contained. | ||
| VPInstruction *PrevEVL = nullptr; | ||
| VPValue *PrevEVL = nullptr; | ||
| bool ContainsFORs = | ||
| any_of(Header->phis(), IsaPred<VPFirstOrderRecurrencePHIRecipe>); | ||
| if (ContainsFORs) { | ||
|
|
@@ -2183,8 +2181,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { | |
| VFSize > 32 ? Instruction::Trunc : Instruction::ZExt, MaxEVL, | ||
| Type::getInt32Ty(Ctx), DebugLoc()); | ||
| } | ||
| Builder.setInsertPoint(Header, Header->getFirstNonPhi()); | ||
| PrevEVL = Builder.createScalarPhi({MaxEVL, &EVL}, DebugLoc(), "prev.evl"); | ||
| PrevEVL = MaxEVL; | ||
| } | ||
|
|
||
| for (VPUser *U : to_vector(Plan.getVF().users())) { | ||
|
|
@@ -2273,55 +2270,34 @@ bool VPlanTransforms::tryAddExplicitVectorLength( | |
| // The transform updates all users of inductions to work based on EVL, instead | ||
| // of the VF directly. At the moment, widened inductions cannot be updated, so | ||
| // bail out if the plan contains any. | ||
| bool ContainsWidenInductions = any_of( | ||
| Header->phis(), | ||
| IsaPred<VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>); | ||
| if (ContainsWidenInductions) | ||
| return false; | ||
|
|
||
| auto *CanonicalIVPHI = Plan.getCanonicalIV(); | ||
| VPValue *StartV = CanonicalIVPHI->getStartValue(); | ||
|
|
||
| // Create the ExplicitVectorLengthPhi recipe in the main loop. | ||
| auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc()); | ||
| EVLPhi->insertAfter(CanonicalIVPHI); | ||
| VPBuilder Builder(Header, Header->getFirstNonPhi()); | ||
| // Compute original TC - IV as the AVL (application vector length). | ||
| VPValue *AVL = Builder.createNaryOp( | ||
| Instruction::Sub, {Plan.getTripCount(), EVLPhi}, DebugLoc(), "avl"); | ||
| Instruction::Sub, {Plan.getTripCount(), CanonicalIVPHI}, DebugLoc(), "avl"); | ||
| if (MaxSafeElements) { | ||
| // Support for MaxSafeDist for correct loop emission. | ||
| VPValue *AVLSafe = Plan.getOrAddLiveIn( | ||
| ConstantInt::get(CanonicalIVPHI->getScalarType(), *MaxSafeElements)); | ||
| VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe); | ||
| AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc(), "safe_avl"); | ||
| } | ||
| auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL, | ||
| DebugLoc()); | ||
|
|
||
| auto *CanonicalIVIncrement = | ||
| cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue()); | ||
| Builder.setInsertPoint(CanonicalIVIncrement); | ||
| VPSingleDefRecipe *OpVPEVL = VPEVL; | ||
| if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits(); | ||
| IVSize != 32) { | ||
| OpVPEVL = Builder.createScalarCast( | ||
| IVSize < 32 ? Instruction::Trunc : Instruction::ZExt, OpVPEVL, | ||
| CanonicalIVPHI->getScalarType(), CanonicalIVIncrement->getDebugLoc()); | ||
| } | ||
| auto *NextEVLIV = Builder.createOverflowingOp( | ||
| Instruction::Add, {OpVPEVL, EVLPhi}, | ||
| {CanonicalIVIncrement->hasNoUnsignedWrap(), | ||
| CanonicalIVIncrement->hasNoSignedWrap()}, | ||
| CanonicalIVIncrement->getDebugLoc(), "index.evl.next"); | ||
| EVLPhi->addOperand(NextEVLIV); | ||
| // This is just a umin pattern | ||
| VPValue &VFxUF = Plan.getVFxUF(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIRC we currently restrict UF to 1 for EVL tail folding. Do you have any data if this works if we remove the restriction?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe so, though I'm honestly missing why the the current strategy can't do it too. You just need to compute a separate remaining iteration count (and thus EVL) for each unrolled iteration (since all but the first could be zero). I have not looked into how the code structure would look here. The key bit is that the VP intrinsics do claim to support the possibly zero EVL argument. We might have some bugs to flesh out there (possibly) since that codepath isn't being tested. |
||
| VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, &VFxUF); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why can't we use the umin intrinsic?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it's because VPBuilder currently doesn't provide a way to create min/max intrinsics, and VPInstruction doesn't have a recipe for min/max yet either. |
||
| auto *VPEVL = Builder.createSelect(Cmp, AVL, &VFxUF, DebugLoc()); | ||
|
|
||
| unsigned BitWidth = CanonicalIVPHI->getScalarType()->getScalarSizeInBits(); | ||
| LLVMContext &Ctx = CanonicalIVPHI->getScalarType()->getContext(); | ||
| VPEVL = Builder.createScalarCast( | ||
| BitWidth > 32 ? Instruction::Trunc : Instruction::ZExt, VPEVL, | ||
| Type::getInt32Ty(Ctx), DebugLoc()); | ||
|
|
||
| transformRecipestoEVLRecipes(Plan, *VPEVL); | ||
|
|
||
| // Replace all uses of VPCanonicalIVPHIRecipe by | ||
| // VPEVLBasedIVPHIRecipe except for the canonical IV increment. | ||
| CanonicalIVPHI->replaceAllUsesWith(EVLPhi); | ||
| CanonicalIVIncrement->setOperand(0, CanonicalIVPHI); | ||
| // TODO: support unroll factor > 1. | ||
| Plan.setUF(1); | ||
| return true; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The code in
if (ContainsFOR)andvp.splicecan be removed if VLOPT works well.