Skip to content

Commit e6af9b4

Browse files
committed
[VPlan] Explicitly unroll replicate-regions without live-outs by VF.
This patch adds a new unrollReplicateRegions transform to unroll replicate-regions by VF, dissolving them. The transform creates VF copies of the replicate-region's content, connects them and converts recipes to single-scalar variants for the corresponding lanes. The initial version skips regions with live-outs (VPPredInstPHIRecipe); support for those will be added in follow-up patches. Depends on llvm#170053
1 parent 69a200e commit e6af9b4

File tree

59 files changed

+688
-764
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+688
-764
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7306,6 +7306,10 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
73067306
BestVPlan);
73077307
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
73087308
VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
7309+
VPlanTransforms::runPass(VPlanTransforms::unrollReplicateRegions, BestVPlan,
7310+
BestVF);
7311+
VPlanTransforms::runPass(VPlanTransforms::mergeBlocksIntoPredecessors,
7312+
BestVPlan);
73097313
bool HasBranchWeights =
73107314
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
73117315
if (HasBranchWeights) {

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3816,8 +3816,9 @@ class LLVM_ABI_FOR_TEST VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
38163816
getOperand(0), getOperand(1), getOperand(2), InductionOpcode,
38173817
hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(),
38183818
getDebugLoc());
3819-
if (getNumOperands() == 4)
3820-
NewR->addOperand(getOperand(3));
3819+
// Add lane/unroll-part operands, if present.
3820+
for (VPValue *Op : drop_begin(operands(), 3))
3821+
NewR->addOperand(Op);
38213822
return NewR;
38223823
}
38233824

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -481,7 +481,7 @@ static void addReplicateRegions(VPlan &Plan) {
481481

482482
/// Remove redundant VPBasicBlocks by merging them into their predecessor if
483483
/// the predecessor has a single successor.
484-
static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
484+
bool VPlanTransforms::mergeBlocksIntoPredecessors(VPlan &Plan) {
485485
SmallVector<VPBasicBlock *> WorkList;
486486
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
487487
vp_depth_first_deep(Plan.getEntry()))) {

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,10 @@ struct VPlanTransforms {
178178
/// replicate regions, thereby dissolving the latter.
179179
static void replicateByVF(VPlan &Plan, ElementCount VF);
180180

181+
/// Replace replicate regions by explicitly replicating the regions' contents
182+
/// \p VF times, each copy processing a single lane.
183+
static void unrollReplicateRegions(VPlan &Plan, ElementCount VF);
184+
181185
/// Optimize \p Plan based on \p BestVF and \p BestUF. This may restrict the
182186
/// resulting plan to \p BestVF and \p BestUF.
183187
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
@@ -189,6 +193,8 @@ struct VPlanTransforms {
189193
/// block merging.
190194
LLVM_ABI_FOR_TEST static void optimize(VPlan &Plan);
191195

196+
static bool mergeBlocksIntoPredecessors(VPlan &Plan);
197+
192198
/// Wrap predicated VPReplicateRecipes with a mask operand in an if-then
193199
/// region block and remove the mask operand. Optimize the created regions by
194200
/// iteratively sinking scalar operands into the region, followed by merging

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@
2424
#include "llvm/ADT/ScopeExit.h"
2525
#include "llvm/Analysis/IVDescriptors.h"
2626
#include "llvm/IR/Intrinsics.h"
27+
#include "llvm/Support/Debug.h"
28+
29+
#define DEBUG_TYPE "vplan"
2730

2831
using namespace llvm;
2932
using namespace llvm::VPlanPatternMatch;
@@ -121,6 +124,7 @@ class UnrollState {
121124
R->setOperand(OpIdx, getValueForPart(Op, Part));
122125
}
123126
};
127+
124128
} // namespace
125129

126130
void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
@@ -634,3 +638,177 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
634638
for (auto *R : reverse(ToRemove))
635639
R->eraseFromParent();
636640
}
641+
642+
/// Process recipes in a single lane's blocks, updating them for lane-specific
643+
/// operations.
644+
static void processLane(VPlan &Plan, Type *IdxTy, unsigned Lane,
645+
ElementCount VF, ArrayRef<VPBlockBase *> RegionBlocks,
646+
DenseMap<VPBlockBase *, VPBlockBase *> &Old2NewBlocks) {
647+
DenseMap<VPValue *, VPValue *> Old2NewVPValues;
648+
for (VPBlockBase *OldVPB : RegionBlocks) {
649+
auto *OldBB = cast<VPBasicBlock>(OldVPB);
650+
auto *NewBB = cast<VPBasicBlock>(Old2NewBlocks.lookup(OldVPB));
651+
for (const auto &[OldR, NewR] : zip(*OldBB, *NewBB)) {
652+
for (const auto &[OldV, NewV] :
653+
zip(OldR.definedValues(), NewR.definedValues()))
654+
Old2NewVPValues[OldV] = NewV;
655+
}
656+
657+
// Update lane operands and remap operands to use copies for current lane.
658+
for (VPRecipeBase &NewR : make_early_inc_range(*NewBB)) {
659+
if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&NewR))
660+
Steps->setOperand(3, Plan.getConstantInt(IdxTy, Lane));
661+
else if (match(&NewR, m_ExtractElement(m_VPValue(), m_ZeroInt())))
662+
NewR.setOperand(1, Plan.getConstantInt(IdxTy, Lane));
663+
664+
// Remap operands to use lane-specific values.
665+
for (const auto &[I, Op] : enumerate(NewR.operands())) {
666+
// Use cloned value if operand was defined in the region.
667+
if (auto *New = Old2NewVPValues.lookup(Op))
668+
NewR.setOperand(I, New);
669+
}
670+
}
671+
}
672+
}
673+
674+
/// Process a single lane: clone blocks (or reuse original for lane 0), collect
675+
/// value mappings, and process recipes for lane-specific operations.
676+
static void processSingleLane(
677+
VPlan &Plan, Type *IdxTy, unsigned Lane, ElementCount VF,
678+
ArrayRef<VPBlockBase *> RegionBlocks, VPBlockBase *Entry,
679+
VPBlockBase *Exiting,
680+
SmallVectorImpl<std::pair<VPBlockBase *, VPBlockBase *>> &LaneClones) {
681+
DenseMap<VPBlockBase *, VPBlockBase *> Old2NewBlocks;
682+
if (Lane == 0) {
683+
// Lane 0 uses the original blocks, and the recipes are adjusted:
684+
// VPReplicateRecipes are converted to single-scalar ones, branch-on-mask is
685+
// converted into BranchOnCond and extracts are created as needed.
686+
for (VPBlockBase *VPB : RegionBlocks) {
687+
Old2NewBlocks[VPB] = VPB;
688+
689+
for (VPRecipeBase &NewR :
690+
make_early_inc_range(*cast<VPBasicBlock>(VPB))) {
691+
VPBuilder Builder(&NewR);
692+
for (const auto &[I, Op] : enumerate(NewR.operands())) {
693+
// Skip operands that don't need extraction: scalar VF (no vectors),
694+
// values defined in the same block (already scalar), or values that
695+
// are already single scalars.
696+
if (VF.isScalar() ||
697+
(Op->getDefiningRecipe() &&
698+
Op->getDefiningRecipe()->getParent() == VPB) ||
699+
vputils::isSingleScalar(Op))
700+
continue;
701+
702+
// Extract the lane from values defined outside the region.
703+
VPValue *Idx = Plan.getConstantInt(IdxTy, Lane);
704+
VPValue *Extract = Builder.createNaryOp(
705+
Instruction::ExtractElement, {Op, Idx}, NewR.getDebugLoc());
706+
NewR.setOperand(I, Extract);
707+
}
708+
709+
if (auto *RepR = dyn_cast<VPReplicateRecipe>(&NewR)) {
710+
auto *New = new VPReplicateRecipe(
711+
RepR->getUnderlyingInstr(), RepR->operands(),
712+
/* IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR, *RepR,
713+
RepR->getDebugLoc());
714+
New->insertBefore(RepR);
715+
RepR->replaceAllUsesWith(New);
716+
RepR->eraseFromParent();
717+
} else if (auto *BranchOnMask = dyn_cast<VPBranchOnMaskRecipe>(&NewR)) {
718+
Builder.createNaryOp(VPInstruction::BranchOnCond,
719+
{BranchOnMask->getOperand(0)},
720+
BranchOnMask->getDebugLoc());
721+
BranchOnMask->eraseFromParent();
722+
} else if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&NewR)) {
723+
// Add lane operand (4th operand) for VPScalarIVStepsRecipe if not
724+
// already present.
725+
unsigned NumOps = Steps->getNumOperands();
726+
if (NumOps == 4) {
727+
// Has UnrollPart at position 3, need to insert Lane before it.
728+
VPValue *UnrollPart = Steps->getOperand(3);
729+
Steps->setOperand(3, Plan.getConstantInt(IdxTy, Lane));
730+
Steps->addOperand(UnrollPart);
731+
} else if (NumOps == 3) {
732+
// Just BaseIV, Step, VF - add Lane.
733+
Steps->addOperand(Plan.getConstantInt(IdxTy, Lane));
734+
Steps->addOperand(Plan.getConstantInt(IdxTy, 0));
735+
}
736+
}
737+
}
738+
}
739+
} else {
740+
// Clone blocks and connect them according to original structure.
741+
for (VPBlockBase *OrigBlock : RegionBlocks) {
742+
VPBlockBase *ClonedBlock = OrigBlock->clone();
743+
Old2NewBlocks[OrigBlock] = ClonedBlock;
744+
ClonedBlock->setParent(Entry->getParent());
745+
}
746+
for (VPBlockBase *OrigBlock : RegionBlocks) {
747+
if (OrigBlock == Exiting)
748+
continue;
749+
for (VPBlockBase *OrigSucc : OrigBlock->successors())
750+
VPBlockUtils::connectBlocks(Old2NewBlocks[OrigBlock],
751+
Old2NewBlocks[OrigSucc]);
752+
}
753+
}
754+
755+
processLane(Plan, IdxTy, Lane, VF, RegionBlocks, Old2NewBlocks);
756+
LaneClones.push_back({Old2NewBlocks[Entry], Old2NewBlocks[Exiting]});
757+
}
758+
759+
/// Dissolve the replicate regions found by a shallow walk from the entry of
/// the vector loop region. For each such region, one copy of its blocks is laid
/// out per lane of \p VF and the copies are chained one after another between
/// the region's former predecessor and successors. Regions whose exiting block
/// contains a VPPredInstPHIRecipe are left untouched.
void VPlanTransforms::unrollReplicateRegions(VPlan &Plan, ElementCount VF) {
  // Gather the candidates up front, since the CFG is rewritten further down.
  SmallVector<VPRegionBlock *> Worklist;
  for (VPBlockBase *VPB :
       vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry())) {
    auto *Candidate = dyn_cast<VPRegionBlock>(VPB);
    if (Candidate && Candidate->isReplicator())
      Worklist.push_back(Candidate);
  }

  Type *IdxTy = IntegerType::get(Plan.getContext(), 32);

  for (VPRegionBlock *RepRegion : Worklist) {
    assert(!VF.isScalable() && "cannot replicate across scalable VFs");
    const unsigned NumLanes = VF.getKnownMinValue();

    VPBlockBase *Entry = RepRegion->getEntry();
    VPBlockBase *Exiting = RepRegion->getExiting();

    // Packing scalar results back into vectors is not implemented yet, so
    // regions with live-outs are skipped.
    auto *ExitingBB = cast<VPBasicBlock>(Exiting);
    bool HasLiveOuts = any_of(*ExitingBB, [](VPRecipeBase &R) {
      return isa<VPPredInstPHIRecipe>(&R);
    });
    if (HasLiveOuts)
      continue;

    // Remember the region's neighbours before it is taken out of the CFG.
    VPBlockBase *Pred = RepRegion->getSinglePredecessor();
    assert(Pred && "Replicate region must have a single predecessor");
    SmallVector<VPBlockBase *> Successors(RepRegion->successors());

    // Detach the region from its predecessor and from all of its successors.
    VPBlockUtils::disconnectBlocks(Pred, RepRegion);
    for (VPBlockBase *Succ : Successors)
      VPBlockUtils::disconnectBlocks(RepRegion, Succ);

    // Re-parent the region's blocks to the enclosing region and hook the
    // region's entry block directly to the predecessor.
    SmallVector<VPBlockBase *> RegionBlocks(vp_depth_first_shallow(Entry));
    VPRegionBlock *Enclosing = RepRegion->getParent();
    for (VPBlockBase *VPB : RegionBlocks)
      VPB->setParent(Enclosing);
    VPBlockUtils::connectBlocks(Pred, Entry);

    // Build the per-lane copies; each call appends the lane's
    // (entry, exiting) pair to LaneClones.
    SmallVector<std::pair<VPBlockBase *, VPBlockBase *>> LaneClones;
    for (unsigned Lane = 0; Lane < NumLanes; ++Lane)
      processSingleLane(Plan, IdxTy, Lane, VF, RegionBlocks, Entry, Exiting,
                        LaneClones);

    // Chain lane N-1's exiting block to lane N's entry block.
    for (unsigned Next = 1; Next < NumLanes; ++Next) {
      VPBlockBase *PrevExiting = LaneClones[Next - 1].second;
      VPBlockBase *NextEntry = LaneClones[Next].first;
      VPBlockUtils::connectBlocks(PrevExiting, NextEntry);
    }

    // The last lane takes over the region's former successors.
    for (VPBlockBase *Succ : Successors)
      VPBlockUtils::connectBlocks(LaneClones.back().second, Succ);
  }
}

llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,8 +199,7 @@ define void @test_blend_feeding_replicated_store_2(ptr noalias %src, ptr %dst, i
199199
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP7]], i32 0
200200
; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
201201
; CHECK: [[PRED_STORE_IF]]:
202-
; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[IV]], 0
203-
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP72]]
202+
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[IV]]
204203
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 0
205204
; CHECK-NEXT: store i8 [[TMP10]], ptr [[TMP9]], align 1
206205
; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]

0 commit comments

Comments
 (0)