24 | 24 | #include "llvm/ADT/ScopeExit.h" |
25 | 25 | #include "llvm/Analysis/IVDescriptors.h" |
26 | 26 | #include "llvm/IR/Intrinsics.h" |
| 27 | +#include "llvm/Support/Debug.h" |
| 28 | + |
| 29 | +#define DEBUG_TYPE "vplan" |
27 | 30 |
28 | 31 | using namespace llvm; |
29 | 32 | using namespace llvm::VPlanPatternMatch; |
@@ -121,6 +124,7 @@ class UnrollState { |
121 | 124 | R->setOperand(OpIdx, getValueForPart(Op, Part)); |
122 | 125 | } |
123 | 126 | }; |
| 127 | + |
124 | 128 | } // namespace |
125 | 129 |
126 | 130 | void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) { |
@@ -634,3 +638,177 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { |
634 | 638 | for (auto *R : reverse(ToRemove)) |
635 | 639 | R->eraseFromParent(); |
636 | 640 | } |
| 641 | + |
| 642 | +/// Process recipes in a single lane's blocks, updating them for lane-specific |
| 643 | +/// operations. |
| 644 | +static void processLane(VPlan &Plan, Type *IdxTy, unsigned Lane, |
| 645 | + ElementCount VF, ArrayRef<VPBlockBase *> RegionBlocks, |
| 646 | + DenseMap<VPBlockBase *, VPBlockBase *> &Old2NewBlocks) { |
| 647 | + DenseMap<VPValue *, VPValue *> Old2NewVPValues; |
| 648 | + for (VPBlockBase *OldVPB : RegionBlocks) { |
| 649 | + auto *OldBB = cast<VPBasicBlock>(OldVPB); |
| 650 | + auto *NewBB = cast<VPBasicBlock>(Old2NewBlocks.lookup(OldVPB)); |
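| | + // Map each value defined in the original block to the value defined by the |
| | + // corresponding recipe in the clone (an identity mapping for lane 0). |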
| 651 | + for (const auto &[OldR, NewR] : zip(*OldBB, *NewBB)) { |
| 652 | + for (const auto &[OldV, NewV] : |
| 653 | + zip(OldR.definedValues(), NewR.definedValues())) |
| 654 | + Old2NewVPValues[OldV] = NewV; |
| 655 | + } |
| 656 | + |
| 657 | + // Update lane-index operands and remap operands to this lane's copies. |
| 658 | + for (VPRecipeBase &NewR : *NewBB) { |
| 659 | + if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&NewR)) |
| 660 | + Steps->setOperand(3, Plan.getConstantInt(IdxTy, Lane)); |
| 661 | + else if (match(&NewR, m_ExtractElement(m_VPValue(), m_ZeroInt()))) |
| 662 | + NewR.setOperand(1, Plan.getConstantInt(IdxTy, Lane)); |
| 663 | + |
| 664 | + // Remap operands to use lane-specific values. |
| 665 | + for (const auto &[I, Op] : enumerate(NewR.operands())) { |
| 666 | + // Use cloned value if operand was defined in the region. |
| 667 | + if (auto *New = Old2NewVPValues.lookup(Op)) |
| 668 | + NewR.setOperand(I, New); |
| 669 | + } |
| 670 | + } |
| 671 | + } |
| 672 | +} |
| 673 | + |
| 674 | +/// Process a single lane: clone blocks (or reuse original for lane 0), collect |
| 675 | +/// value mappings, and process recipes for lane-specific operations. |
| 676 | +static void processSingleLane( |
| 677 | + VPlan &Plan, Type *IdxTy, unsigned Lane, ElementCount VF, |
| 678 | + ArrayRef<VPBlockBase *> RegionBlocks, VPBlockBase *Entry, |
| 679 | + VPBlockBase *Exiting, |
| 680 | + SmallVectorImpl<std::pair<VPBlockBase *, VPBlockBase *>> &LaneClones) { |
| 681 | + DenseMap<VPBlockBase *, VPBlockBase *> Old2NewBlocks; |
| 682 | + if (Lane == 0) { |
| 683 | + // Lane 0 uses the original blocks, and the recipes are adjusted: |
| 684 | + // VPReplicateRecipes are converted to single-scalar ones, branch-on-mask is |
| 685 | + // converted into BranchOnCond, and extracts are created as needed. |
| 686 | + for (VPBlockBase *VPB : RegionBlocks) { |
| 687 | + Old2NewBlocks[VPB] = VPB; |
| 688 | + |
| 689 | + for (VPRecipeBase &NewR : |
| 690 | + make_early_inc_range(*cast<VPBasicBlock>(VPB))) { |
| 691 | + VPBuilder Builder(&NewR); |
| 692 | + for (const auto &[I, Op] : enumerate(NewR.operands())) { |
| 693 | + // Skip operands that don't need extraction: scalar VF (no vectors), |
| 694 | + // values defined in the same block (already scalar), or values that |
| 695 | + // are already single scalars. |
| 696 | + if (VF.isScalar() || |
| 697 | + (Op->getDefiningRecipe() && |
| 698 | + Op->getDefiningRecipe()->getParent() == VPB) || |
| 699 | + vputils::isSingleScalar(Op)) |
| 700 | + continue; |
| 701 | + |
| 702 | + // Extract the lane from values defined outside the region. |
| 703 | + VPValue *Idx = Plan.getConstantInt(IdxTy, Lane); |
| 704 | + VPValue *Extract = Builder.createNaryOp( |
| 705 | + Instruction::ExtractElement, {Op, Idx}, NewR.getDebugLoc()); |
| 706 | + NewR.setOperand(I, Extract); |
| 707 | + } |
| 708 | + |
| 709 | + if (auto *RepR = dyn_cast<VPReplicateRecipe>(&NewR)) { |
| 710 | + auto *New = new VPReplicateRecipe( |
| 711 | + RepR->getUnderlyingInstr(), RepR->operands(), |
| 712 | + /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR, *RepR, |
| 713 | + RepR->getDebugLoc()); |
| 714 | + New->insertBefore(RepR); |
| 715 | + RepR->replaceAllUsesWith(New); |
| 716 | + RepR->eraseFromParent(); |
| 717 | + } else if (auto *BranchOnMask = dyn_cast<VPBranchOnMaskRecipe>(&NewR)) { |
| 718 | + Builder.createNaryOp(VPInstruction::BranchOnCond, |
| 719 | + {BranchOnMask->getOperand(0)}, |
| 720 | + BranchOnMask->getDebugLoc()); |
| 721 | + BranchOnMask->eraseFromParent(); |
| 722 | + } else if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&NewR)) { |
| 723 | + // Insert the lane operand at index 3 of the VPScalarIVStepsRecipe, keeping |
| 724 | + // the unroll-part operand (if any) as the last operand. |
| 725 | + unsigned NumOps = Steps->getNumOperands(); |
| 726 | + if (NumOps == 4) { |
| 727 | + // The UnrollPart operand is at index 3; insert the Lane operand before it. |
| 728 | + VPValue *UnrollPart = Steps->getOperand(3); |
| 729 | + Steps->setOperand(3, Plan.getConstantInt(IdxTy, Lane)); |
| 730 | + Steps->addOperand(UnrollPart); |
| 731 | + } else if (NumOps == 3) { |
| 732 | + // Only BaseIV, Step, and VF; append the Lane operand and an unroll part of 0. |
| 733 | + Steps->addOperand(Plan.getConstantInt(IdxTy, Lane)); |
| 734 | + Steps->addOperand(Plan.getConstantInt(IdxTy, 0)); |
| 735 | + } |
| 736 | + } |
| 737 | + } |
| 738 | + } |
| 739 | + } else { |
| 740 | + // Clone blocks and connect them according to original structure. |
| 741 | + for (VPBlockBase *OrigBlock : RegionBlocks) { |
| 742 | + VPBlockBase *ClonedBlock = OrigBlock->clone(); |
| 743 | + Old2NewBlocks[OrigBlock] = ClonedBlock; |
| 744 | + ClonedBlock->setParent(Entry->getParent()); |
| 745 | + } |
| 746 | + for (VPBlockBase *OrigBlock : RegionBlocks) { |
| 747 | + if (OrigBlock == Exiting) |
| 748 | + continue; |
| 749 | + for (VPBlockBase *OrigSucc : OrigBlock->successors()) |
| 750 | + VPBlockUtils::connectBlocks(Old2NewBlocks[OrigBlock], |
| 751 | + Old2NewBlocks[OrigSucc]); |
| 752 | + } |
| 753 | + } |
| 754 | + |
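| | + // Point operands at this lane's values and set the lane-index constants; for |
| | + // lane 0 this is effectively a no-op, as the blocks map to themselves. |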
| 755 | + processLane(Plan, IdxTy, Lane, VF, RegionBlocks, Old2NewBlocks); |
| 756 | + LaneClones.push_back({Old2NewBlocks[Entry], Old2NewBlocks[Exiting]}); |
| 757 | +} |
| 758 | + |
| 759 | +void VPlanTransforms::unrollReplicateRegions(VPlan &Plan, ElementCount VF) { |
| 760 | + // Collect all replicate regions in the plan before modifying the CFG. |
| 761 | + SmallVector<VPRegionBlock *> ReplicateRegions; |
| 762 | + for (VPBlockBase *Block : |
| 763 | + vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry())) { |
| 764 | + if (auto *Region = dyn_cast<VPRegionBlock>(Block)) { |
| 765 | + if (Region->isReplicator()) |
| 766 | + ReplicateRegions.push_back(Region); |
| 767 | + } |
| 768 | + } |
| 769 | + |
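| | + // Lane indices are materialized as i32 constants. |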
| 770 | + Type *IdxTy = IntegerType::get(Plan.getContext(), 32); |
| 771 | + |
| 772 | + for (VPRegionBlock *Region : ReplicateRegions) { |
| 773 | + assert(!VF.isScalable() && "cannot replicate across scalable VFs"); |
| 774 | + |
| 775 | + VPBlockBase *Entry = Region->getEntry(); |
| 776 | + VPBlockBase *Exiting = Region->getExiting(); |
| 777 | + |
| 778 | + // Skip regions with live-outs as packing scalar results back into vectors |
| 779 | + // is not yet implemented. |
| 780 | + if (any_of(*cast<VPBasicBlock>(Exiting), IsaPred<VPPredInstPHIRecipe>)) |
| 781 | + continue; |
| 782 | + |
| 783 | + // Get region context before dissolving. |
| 784 | + VPBlockBase *Pred = Region->getSinglePredecessor(); |
| 785 | + assert(Pred && "Replicate region must have a single predecessor"); |
| 786 | + SmallVector<VPBlockBase *> Successors(Region->successors()); |
| 787 | + |
| 788 | + // Disconnect and dissolve the region. |
| 789 | + VPBlockUtils::disconnectBlocks(Pred, Region); |
| 790 | + for (VPBlockBase *Succ : Successors) |
| 791 | + VPBlockUtils::disconnectBlocks(Region, Succ); |
| 792 | + |
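| | + // Hoist the region's blocks into the parent region and reconnect the |
| | + // predecessor directly to the former region entry. |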
| 793 | + SmallVector<VPBlockBase *> RegionBlocks(vp_depth_first_shallow(Entry)); |
| 794 | + VPRegionBlock *ParentRegion = Region->getParent(); |
| 795 | + for (VPBlockBase *Block : RegionBlocks) |
| 796 | + Block->setParent(ParentRegion); |
| 797 | + VPBlockUtils::connectBlocks(Pred, Entry); |
| 798 | + |
| 799 | + // Process each lane: clone blocks, collect value mappings, and process |
| 800 | + // recipes for lane-specific operations. |
| 801 | + SmallVector<std::pair<VPBlockBase *, VPBlockBase *>> LaneClones; |
| 802 | + for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) { |
| 803 | + processSingleLane(Plan, IdxTy, Lane, VF, RegionBlocks, Entry, Exiting, |
| 804 | + LaneClones); |
| 805 | + } |
| 806 | + |
| 807 | + // Connect lanes sequentially and connect last lane to successors. |
| 808 | + for (unsigned Lane = 1; Lane < VF.getKnownMinValue(); ++Lane) |
| 809 | + VPBlockUtils::connectBlocks(LaneClones[Lane - 1].second, |
| 810 | + LaneClones[Lane].first); |
| 811 | + for (VPBlockBase *Succ : Successors) |
| 812 | + VPBlockUtils::connectBlocks(LaneClones.back().second, Succ); |
| 813 | + } |
| 814 | +} |