@@ -3974,6 +3974,152 @@ void VPlanTransforms::hoistInvariantLoads(VPlan &Plan) {
39743974 }
39753975}
39763976
3977+ // Returns the intersection of metadata from a group of loads.
3978+ static VPIRMetadata getCommonLoadMetadata (ArrayRef<VPReplicateRecipe *> Loads) {
3979+ VPIRMetadata CommonMetadata = *Loads.front ();
3980+ for (VPReplicateRecipe *Load : drop_begin (Loads))
3981+ CommonMetadata.intersect (*Load);
3982+ return CommonMetadata;
3983+ }
3984+
3985+ // Check if a load can be hoisted by verifying it doesn't alias with any stores
3986+ // in blocks between FirstBB and LastBB using scoped noalias metadata.
3987+ static bool canHoistLoadWithNoAliasCheck (VPReplicateRecipe *Load,
3988+ VPBasicBlock *FirstBB,
3989+ VPBasicBlock *LastBB) {
3990+ // Get the load's memory location and check if it aliases with any stores
3991+ // using scoped noalias metadata.
3992+ auto LoadLoc = vputils::getMemoryLocation (*Load);
3993+ if (!LoadLoc || !LoadLoc->AATags .Scope )
3994+ return false ;
3995+
3996+ const AAMDNodes &LoadAA = LoadLoc->AATags ;
3997+ for (VPBlockBase *Block = FirstBB; Block;
3998+ Block = Block->getSingleSuccessor ()) {
3999+ // This function assumes a simple linear chain of blocks. If there are
4000+ // multiple successors, we would need more complex analysis.
4001+ assert (Block->getNumSuccessors () <= 1 &&
4002+ " Expected at most one successor in block chain" );
4003+ auto *VPBB = cast<VPBasicBlock>(Block);
4004+ for (VPRecipeBase &R : *VPBB) {
4005+ if (R.mayWriteToMemory ()) {
4006+ auto Loc = vputils::getMemoryLocation (R);
4007+ // Bail out if we can't get the location or if the scoped noalias
4008+ // metadata indicates potential aliasing.
4009+ if (!Loc || ScopedNoAliasAAResult::mayAliasInScopes (
4010+ LoadAA.Scope , Loc->AATags .NoAlias ))
4011+ return false ;
4012+ }
4013+ }
4014+
4015+ if (Block == LastBB)
4016+ break ;
4017+ }
4018+ return true ;
4019+ }
4020+
// Replace groups of predicated scalar loads from the same address with a
// single unpredicated load. Loads are grouped by address SCEV and scalar
// type; a group is only formed when at least one pair of masks in it is
// complementary (M1 == NOT(M2)), and hoisting additionally requires that
// scoped-noalias metadata shows no store between the first and last load of
// the group may alias the loaded address.
void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE,
                                           const Loop *L) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPTypeAnalysis TypeInfo(Plan);
  VPDominatorTree VPDT(Plan);

  // Group predicated loads by their address SCEV.
  MapVector<const SCEV *, SmallVector<VPReplicateRecipe *>> LoadsByAddress;
  for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
    auto *VPBB = cast<VPBasicBlock>(Block);
    for (VPRecipeBase &R : *VPBB) {
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (!RepR || RepR->getOpcode() != Instruction::Load ||
          !RepR->isPredicated())
        continue;

      VPValue *Addr = RepR->getOperand(0);
      const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, SE, L);
      // Skip loads whose address SCEV cannot be computed; they cannot be
      // grouped reliably.
      if (!isa<SCEVCouldNotCompute>(AddrSCEV))
        LoadsByAddress[AddrSCEV].push_back(RepR);
    }
  }

  // For each address, collect loads with complementary masks, sort by
  // dominance, and use the earliest load.
  for (auto &[Addr, Loads] : LoadsByAddress) {
    if (Loads.size() < 2)
      continue;

    // Collect groups of loads with complementary masks. Iteration is by
    // reference so entries of Loads can be nulled out once assigned to a
    // group, ensuring each load is considered exactly once.
    SmallVector<SmallVector<VPReplicateRecipe *, 4>> LoadGroups;
    for (VPReplicateRecipe *&LoadI : Loads) {
      if (!LoadI)
        continue;

      VPValue *MaskI = LoadI->getMask();
      Type *TypeI = TypeInfo.inferScalarType(LoadI);
      SmallVector<VPReplicateRecipe *, 4> Group;
      Group.push_back(LoadI);
      LoadI = nullptr;

      // Find all loads with the same type.
      for (VPReplicateRecipe *&LoadJ : Loads) {
        if (!LoadJ)
          continue;

        Type *TypeJ = TypeInfo.inferScalarType(LoadJ);
        if (TypeI == TypeJ) {
          Group.push_back(LoadJ);
          LoadJ = nullptr;
        }
      }

      // Check if any load in the group has a complementary mask with another,
      // that is M1 == NOT(M2) or M2 == NOT(M1).
      bool HasComplementaryMask =
          any_of(drop_begin(Group), [MaskI](VPReplicateRecipe *Load) {
            VPValue *MaskJ = Load->getMask();
            return match(MaskI, m_Not(m_Specific(MaskJ))) ||
                   match(MaskJ, m_Not(m_Specific(MaskI)));
          });

      if (HasComplementaryMask)
        LoadGroups.push_back(std::move(Group));
    }

    // For each group, check memory dependencies and hoist the earliest load.
    for (auto &Group : LoadGroups) {
      // Sort loads by dominance order, with earliest (most dominating) first.
      sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
        return VPDT.properlyDominates(A, B);
      });

      VPReplicateRecipe *EarliestLoad = Group.front();
      VPBasicBlock *FirstBB = EarliestLoad->getParent();
      VPBasicBlock *LastBB = Group.back()->getParent();

      // Check that the load doesn't alias with stores between first and last.
      if (!canHoistLoadWithNoAliasCheck(EarliestLoad, FirstBB, LastBB))
        continue;

      // Collect common metadata from all loads in the group, so the hoisted
      // load only carries metadata valid for every replaced load.
      VPIRMetadata CommonMetadata = getCommonLoadMetadata(Group);

      // Create an unpredicated version of the earliest load with common
      // metadata.
      auto *UnpredicatedLoad = new VPReplicateRecipe(
          EarliestLoad->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
          /*IsSingleScalar=*/false, /*Mask=*/nullptr, /*Flags=*/{},
          CommonMetadata);

      UnpredicatedLoad->insertBefore(EarliestLoad);

      // Replace all loads in the group with the unpredicated load.
      for (VPReplicateRecipe *Load : Group) {
        Load->replaceAllUsesWith(UnpredicatedLoad);
        Load->eraseFromParent();
      }
    }
  }
}
4122+
39774123void VPlanTransforms::materializeConstantVectorTripCount (
39784124 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
39794125 PredicatedScalarEvolution &PSE) {
0 commit comments