@@ -3875,6 +3875,16 @@ template <uint64_t TargetAlign> constexpr bool matchAlignment(uint64_t Value) {
   return isAligned(Align(TargetAlign), Value);
 }
 
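+// If PtrReg is defined by a G_PTR_ADD, decompose the address into its base
+// pointer and constant byte offset (e.g. %p = G_PTR_ADD %base, 16 yields
+// (%base, 16)); otherwise treat PtrReg itself as the base with offset 0.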
+static std::optional<std::pair<Register, int64_t>>
+getPtrAndConstantOffsetFromReg(Register PtrReg, MachineRegisterInfo &MRI) {
+
+  const MachineInstr *DefPtrReg = MRI.getVRegDef(PtrReg);
+  if (DefPtrReg->getOpcode() == TargetOpcode::G_PTR_ADD)
+    return getPtrAndConstantOffset(DefPtrReg, 1, MRI);
+
+  return std::make_pair(PtrReg, 0);
+}
+
 static bool isBasePointerWordAligned(Register BasePtr,
                                      MachineRegisterInfo &MRI) {
 
@@ -3948,17 +3958,12 @@ bool llvm::matchPeelMemset(MachineInstr &MI, MachineRegisterInfo &MRI,
     return false;
   const uint64_t Initializer = CstInit->Value.getZExtValue();
 
-  unsigned Offset = 0;
-  const MachineInstr *DefPtrReg = MRI.getVRegDef(PtrReg);
-  if (DefPtrReg->getOpcode() == TargetOpcode::G_PTR_ADD) {
-    const auto RegAndOffset = getPtrAndConstantOffset(DefPtrReg, 1, MRI);
+  const auto RegAndOffset = getPtrAndConstantOffsetFromReg(PtrReg, MRI);
+  if (!RegAndOffset)
+    return false;
 
-    if (!RegAndOffset)
-      return false;
-
-    Offset = RegAndOffset->second;
-    PtrReg = RegAndOffset->first;
-  }
+  const int64_t Offset = RegAndOffset->second;
+  PtrReg = RegAndOffset->first;
 
   // Next step is to prove that the base pointer is word-aligned.
   // Since we cannot assume it, we search for aligned uses of the base pointer.
@@ -4029,3 +4034,168 @@ bool llvm::matchPeelMemset(MachineInstr &MI, MachineRegisterInfo &MRI,
 
   return true;
 }
+
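+// Convenience wrapper: decomposes a store's address into its base pointer
+// and constant byte offset.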
+static std::optional<std::pair<Register, int64_t>>
+getPtrAndConstantOffsetFromStore(const GStore *StMI, MachineRegisterInfo &MRI) {
+  return getPtrAndConstantOffsetFromReg(StMI->getPointerReg(), MRI);
+}
+
+// To make a store dead, we convert it into a dead G_ADD and let DCE do the
+// removal.
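+// Schematically (illustrative MIR):
+//   G_STORE %val(s32), %ptr(p0) :: (store (s32))
+// becomes
+//   %dead:_(s32) = G_ADD %val, %val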
+static void makeStoreDead(GStore *StMI, const TargetInstrInfo &TII,
+                          MachineRegisterInfo &MRI) {
+  MachineFunction &MF = *StMI->getMF();
+  const Register DataReg = StMI->getValueReg();
+  StMI->dropMemRefs(MF);
+  for (int I = StMI->getNumOperands() - 1; I >= 0; I--)
+    StMI->removeOperand(I);
+  StMI->setDesc(TII.get(TargetOpcode::G_ADD));
+  StMI->addOperand(
+      MachineOperand::CreateReg(MRI.cloneVirtualRegister(DataReg), true));
+  StMI->addOperand(MachineOperand::CreateReg(DataReg, false));
+  StMI->addOperand(MachineOperand::CreateReg(DataReg, false));
+}
+
+// This combiner tries to pack sequential zero stores into memsets.
+// The goal is to reach an optimal number of stores, provided it is
+// used in tandem with the memset expansion combiner.
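+// For example (schematic MIR, assuming %p is 16-byte aligned):
+//   G_STORE 0:s32, %p
+//   G_STORE 0:s32, %p + 4
+//   G_STORE 0:s32, %p + 8
+//   G_STORE 0:s32, %p + 12
+// becomes a single G_MEMSET of 16 zero bytes at %p.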
+bool llvm::matchSequentialStores(GStore &StMI, MachineRegisterInfo &MRI,
+                                 GISelChangeObserver &Observer,
+                                 BuildFnTy &MatchInfo) {
+
+  if (!MemsetOptimizations)
+    return false;
+
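+  // Minimum width, in bytes, for a store we consider "vector-sized"
+  // (presumably 128-bit vector registers on this target).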
+  const uint64_t MinVectorStoreSize = 16;
+  MachineMemOperand *MMO = StMI.memoperands().front();
+
+  if (!MMO)
+    return false;
+  const Align MMOAlign = MMO->getAlign();
+
+  if (MMOAlign.value() < 4)
+    return false;
+
+  const Register DataReg = StMI.getValueReg();
+  const LLT DataType = MRI.getType(DataReg);
+
+  // Small alignments are less interesting; we are mainly trying to match
+  // vector stores here, even though we can also merge some byte/short
+  // stores into word stores.
+  const bool IsVectorAlignment = MMOAlign.value() >= MinVectorStoreSize;
+  // We can merge over-aligned small types, provided that the root of the
+  // memset (the first store) is a byte or short store. The goal is to fold
+  // this store with the ones that follow.
+  if (!IsVectorAlignment && DataType.getSizeInBytes() >= 4)
+    return false;
+
+  // A returned count of zero means that the store does not store zero at all.
+  auto GetZeroStoreSizeInBytes = [&](GStore &CurrSt) -> unsigned {
+    const Register DataReg = CurrSt.getValueReg();
+    const LLT DataType = MRI.getType(DataReg);
+    MachineMemOperand *CurrMMO = CurrSt.memoperands().front();
+
+    if (!CurrMMO)
+      return 0;
+
+    // A store at least as wide as the root store's alignment is already a
+    // vector store; don't merge it.
+    if (DataType.getSizeInBytes() >= MMOAlign.value())
+      return 0;
+    auto Cst = getIConstantVRegValWithLookThrough(DataReg, MRI);
+    return (Cst && Cst->Value.isZero()) ? DataType.getSizeInBytes() : 0;
+  };
+
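+  // Decompose the root store's address into a base pointer plus constant
+  // byte offset.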
+  auto PtrAndOffset = getPtrAndConstantOffsetFromStore(&StMI, MRI);
+  if (!PtrAndOffset)
+    return false;
+  const auto [Ptr, Offset] = *PtrAndOffset;
+
+  const unsigned ZeroBytes = GetZeroStoreSizeInBytes(StMI);
+  if (!ZeroBytes)
+    return false;
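+  // Running end offset: the next zero store must begin exactly here for the
+  // run to remain contiguous.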
+  int64_t ExpectedOffset = Offset + ZeroBytes;
+
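+  // Scan forward in the block, collecting zero stores that extend the run
+  // contiguously from the same base pointer.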
+  std::vector<GStore *> MatchedSeqStores;
+  for (MachineInstr &MI : make_range(std::next(StMI.getIterator()),
+                                     StMI.getParent()->instr_end())) {
+    if (auto *CurrSt = dyn_cast<GStore>(&MI)) {
+      const unsigned ZeroBytes = GetZeroStoreSizeInBytes(*CurrSt);
+      if (!ZeroBytes) // Non-zero store.
+        break;
+
+      auto PtrAndOffset = getPtrAndConstantOffsetFromStore(CurrSt, MRI);
+      if (!PtrAndOffset) // Non-constant offset.
+        break;
+
+      auto [CurrPtr, CurrOffset] = *PtrAndOffset;
+
+      // The base pointers differ, or the store does not extend the run
+      // contiguously.
+      if ((CurrPtr != Ptr) || (ExpectedOffset != CurrOffset))
+        break;
+
+      MatchedSeqStores.push_back(CurrSt);
+      ExpectedOffset += ZeroBytes;
+
+    } else if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects()) {
+      // Bail out to prevent problems related to store reordering.
+      break;
+    }
+  }
+
+  if (MatchedSeqStores.empty())
+    return false;
+
+  const unsigned NumberOfBytes = ExpectedOffset - Offset;
+  // If we cannot fill a vector, skip: the memset would be scalarized again
+  // and then re-matched here, in a loop. However, if we have at least two
+  // scalars to merge, go for it. The rationale for scalar merging is: if the
+  // first scalar store is smaller than the alignment, combining it with the
+  // next ones gives us a chance to reduce the number of stores.
+  // For example:
+  //   STORE i8  0, [p0] (Align 16)
+  //   STORE i16 0, [p0+1]
+  //   STORE i8  0, [p0+3]
+  // will be transformed into:
+  //   STORE i32 0, [p0] (Align 16)
+  if (IsVectorAlignment && NumberOfBytes < MinVectorStoreSize)
+    return false;
+
+  // In a pessimistic case, for example:
+  //   STORE i8  0, [p0] (Align 16)
+  //   STORE i16 0, [p0+1]
+  // the result would merely be "rotated":
+  //   STORE i16 0, [p0] (Align 16)
+  //   STORE i8  0, [p0+2]
+  // This would loop forever. We prevent it by rejecting combinations that
+  // would expand back to the same types: 3 bytes.
+  if (NumberOfBytes == 3)
+    return false;
+
+  MatchInfo = [=, &StMI, &MRI, &Observer](MachineIRBuilder &B) {
+    auto &MF = B.getMF();
+
+    MachineMemOperand *NewMMO = MF.getMachineMemOperand(
+        MMO->getPointerInfo(), MMO->getFlags(), 8, MMOAlign);
+
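+    // G_MEMSET takes the destination pointer, the fill value (a zero byte
+    // here), and the length in bytes (built as a 20-bit scalar, presumably
+    // the target's size type); the trailing immediate marks it as not a
+    // tail call.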
+    const Register MemsetDataReg = B.buildConstant(LLT::scalar(8), 0).getReg(0);
+    const Register MemsetCountReg =
+        B.buildConstant(LLT::scalar(20), NumberOfBytes).getReg(0);
+    B.buildInstr(TargetOpcode::G_MEMSET, {},
+                 {StMI.getPointerReg(), MemsetDataReg, MemsetCountReg})
+        .addImm(0)
+        ->addMemOperand(MF, NewMMO);
+    Observer.erasingInstr(StMI);
+    StMI.eraseFromParent();
+
+    // Tricky part: we cannot erase the matched stores here, so we make them
+    // dead instead.
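+    // (Erasing them directly could invalidate the combiner's iteration over
+    // the block; a dead G_ADD is harmless and cheap for DCE to remove.)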
+    for (GStore *ToDeleteMI : MatchedSeqStores) {
+      Observer.changingInstr(*ToDeleteMI);
+      makeStoreDead(ToDeleteMI, B.getTII(), MRI);
+      Observer.changedInstr(*ToDeleteMI);
+    }
+  };
+
+  return true;
+}