Skip to content

Commit d72b780

Browse files
committed
[AIEX] Add optimal Zero Store combiner
In the end, we try to reach an optimal number of stores using the synergy with the memset expander.
1 parent 5661c17 commit d72b780

File tree

7 files changed

+251
-129
lines changed

7 files changed

+251
-129
lines changed

llvm/lib/Target/AIE/AIECombine.td

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,12 @@ def combine_peel_memset : GICombineRule<
234234
[{ return matchPeelMemset(*${root}, MRI, (const AIEBaseInstrInfo &)B.getTII(), Observer, ${matchinfo}); }]),
235235
(apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>;
236236

237+
def combine_pack_stores_into_memset : GICombineRule<
238+
(defs root:$root, build_fn_matchinfo:$matchinfo),
239+
(match (wip_match_opcode G_STORE): $root,
240+
[{ return matchSequentialStores(cast<GStore>(*${root}), MRI, Observer, ${matchinfo}); }]),
241+
(apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>;
242+
237243
// AIE-specific combines (currently shared by AIE2 and AIE2P).
238244
def aie_additional_combines : GICombineGroup<[
239245
combine_unpad_vector,
@@ -254,7 +260,8 @@ def aie_additional_combines : GICombineGroup<[
254260
combine_load_const,
255261
combine_phi_undef,
256262
combine_align_memset,
257-
combine_peel_memset
263+
combine_peel_memset,
264+
combine_pack_stores_into_memset
258265
]>;
259266

260267
// AIE2P-specific combines.

llvm/lib/Target/AIE/AIECombinerHelper.cpp

Lines changed: 180 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3875,6 +3875,16 @@ template <uint64_t TargetAlign> constexpr bool matchAlignment(uint64_t Value) {
38753875
return isAligned(Align(TargetAlign), Value);
38763876
}
38773877

3878+
static std::optional<std::pair<Register, int64_t>>
3879+
getPtrAndConstantOffsetFromReg(Register PtrReg, MachineRegisterInfo &MRI) {
3880+
3881+
const MachineInstr *DefPtrReg = MRI.getVRegDef(PtrReg);
3882+
if (DefPtrReg->getOpcode() == TargetOpcode::G_PTR_ADD)
3883+
return getPtrAndConstantOffset(DefPtrReg, 1, MRI);
3884+
3885+
return std::make_pair(PtrReg, 0);
3886+
}
3887+
38783888
static bool isBasePointerWordAligned(Register BasePtr,
38793889
MachineRegisterInfo &MRI) {
38803890

@@ -3948,17 +3958,12 @@ bool llvm::matchPeelMemset(MachineInstr &MI, MachineRegisterInfo &MRI,
39483958
return false;
39493959
const uint64_t Initializer = CstInit->Value.getZExtValue();
39503960

3951-
unsigned Offset = 0;
3952-
const MachineInstr *DefPtrReg = MRI.getVRegDef(PtrReg);
3953-
if (DefPtrReg->getOpcode() == TargetOpcode::G_PTR_ADD) {
3954-
const auto RegAndOffset = getPtrAndConstantOffset(DefPtrReg, 1, MRI);
3961+
const auto RegAndOffset = getPtrAndConstantOffsetFromReg(PtrReg, MRI);
3962+
if (!RegAndOffset)
3963+
return false;
39553964

3956-
if (!RegAndOffset)
3957-
return false;
3958-
3959-
Offset = RegAndOffset->second;
3960-
PtrReg = RegAndOffset->first;
3961-
}
3965+
const int64_t Offset = RegAndOffset->second;
3966+
PtrReg = RegAndOffset->first;
39623967

39633968
// Next step is to prove that the base pointer is word-aligned.
39643969
// As we cannot assume, we can search for aligned uses of the base pointer.
@@ -4029,3 +4034,168 @@ bool llvm::matchPeelMemset(MachineInstr &MI, MachineRegisterInfo &MRI,
40294034

40304035
return true;
40314036
}
4037+
4038+
static std::optional<std::pair<Register, int64_t>>
4039+
getPtrAndConstantOffsetFromStore(const GStore *StMI, MachineRegisterInfo &MRI) {
4040+
return getPtrAndConstantOffsetFromReg(StMI->getPointerReg(), MRI);
4041+
}
4042+
4043+
// To make a store dead, we convert it to a dead G_ADD and let DCE do the
4044+
// removal
4045+
static void makeStoreDead(GStore *StMI, const TargetInstrInfo &TII,
4046+
MachineRegisterInfo &MRI) {
4047+
MachineFunction &MF = *StMI->getMF();
4048+
const Register DataReg = StMI->getValueReg();
4049+
StMI->dropMemRefs(MF);
4050+
for (int I = StMI->getNumOperands() - 1; I >= 0; I--)
4051+
StMI->removeOperand(I);
4052+
StMI->setDesc(TII.get(TargetOpcode::G_ADD));
4053+
StMI->addOperand(
4054+
MachineOperand::CreateReg(MRI.cloneVirtualRegister(DataReg), true));
4055+
StMI->addOperand(MachineOperand::CreateReg(DataReg, false));
4056+
StMI->addOperand(MachineOperand::CreateReg(DataReg, false));
4057+
}
4058+
4059+
// This combiner tries to pack sequential zero stores into memsets.
4060+
// The goal is to reach an optimal number of stores provided we
4061+
// use it synergistically with the memset expand combiner.
4062+
bool llvm::matchSequentialStores(GStore &StMI, MachineRegisterInfo &MRI,
4063+
GISelChangeObserver &Observer,
4064+
BuildFnTy &MatchInfo) {
4065+
4066+
if (!MemsetOptimizations)
4067+
return false;
4068+
4069+
const uint64_t MinVectorStoreSize = 16;
4070+
MachineMemOperand *MMO = StMI.memoperands().front();
4071+
4072+
if (!MMO)
4073+
return false;
4074+
const Align MMOAlign = MMO->getAlign();
4075+
4076+
if (MMOAlign.value() < 4)
4077+
return false;
4078+
4079+
const Register DataReg = StMI.getValueReg();
4080+
const LLT DataType = MRI.getType(DataReg);
4081+
4082+
// Small alignments are less interesting, we are trying to match
4083+
// vector stores here. Even though we can match some byte/short
4084+
// stores to word stores.
4085+
const bool IsVectorAlignment = MMOAlign.value() >= MinVectorStoreSize;
4086+
// We can merge over aligned small types, provided that
4087+
// the root of the memset (first store) is a byte or short
4088+
// store. The goal is to fold this store with the next ones.
4089+
if (!IsVectorAlignment && DataType.getSizeInBytes() >= 4)
4090+
return false;
4091+
4092+
// A count of zero means that we are not storing zero at all.
4093+
auto GetZeroStoreSizeInBytes = [&](GStore &CurrSt) -> unsigned {
4094+
const Register DataReg = CurrSt.getValueReg();
4095+
const LLT DataType = MRI.getType(DataReg);
4096+
MachineMemOperand *CurrMMO = CurrSt.memoperands().front();
4097+
4098+
if (!CurrMMO)
4099+
return 0;
4100+
const Align CurrMMOAlign = MMO->getAlign();
4101+
4102+
// We already have a vector store, don't merge it.
4103+
if (DataType.getSizeInBytes() >= CurrMMOAlign.value())
4104+
return 0;
4105+
auto Cst = getIConstantVRegValWithLookThrough(DataReg, MRI);
4106+
return (Cst && Cst->Value.isZero()) ? DataType.getSizeInBytes() : 0;
4107+
};
4108+
4109+
auto PtrAndOffset = getPtrAndConstantOffsetFromStore(&StMI, MRI);
4110+
if (!PtrAndOffset)
4111+
return false;
4112+
const auto [Ptr, Offset] = *PtrAndOffset;
4113+
4114+
const unsigned ZeroBytes = GetZeroStoreSizeInBytes(StMI);
4115+
if (!ZeroBytes)
4116+
return false;
4117+
int64_t ExpectedOffset = Offset + ZeroBytes;
4118+
4119+
std::vector<GStore *> MatchedSeqStores;
4120+
for (MachineInstr &MI : make_range(std::next(StMI.getIterator()),
4121+
StMI.getParent()->instr_end())) {
4122+
if (auto *CurrSt = dyn_cast<GStore>(&MI)) {
4123+
const unsigned ZeroBytes = GetZeroStoreSizeInBytes(*CurrSt);
4124+
if (!ZeroBytes) // Non-zero store.
4125+
break;
4126+
4127+
auto PtrAndOffset = getPtrAndConstantOffsetFromStore(CurrSt, MRI);
4128+
if (!PtrAndOffset) // Non-constant offset.
4129+
break;
4130+
4131+
auto [CurrPtr, CurrOffset] = *PtrAndOffset;
4132+
4133+
// Pointers are different or we have non-linear store.
4134+
if ((CurrPtr != Ptr) || (ExpectedOffset != CurrOffset))
4135+
break;
4136+
4137+
MatchedSeqStores.push_back(CurrSt);
4138+
ExpectedOffset += ZeroBytes;
4139+
4140+
} else if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects()) {
4141+
// Bailout to prevent problems related to store reordering.
4142+
break;
4143+
}
4144+
}
4145+
4146+
if (MatchedSeqStores.empty())
4147+
return false;
4148+
4149+
const unsigned NumberOfBytes = ExpectedOffset - Offset;
4150+
// If we cannot fill a vector, skip, because we will scalarize again
4151+
// and this will be matched again in a loop. However, if we have at least two
4152+
// scalars to merge, go for it. The rationale for scalar merging is: If we
4153+
// have the first scalar store whose size is smaller than the alignment, by
4154+
// combining with the next, we have a chance of reducing the number of stores.
4155+
// For example:
4156+
// STORE i8 0, [p0] (Align 16)
4157+
// STORE i16 0, [p0+1]
4158+
// STORE i8 0, [p0+3]
4159+
// Will be transformed to:
4160+
// STORE i32 0, [p0] (Align 16)
4161+
if (IsVectorAlignment && NumberOfBytes < MinVectorStoreSize)
4162+
return false;
4163+
4164+
// In a pessimistic case, for example:
4165+
// STORE i8 0, [p0] (Align 16)
4166+
// STORE i16 0, [p0+1]
4167+
// The result will be just "rotated":
4168+
// STORE i16 0, [p0] (Align 16)
4169+
// STORE i8 0, [p0+2]
4170+
// This will cause a loop. We prevent this by restricting
4171+
// combinations that will expand again to the same
4172+
// types: 3 bytes.
4173+
if (NumberOfBytes == 3)
4174+
return false;
4175+
4176+
MatchInfo = [=, &StMI, &MRI, &Observer](MachineIRBuilder &B) {
4177+
auto &MF = B.getMF();
4178+
4179+
MachineMemOperand *NewMMO = MF.getMachineMemOperand(
4180+
MMO->getPointerInfo(), MMO->getFlags(), 8, MMOAlign);
4181+
4182+
const Register MemsetDataReg = B.buildConstant(LLT::scalar(8), 0).getReg(0);
4183+
const Register MemsetCountReg =
4184+
B.buildConstant(LLT::scalar(20), NumberOfBytes).getReg(0);
4185+
B.buildInstr(TargetOpcode::G_MEMSET, {},
4186+
{StMI.getPointerReg(), MemsetDataReg, MemsetCountReg})
4187+
.addImm(0)
4188+
->addMemOperand(MF, NewMMO);
4189+
Observer.erasingInstr(StMI);
4190+
StMI.eraseFromParent();
4191+
4192+
// Tricky part: we cannot erase the matched stores, make them dead.
4193+
for (GStore *ToDeleteMI : MatchedSeqStores) {
4194+
Observer.changingInstr(*ToDeleteMI);
4195+
makeStoreDead(ToDeleteMI, B.getTII(), MRI);
4196+
Observer.changedInstr(*ToDeleteMI);
4197+
}
4198+
};
4199+
4200+
return true;
4201+
}

llvm/lib/Target/AIE/AIECombinerHelper.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,9 @@ bool matchAlignMemset(MachineInstr &MI, MachineRegisterInfo &MRI,
295295
bool matchPeelMemset(MachineInstr &MI, MachineRegisterInfo &MRI,
296296
const AIEBaseInstrInfo &TII, GISelChangeObserver &Observer,
297297
BuildFnTy &MatchInfo);
298+
299+
bool matchSequentialStores(GStore &MI, MachineRegisterInfo &MRI,
300+
GISelChangeObserver &Observer, BuildFnTy &MatchInfo);
298301
} // namespace llvm
299302

300303
#endif

llvm/test/CodeGen/AIE/GlobalISel/combine-memset-stack-align.mir

Lines changed: 16 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -149,26 +149,16 @@ body: |
149149
; CHECK-NEXT: [[AIE_BROADCAST_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_AIE_BROADCAST_VECTOR [[C]](s32)
150150
; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_AIE_UNPAD_VECTOR [[AIE_BROADCAST_VECTOR]](<16 x s32>)
151151
; CHECK-NEXT: G_STORE [[AIE_UNPAD_VECTOR]](<8 x s32>), [[FRAME_INDEX]](p0) :: (store (<8 x s32>) into %stack.0.object0)
152-
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
153-
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
154-
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C2]](s20)
155-
; CHECK-NEXT: G_STORE [[C1]](s32), [[PTR_ADD]](p0) :: (store (s32) into %stack.0.object0 + 32, align 32)
152+
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
153+
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s20)
154+
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
155+
; CHECK-NEXT: [[AIE_BROADCAST_VECTOR1:%[0-9]+]]:_(<16 x s32>) = G_AIE_BROADCAST_VECTOR [[C2]](s32)
156+
; CHECK-NEXT: [[AIE_UNPAD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[AIE_BROADCAST_VECTOR1]](<16 x s32>)
157+
; CHECK-NEXT: G_STORE [[AIE_UNPAD_VECTOR1]](<4 x s32>), [[PTR_ADD]](p0) :: (store (<4 x s32>) into %stack.0.object0 + 32, align 32)
156158
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
157-
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 36
159+
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 48
158160
; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C4]](s20)
159-
; CHECK-NEXT: G_STORE [[C3]](s32), [[PTR_ADD1]](p0) :: (store (s32) into %stack.0.object0 + 36, basealign 32)
160-
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
161-
; CHECK-NEXT: [[C6:%[0-9]+]]:_(s20) = G_CONSTANT i20 40
162-
; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C6]](s20)
163-
; CHECK-NEXT: G_STORE [[C5]](s32), [[PTR_ADD2]](p0) :: (store (s32) into %stack.0.object0 + 40, align 8, basealign 32)
164-
; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
165-
; CHECK-NEXT: [[C8:%[0-9]+]]:_(s20) = G_CONSTANT i20 44
166-
; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C8]](s20)
167-
; CHECK-NEXT: G_STORE [[C7]](s32), [[PTR_ADD3]](p0) :: (store (s32) into %stack.0.object0 + 44, basealign 32)
168-
; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
169-
; CHECK-NEXT: [[C10:%[0-9]+]]:_(s20) = G_CONSTANT i20 48
170-
; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C10]](s20)
171-
; CHECK-NEXT: G_STORE [[C9]](s32), [[PTR_ADD4]](p0) :: (store (s32) into %stack.0.object0 + 48, align 16, basealign 32)
161+
; CHECK-NEXT: G_STORE [[C3]](s32), [[PTR_ADD1]](p0) :: (store (s32) into %stack.0.object0 + 48, align 16, basealign 32)
172162
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
173163
; CHECK-NEXT: $p0 = COPY [[FRAME_INDEX]](p0)
174164
; CHECK-NEXT: PseudoJL @prevent_dce, csr_aie2p, implicit-def $lr, implicit $p0
@@ -181,26 +171,16 @@ body: |
181171
; LEGALIZED-NEXT: [[AIE_BROADCAST_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_AIE_BROADCAST_VECTOR [[C]](s32)
182172
; LEGALIZED-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_AIE_UNPAD_VECTOR [[AIE_BROADCAST_VECTOR]](<16 x s32>)
183173
; LEGALIZED-NEXT: G_STORE [[AIE_UNPAD_VECTOR]](<8 x s32>), [[FRAME_INDEX]](p0) :: (store (<8 x s32>) into %stack.0.object0)
184-
; LEGALIZED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
185-
; LEGALIZED-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
186-
; LEGALIZED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C2]](s20)
187-
; LEGALIZED-NEXT: G_STORE [[C1]](s32), [[PTR_ADD]](p0) :: (store (s32) into %stack.0.object0 + 32, align 32)
174+
; LEGALIZED-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
175+
; LEGALIZED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s20)
176+
; LEGALIZED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
177+
; LEGALIZED-NEXT: [[AIE_BROADCAST_VECTOR1:%[0-9]+]]:_(<16 x s32>) = G_AIE_BROADCAST_VECTOR [[C2]](s32)
178+
; LEGALIZED-NEXT: [[AIE_UNPAD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[AIE_BROADCAST_VECTOR1]](<16 x s32>)
179+
; LEGALIZED-NEXT: G_STORE [[AIE_UNPAD_VECTOR1]](<4 x s32>), [[PTR_ADD]](p0) :: (store (<4 x s32>) into %stack.0.object0 + 32, align 32)
188180
; LEGALIZED-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
189-
; LEGALIZED-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 36
181+
; LEGALIZED-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 48
190182
; LEGALIZED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C4]](s20)
191-
; LEGALIZED-NEXT: G_STORE [[C3]](s32), [[PTR_ADD1]](p0) :: (store (s32) into %stack.0.object0 + 36, basealign 32)
192-
; LEGALIZED-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
193-
; LEGALIZED-NEXT: [[C6:%[0-9]+]]:_(s20) = G_CONSTANT i20 40
194-
; LEGALIZED-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C6]](s20)
195-
; LEGALIZED-NEXT: G_STORE [[C5]](s32), [[PTR_ADD2]](p0) :: (store (s32) into %stack.0.object0 + 40, align 8, basealign 32)
196-
; LEGALIZED-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
197-
; LEGALIZED-NEXT: [[C8:%[0-9]+]]:_(s20) = G_CONSTANT i20 44
198-
; LEGALIZED-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C8]](s20)
199-
; LEGALIZED-NEXT: G_STORE [[C7]](s32), [[PTR_ADD3]](p0) :: (store (s32) into %stack.0.object0 + 44, basealign 32)
200-
; LEGALIZED-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
201-
; LEGALIZED-NEXT: [[C10:%[0-9]+]]:_(s20) = G_CONSTANT i20 48
202-
; LEGALIZED-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C10]](s20)
203-
; LEGALIZED-NEXT: G_STORE [[C9]](s32), [[PTR_ADD4]](p0) :: (store (s32) into %stack.0.object0 + 48, align 16, basealign 32)
183+
; LEGALIZED-NEXT: G_STORE [[C3]](s32), [[PTR_ADD1]](p0) :: (store (s32) into %stack.0.object0 + 48, align 16, basealign 32)
204184
; LEGALIZED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
205185
; LEGALIZED-NEXT: $p0 = COPY [[FRAME_INDEX]](p0)
206186
; LEGALIZED-NEXT: PseudoJL @prevent_dce, csr_aie2p, implicit-def $lr, implicit $p0

0 commit comments

Comments
 (0)