Skip to content

Commit d72b780

Browse files
committed
[AIEX] Add optimal Zero Store combiner
In the end, we try to reach an optimal number of stores using the synergy with the memset expander.
1 parent 5661c17 commit d72b780

File tree

7 files changed

+251
-129
lines changed

7 files changed

+251
-129
lines changed

llvm/lib/Target/AIE/AIECombine.td

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,12 @@ def combine_peel_memset : GICombineRule<
234234
[{ return matchPeelMemset(*${root}, MRI, (const AIEBaseInstrInfo &)B.getTII(), Observer, ${matchinfo}); }]),
235235
(apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>;
236236

237+
def combine_pack_stores_into_memset : GICombineRule<
238+
(defs root:$root, build_fn_matchinfo:$matchinfo),
239+
(match (wip_match_opcode G_STORE): $root,
240+
[{ return matchSequentialStores(cast<GStore>(*${root}), MRI, Observer, ${matchinfo}); }]),
241+
(apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>;
242+
237243
// AIE-specific combines (currently shared by AIE2 and AIE2P).
238244
def aie_additional_combines : GICombineGroup<[
239245
combine_unpad_vector,
@@ -254,7 +260,8 @@ def aie_additional_combines : GICombineGroup<[
254260
combine_load_const,
255261
combine_phi_undef,
256262
combine_align_memset,
257-
combine_peel_memset
263+
combine_peel_memset,
264+
combine_pack_stores_into_memset
258265
]>;
259266

260267
// AIE2P-specific combines.

llvm/lib/Target/AIE/AIECombinerHelper.cpp

Lines changed: 180 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3875,6 +3875,16 @@ template <uint64_t TargetAlign> constexpr bool matchAlignment(uint64_t Value) {
38753875
return isAligned(Align(TargetAlign), Value);
38763876
}
38773877

3878+
static std::optional<std::pair<Register, int64_t>>
3879+
getPtrAndConstantOffsetFromReg(Register PtrReg, MachineRegisterInfo &MRI) {
3880+
3881+
const MachineInstr *DefPtrReg = MRI.getVRegDef(PtrReg);
3882+
if (DefPtrReg->getOpcode() == TargetOpcode::G_PTR_ADD)
3883+
return getPtrAndConstantOffset(DefPtrReg, 1, MRI);
3884+
3885+
return std::make_pair(PtrReg, 0);
3886+
}
3887+
38783888
static bool isBasePointerWordAligned(Register BasePtr,
38793889
MachineRegisterInfo &MRI) {
38803890

@@ -3948,17 +3958,12 @@ bool llvm::matchPeelMemset(MachineInstr &MI, MachineRegisterInfo &MRI,
39483958
return false;
39493959
const uint64_t Initializer = CstInit->Value.getZExtValue();
39503960

3951-
unsigned Offset = 0;
3952-
const MachineInstr *DefPtrReg = MRI.getVRegDef(PtrReg);
3953-
if (DefPtrReg->getOpcode() == TargetOpcode::G_PTR_ADD) {
3954-
const auto RegAndOffset = getPtrAndConstantOffset(DefPtrReg, 1, MRI);
3961+
const auto RegAndOffset = getPtrAndConstantOffsetFromReg(PtrReg, MRI);
3962+
if (!RegAndOffset)
3963+
return false;
39553964

3956-
if (!RegAndOffset)
3957-
return false;
3958-
3959-
Offset = RegAndOffset->second;
3960-
PtrReg = RegAndOffset->first;
3961-
}
3965+
const int64_t Offset = RegAndOffset->second;
3966+
PtrReg = RegAndOffset->first;
39623967

39633968
// Next step is to prove that the base pointer is word-aligned.
39643969
// As we cannot assume, we can search for aligned uses of the base pointer.
@@ -4029,3 +4034,168 @@ bool llvm::matchPeelMemset(MachineInstr &MI, MachineRegisterInfo &MRI,
40294034

40304035
return true;
40314036
}
4037+
4038+
static std::optional<std::pair<Register, int64_t>>
4039+
getPtrAndConstantOffsetFromStore(const GStore *StMI, MachineRegisterInfo &MRI) {
4040+
return getPtrAndConstantOffsetFromReg(StMI->getPointerReg(), MRI);
4041+
}
4042+
4043+
// To make a store dead, we convert it to a dead G_ADD and let DCE do the
4044+
// removal
4045+
static void makeStoreDead(GStore *StMI, const TargetInstrInfo &TII,
4046+
MachineRegisterInfo &MRI) {
4047+
MachineFunction &MF = *StMI->getMF();
4048+
const Register DataReg = StMI->getValueReg();
4049+
StMI->dropMemRefs(MF);
4050+
for (int I = StMI->getNumOperands() - 1; I >= 0; I--)
4051+
StMI->removeOperand(I);
4052+
StMI->setDesc(TII.get(TargetOpcode::G_ADD));
4053+
StMI->addOperand(
4054+
MachineOperand::CreateReg(MRI.cloneVirtualRegister(DataReg), true));
4055+
StMI->addOperand(MachineOperand::CreateReg(DataReg, false));
4056+
StMI->addOperand(MachineOperand::CreateReg(DataReg, false));
4057+
}
4058+
4059+
// This combiner tries to pack sequential zero stores into memsets.
4060+
// The goal is to reach an optimal number of stores provided we
4061+
// use it synergistically with the memset expand combiner.
4062+
bool llvm::matchSequentialStores(GStore &StMI, MachineRegisterInfo &MRI,
4063+
GISelChangeObserver &Observer,
4064+
BuildFnTy &MatchInfo) {
4065+
4066+
if (!MemsetOptimizations)
4067+
return false;
4068+
4069+
const uint64_t MinVectorStoreSize = 16;
4070+
MachineMemOperand *MMO = StMI.memoperands().front();
4071+
4072+
if (!MMO)
4073+
return false;
4074+
const Align MMOAlign = MMO->getAlign();
4075+
4076+
if (MMOAlign.value() < 4)
4077+
return false;
4078+
4079+
const Register DataReg = StMI.getValueReg();
4080+
const LLT DataType = MRI.getType(DataReg);
4081+
4082+
// Small alignments are less interesting, we are trying to match
4083+
// vector stores here. Even though we can match some byte/short
4084+
// stores to word stores.
4085+
const bool IsVectorAlignment = MMOAlign.value() >= MinVectorStoreSize;
4086+
// We can merge over aligned small types, provided that
4087+
// the root of the memset (first store) is a byte or short
4088+
// store. The goal is to fold this store with the next ones.
4089+
if (!IsVectorAlignment && DataType.getSizeInBytes() >= 4)
4090+
return false;
4091+
4092+
// A count of zero means that we are not storing zero at all.
4093+
auto GetZeroStoreSizeInBytes = [&](GStore &CurrSt) -> unsigned {
4094+
const Register DataReg = CurrSt.getValueReg();
4095+
const LLT DataType = MRI.getType(DataReg);
4096+
MachineMemOperand *CurrMMO = CurrSt.memoperands().front();
4097+
4098+
if (!CurrMMO)
4099+
return 0;
4100+
const Align CurrMMOAlign = MMO->getAlign();
4101+
4102+
// We already have a vector store, don't merge it.
4103+
if (DataType.getSizeInBytes() >= CurrMMOAlign.value())
4104+
return 0;
4105+
auto Cst = getIConstantVRegValWithLookThrough(DataReg, MRI);
4106+
return (Cst && Cst->Value.isZero()) ? DataType.getSizeInBytes() : 0;
4107+
};
4108+
4109+
auto PtrAndOffset = getPtrAndConstantOffsetFromStore(&StMI, MRI);
4110+
if (!PtrAndOffset)
4111+
return false;
4112+
const auto [Ptr, Offset] = *PtrAndOffset;
4113+
4114+
const unsigned ZeroBytes = GetZeroStoreSizeInBytes(StMI);
4115+
if (!ZeroBytes)
4116+
return false;
4117+
int64_t ExpectedOffset = Offset + ZeroBytes;
4118+
4119+
std::vector<GStore *> MatchedSeqStores;
4120+
for (MachineInstr &MI : make_range(std::next(StMI.getIterator()),
4121+
StMI.getParent()->instr_end())) {
4122+
if (auto *CurrSt = dyn_cast<GStore>(&MI)) {
4123+
const unsigned ZeroBytes = GetZeroStoreSizeInBytes(*CurrSt);
4124+
if (!ZeroBytes) // Non-zero store.
4125+
break;
4126+
4127+
auto PtrAndOffset = getPtrAndConstantOffsetFromStore(CurrSt, MRI);
4128+
if (!PtrAndOffset) // Non-constant offset.
4129+
break;
4130+
4131+
auto [CurrPtr, CurrOffset] = *PtrAndOffset;
4132+
4133+
// Pointers are different or we have non-linear store.
4134+
if ((CurrPtr != Ptr) || (ExpectedOffset != CurrOffset))
4135+
break;
4136+
4137+
MatchedSeqStores.push_back(CurrSt);
4138+
ExpectedOffset += ZeroBytes;
4139+
4140+
} else if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects()) {
4141+
// Bailout to prevent problems related to store reordering.
4142+
break;
4143+
}
4144+
}
4145+
4146+
if (MatchedSeqStores.empty())
4147+
return false;
4148+
4149+
const unsigned NumberOfBytes = ExpectedOffset - Offset;
4150+
// If we cannot fill a vector, skip, because we will scalarize again
4151+
// and this will be matched again in a loop. However, if we have at least two
4152+
// scalars to merge, go for it. The rationale for scalar merging is: If we
4153+
// have the first scalar store whose size is smaller than the alignment, by
4154+
// combining with the next, we have a chance of reducing the number of stores.
4155+
// For example:
4156+
// STORE i8 0, [p0] (Align 16)
4157+
// STORE i16 0, [p0+1]
4158+
// STORE i8 0, [p0+3]
4159+
// Will be transformed to:
4160+
// STORE i32 0, [p0] (Align 16)
4161+
if (IsVectorAlignment && NumberOfBytes < MinVectorStoreSize)
4162+
return false;
4163+
4164+
// In a pessimistic case, for example:
4165+
// STORE i8 0, [p0] (Align 16)
4166+
// STORE i16 0, [p0+1]
4167+
// The result will be just "rotated":
4168+
// STORE i16 0, [p0] (Align 16)
4169+
// STORE i8 0, [p0+2]
4170+
// This will cause a loop. We prevent this by restricting
4171+
// combinations that will expand again to the same
4172+
// types: 3 bytes.
4173+
if (NumberOfBytes == 3)
4174+
return false;
4175+
4176+
MatchInfo = [=, &StMI, &MRI, &Observer](MachineIRBuilder &B) {
4177+
auto &MF = B.getMF();
4178+
4179+
MachineMemOperand *NewMMO = MF.getMachineMemOperand(
4180+
MMO->getPointerInfo(), MMO->getFlags(), 8, MMOAlign);
4181+
4182+
const Register MemsetDataReg = B.buildConstant(LLT::scalar(8), 0).getReg(0);
4183+
const Register MemsetCountReg =
4184+
B.buildConstant(LLT::scalar(20), NumberOfBytes).getReg(0);
4185+
B.buildInstr(TargetOpcode::G_MEMSET, {},
4186+
{StMI.getPointerReg(), MemsetDataReg, MemsetCountReg})
4187+
.addImm(0)
4188+
->addMemOperand(MF, NewMMO);
4189+
Observer.erasingInstr(StMI);
4190+
StMI.eraseFromParent();
4191+
4192+
// Tricky part: we cannot erase the matched stores, make them dead.
4193+
for (GStore *ToDeleteMI : MatchedSeqStores) {
4194+
Observer.changingInstr(*ToDeleteMI);
4195+
makeStoreDead(ToDeleteMI, B.getTII(), MRI);
4196+
Observer.changedInstr(*ToDeleteMI);
4197+
}
4198+
};
4199+
4200+
return true;
4201+
}

llvm/lib/Target/AIE/AIECombinerHelper.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,9 @@ bool matchAlignMemset(MachineInstr &MI, MachineRegisterInfo &MRI,
295295
bool matchPeelMemset(MachineInstr &MI, MachineRegisterInfo &MRI,
296296
const AIEBaseInstrInfo &TII, GISelChangeObserver &Observer,
297297
BuildFnTy &MatchInfo);
298+
299+
bool matchSequentialStores(GStore &MI, MachineRegisterInfo &MRI,
300+
GISelChangeObserver &Observer, BuildFnTy &MatchInfo);
298301
} // namespace llvm
299302

300303
#endif

llvm/test/CodeGen/AIE/GlobalISel/combine-memset-stack-align.mir

Lines changed: 16 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -149,26 +149,16 @@ body: |
149149
; CHECK-NEXT: [[AIE_BROADCAST_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_AIE_BROADCAST_VECTOR [[C]](s32)
150150
; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_AIE_UNPAD_VECTOR [[AIE_BROADCAST_VECTOR]](<16 x s32>)
151151
; CHECK-NEXT: G_STORE [[AIE_UNPAD_VECTOR]](<8 x s32>), [[FRAME_INDEX]](p0) :: (store (<8 x s32>) into %stack.0.object0)
152-
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
153-
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
154-
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C2]](s20)
155-
; CHECK-NEXT: G_STORE [[C1]](s32), [[PTR_ADD]](p0) :: (store (s32) into %stack.0.object0 + 32, align 32)
152+
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
153+
; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s20)
154+
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
155+
; CHECK-NEXT: [[AIE_BROADCAST_VECTOR1:%[0-9]+]]:_(<16 x s32>) = G_AIE_BROADCAST_VECTOR [[C2]](s32)
156+
; CHECK-NEXT: [[AIE_UNPAD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[AIE_BROADCAST_VECTOR1]](<16 x s32>)
157+
; CHECK-NEXT: G_STORE [[AIE_UNPAD_VECTOR1]](<4 x s32>), [[PTR_ADD]](p0) :: (store (<4 x s32>) into %stack.0.object0 + 32, align 32)
156158
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
157-
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 36
159+
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 48
158160
; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C4]](s20)
159-
; CHECK-NEXT: G_STORE [[C3]](s32), [[PTR_ADD1]](p0) :: (store (s32) into %stack.0.object0 + 36, basealign 32)
160-
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
161-
; CHECK-NEXT: [[C6:%[0-9]+]]:_(s20) = G_CONSTANT i20 40
162-
; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C6]](s20)
163-
; CHECK-NEXT: G_STORE [[C5]](s32), [[PTR_ADD2]](p0) :: (store (s32) into %stack.0.object0 + 40, align 8, basealign 32)
164-
; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
165-
; CHECK-NEXT: [[C8:%[0-9]+]]:_(s20) = G_CONSTANT i20 44
166-
; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C8]](s20)
167-
; CHECK-NEXT: G_STORE [[C7]](s32), [[PTR_ADD3]](p0) :: (store (s32) into %stack.0.object0 + 44, basealign 32)
168-
; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
169-
; CHECK-NEXT: [[C10:%[0-9]+]]:_(s20) = G_CONSTANT i20 48
170-
; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C10]](s20)
171-
; CHECK-NEXT: G_STORE [[C9]](s32), [[PTR_ADD4]](p0) :: (store (s32) into %stack.0.object0 + 48, align 16, basealign 32)
161+
; CHECK-NEXT: G_STORE [[C3]](s32), [[PTR_ADD1]](p0) :: (store (s32) into %stack.0.object0 + 48, align 16, basealign 32)
172162
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
173163
; CHECK-NEXT: $p0 = COPY [[FRAME_INDEX]](p0)
174164
; CHECK-NEXT: PseudoJL @prevent_dce, csr_aie2p, implicit-def $lr, implicit $p0
@@ -181,26 +171,16 @@ body: |
181171
; LEGALIZED-NEXT: [[AIE_BROADCAST_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_AIE_BROADCAST_VECTOR [[C]](s32)
182172
; LEGALIZED-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_AIE_UNPAD_VECTOR [[AIE_BROADCAST_VECTOR]](<16 x s32>)
183173
; LEGALIZED-NEXT: G_STORE [[AIE_UNPAD_VECTOR]](<8 x s32>), [[FRAME_INDEX]](p0) :: (store (<8 x s32>) into %stack.0.object0)
184-
; LEGALIZED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
185-
; LEGALIZED-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
186-
; LEGALIZED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C2]](s20)
187-
; LEGALIZED-NEXT: G_STORE [[C1]](s32), [[PTR_ADD]](p0) :: (store (s32) into %stack.0.object0 + 32, align 32)
174+
; LEGALIZED-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 32
175+
; LEGALIZED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C1]](s20)
176+
; LEGALIZED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
177+
; LEGALIZED-NEXT: [[AIE_BROADCAST_VECTOR1:%[0-9]+]]:_(<16 x s32>) = G_AIE_BROADCAST_VECTOR [[C2]](s32)
178+
; LEGALIZED-NEXT: [[AIE_UNPAD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[AIE_BROADCAST_VECTOR1]](<16 x s32>)
179+
; LEGALIZED-NEXT: G_STORE [[AIE_UNPAD_VECTOR1]](<4 x s32>), [[PTR_ADD]](p0) :: (store (<4 x s32>) into %stack.0.object0 + 32, align 32)
188180
; LEGALIZED-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
189-
; LEGALIZED-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 36
181+
; LEGALIZED-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 48
190182
; LEGALIZED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C4]](s20)
191-
; LEGALIZED-NEXT: G_STORE [[C3]](s32), [[PTR_ADD1]](p0) :: (store (s32) into %stack.0.object0 + 36, basealign 32)
192-
; LEGALIZED-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
193-
; LEGALIZED-NEXT: [[C6:%[0-9]+]]:_(s20) = G_CONSTANT i20 40
194-
; LEGALIZED-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C6]](s20)
195-
; LEGALIZED-NEXT: G_STORE [[C5]](s32), [[PTR_ADD2]](p0) :: (store (s32) into %stack.0.object0 + 40, align 8, basealign 32)
196-
; LEGALIZED-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
197-
; LEGALIZED-NEXT: [[C8:%[0-9]+]]:_(s20) = G_CONSTANT i20 44
198-
; LEGALIZED-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C8]](s20)
199-
; LEGALIZED-NEXT: G_STORE [[C7]](s32), [[PTR_ADD3]](p0) :: (store (s32) into %stack.0.object0 + 44, basealign 32)
200-
; LEGALIZED-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
201-
; LEGALIZED-NEXT: [[C10:%[0-9]+]]:_(s20) = G_CONSTANT i20 48
202-
; LEGALIZED-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C10]](s20)
203-
; LEGALIZED-NEXT: G_STORE [[C9]](s32), [[PTR_ADD4]](p0) :: (store (s32) into %stack.0.object0 + 48, align 16, basealign 32)
183+
; LEGALIZED-NEXT: G_STORE [[C3]](s32), [[PTR_ADD1]](p0) :: (store (s32) into %stack.0.object0 + 48, align 16, basealign 32)
204184
; LEGALIZED-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
205185
; LEGALIZED-NEXT: $p0 = COPY [[FRAME_INDEX]](p0)
206186
; LEGALIZED-NEXT: PseudoJL @prevent_dce, csr_aie2p, implicit-def $lr, implicit $p0

0 commit comments

Comments
 (0)