
Commit 38569d9

Authored and committed by Vasileios Porpodas

[Spill2Reg] Add live register tracking
This patch implements tracking of live registers, which is used to look for
free vector registers. It works by walking up the CFG from the reloads all the
way to the spills, accumulating the register units being used. This
implementation caches the live register units used by each MBB for faster
compilation time.

Note: Live register tracking relies on MBB live-ins/outs being maintained
correctly, which is implemented in a follow-up patch. So this patch still does
not generate correct code except in some simple cases.

Original review: https://reviews.llvm.org/D118303
1 parent 92e26f0 commit 38569d9
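
Not part of the patch, but as a rough sketch of the idea described above: LLVM's LiveRegUnits is seeded with a block's live-outs and accumulated while walking the block bottom-up, and can then be queried for a free register in a register class. The helper name findFreeRegInBlock below is hypothetical; the patch itself performs the per-block accumulation once in collectSpillsAndReloads() and caches it in the LRUs map so that calculateLiveRegs() can reuse it for intermediate blocks.

// Illustrative sketch only (not from this commit): accumulate LiveRegUnits
// bottom-up over one MachineBasicBlock, then look for a free register in a
// register class. findFreeRegInBlock is a hypothetical helper name.
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include <optional>

static std::optional<llvm::MCRegister>
findFreeRegInBlock(const llvm::MachineBasicBlock &MBB,
                   const llvm::TargetRegisterClass *RC,
                   const llvm::TargetRegisterInfo &TRI) {
  llvm::LiveRegUnits LRU(TRI);
  LRU.addLiveOuts(MBB);           // Seed with the block's live-out registers.
  for (const llvm::MachineInstr &MI : llvm::reverse(MBB))
    LRU.accumulate(MI);           // Record every def/use while walking upwards.
  for (llvm::MCPhysReg PhysReg : *RC)
    if (LRU.available(PhysReg))   // No register unit of PhysReg is in use.
      return llvm::MCRegister(PhysReg);
  return std::nullopt;
}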

12 files changed: +1170 -33 lines changed

llvm/lib/CodeGen/Spill2Reg.cpp

Lines changed: 130 additions & 5 deletions
@@ -99,6 +99,9 @@ class Spill2Reg : public MachineFunctionPass {
   /// Helper for generateCode(). It replaces stack spills or reloads with movs
   /// to \p LI.reg().
   void replaceStackWithReg(StackSlotDataEntry &Entry, Register VectorReg);
+  /// Updates the live-ins of MBBs after we emit the new spill2reg instructions
+  /// and the vector registers become live from register spills to reloads.
+  void updateLiveIns(StackSlotDataEntry &Entry, MCRegister VectorReg);
   /// Updates \p LRU with the liveness of physical registers around the spills
   /// and reloads in \p Entry.
   void calculateLiveRegs(StackSlotDataEntry &Entry, LiveRegUnits &LRU);
@@ -111,6 +114,9 @@ class Spill2Reg : public MachineFunctionPass {
 
   /// Map from a stack slot to the corresponding spills and reloads.
   DenseMap<int, StackSlotDataEntry> StackSlotData;
+  /// The registers used by each block (from LiveRegUnits). This is needed for
+  /// finding free physical registers in generateCode().
+  DenseMap<const MachineBasicBlock *, LiveRegUnits> LRUs;
 
   MachineFunction *MF = nullptr;
   MachineRegisterInfo *MRI = nullptr;
@@ -169,7 +175,16 @@ void Spill2Reg::collectSpillsAndReloads() {
   // If any spill/reload for a stack slot is found not to be eligible for
   // spill-to-reg, then that stack slot is disabled.
   for (MachineBasicBlock &MBB : *MF) {
-    for (MachineInstr &MI : MBB) {
+    // Initialize AccumMBBLRU for keeping track of physical registers used
+    // across the whole MBB.
+    LiveRegUnits AccumMBBLRU(*TRI);
+    AccumMBBLRU.addLiveOuts(MBB);
+
+    // Collect spills/reloads
+    for (MachineInstr &MI : llvm::reverse(MBB)) {
+      // Update the LRU state as we move upwards.
+      AccumMBBLRU.accumulate(MI);
+
       int StackSlot;
       if (const MachineOperand *MO = TII->isStoreToStackSlotMO(MI, StackSlot)) {
         MachineInstr *Spill = &MI;
@@ -203,6 +218,8 @@ void Spill2Reg::collectSpillsAndReloads() {
         }
       }
     }
+
+    LRUs.insert(std::make_pair(&MBB, AccumMBBLRU));
   }
 }
 
@@ -228,6 +245,26 @@ Spill2Reg::tryGetFreePhysicalReg(const TargetRegisterClass *RegClass,
   return std::nullopt;
 }
 
+/// Perform a bottom-up depth-first traversal from \p MBB towards its
+/// predecessor blocks. \p Visited marks the visited blocks. \p Fn is the
+/// callback function called in pre-order. If \p Fn returns true we stop the
+/// traversal.
+// TODO: Use df_iterator
+static void DFS(MachineBasicBlock *MBB, DenseSet<MachineBasicBlock *> &Visited,
+                std::function<bool(MachineBasicBlock *)> Fn) {
+  // Skip visited to avoid infinite loops.
+  if (Visited.count(MBB))
+    return;
+  Visited.insert(MBB);
+
+  // Preorder.
+  if (Fn(MBB))
+    return;
+
+  // Depth-first across predecessors.
+  for (MachineBasicBlock *PredMBB : MBB->predecessors())
+    DFS(PredMBB, Visited, Fn);
+}
 // Replace stack-based spills/reloads with register-based ones.
 void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
                                     Register VectorReg) {
@@ -236,10 +273,13 @@ void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
     assert(SpillData.MO->isReg() && "Expected register MO");
     Register OldReg = SpillData.MO->getReg();
 
-    MachineInstr *SpillToVector = TII->spill2RegInsertToVectorReg(
+    TII->spill2RegInsertToVectorReg(
         VectorReg, OldReg, SpillData.SpillBits, StackSpill->getParent(),
         /*InsertBeforeIt=*/StackSpill->getIterator(), TRI);
 
+    // Mark VectorReg as live in the instr's BB.
+    LRUs[StackSpill->getParent()].addReg(VectorReg);
+
     // Spill to stack is no longer needed.
     StackSpill->eraseFromParent();
     assert(OldReg.isPhysical() && "Otherwise we need to removeInterval()");
@@ -250,10 +290,13 @@ void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
     assert(ReloadData.MO->isReg() && "Expected Reg MO");
     Register OldReg = ReloadData.MO->getReg();
 
-    MachineInstr *ReloadFromReg = TII->spill2RegExtractFromVectorReg(
+    TII->spill2RegExtractFromVectorReg(
         OldReg, VectorReg, ReloadData.SpillBits, StackReload->getParent(),
         /*InsertBeforeIt=*/StackReload->getIterator(), TRI);
 
+    // Mark VectorReg as live in the instr's BB.
+    LRUs[StackReload->getParent()].addReg(VectorReg);
+
     // Reload from stack is no longer needed.
     StackReload->eraseFromParent();
     assert(OldReg.isPhysical() && "Otherwise we need to removeInterval()");
@@ -262,7 +305,86 @@ void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry,
 
 void Spill2Reg::calculateLiveRegs(StackSlotDataEntry &Entry,
                                   LiveRegUnits &LRU) {
-  // TODO: Unimplemented
+  // Collect the parent MBBs of Spills for fast lookup.
+  DenseSet<MachineBasicBlock *> SpillMBBs(Entry.Spills.size());
+  DenseSet<MachineInstr *> Spills(Entry.Spills.size());
+  for (const auto &Data : Entry.Spills) {
+    SpillMBBs.insert(Data.MI->getParent());
+    Spills.insert(Data.MI);
+  }
+
+  /// Walks up the instructions in \p Reload's block, stopping at a spill if
+  /// found. \Returns true if a spill was found, false otherwise.
+  auto AccumulateLRUUntilSpillFn = [&Spills, &SpillMBBs](MachineInstr *Reload,
+                                                         LiveRegUnits &LRU) {
+    MachineBasicBlock *MBB = Reload->getParent();
+    bool IsSpillBlock = SpillMBBs.count(MBB);
+    // Add all MBB's live-outs.
+    LRU.addLiveOuts(*MBB);
+    // Walk up the BB, starting from Reload, looking for any spill.
+    for (MachineInstr *CurrMI = Reload; CurrMI != nullptr;
+         CurrMI = CurrMI->getPrevNode()) {
+      LRU.accumulate(*CurrMI);
+      // If a spill is found then return true to end the recursion.
+      if (IsSpillBlock && Spills.count(CurrMI))
+        return true;
+    }
+    return false;
+  };
+
+  // Helper for the traversal. It accumulates all register units used in
+  // \p MBB. It returns true once a spill is found.
+  auto AccumulateLRUFn = [&SpillMBBs, &LRU, AccumulateLRUUntilSpillFn,
+                          this](MachineBasicBlock *MBB) {
+    if (SpillMBBs.count(MBB)) {
+      // If this is a spill block, then walk bottom-up until the spill.
+      assert(!MBB->empty() && "How can it be a spill block and empty?");
+      bool FoundSpill = AccumulateLRUUntilSpillFn(&*MBB->rbegin(), LRU);
+      assert(FoundSpill && "Spill block but we couldn't find spill!");
+      // We return true to stop the recursion.
+      return true;
+    }
+    // Else this is an intermediate block between the spills and reloads with
+    // no spill in it, so use the pre-computed LRU to avoid walking it again.
+    // This improves compilation time.
+    LRU.addUnits(LRUs[MBB].getBitVector());
+    // We return false to continue the recursion.
+    return false;
+  };
+
+  /// \Returns the LiveRegUnits at `Reload` by stepping back the BB.
+  auto GetReloadLRU = [this](MachineInstr *Reload) {
+    LiveRegUnits ReloadLRU(*TRI);
+    MachineBasicBlock *MBB = Reload->getParent();
+    ReloadLRU.addLiveOuts(*MBB);
+    // Start at the bottom of the BB and walk up until we find `Reload`.
+    for (MachineInstr &MI : llvm::reverse(*MBB)) {
+      if (&MI == Reload)
+        break;
+      // TODO: Check if this should be accumulate() instead of stepBackward().
+      ReloadLRU.stepBackward(MI);
+    }
+    return ReloadLRU;
+  };
+
+  // Start from each Reload and walk up the CFG with a depth-first traversal,
+  // looking for spills. Upon finding a spill we don't go beyond that point. In
+  // the meantime we accumulate the registers used. This is then used to find
+  // free physical registers.
+  DenseSet<MachineBasicBlock *> Visited;
+  for (const auto &ReloadData : Entry.Reloads) {
+    MachineInstr *Reload = ReloadData.MI;
+    // Add the Reload's LRU to the total LRU for the whole Spill-Reload range.
+    LiveRegUnits ReloadLRU = GetReloadLRU(Reload);
+    bool FoundSpill = AccumulateLRUUntilSpillFn(Reload, ReloadLRU);
+    LRU.addUnits(ReloadLRU.getBitVector());
+
+    // Traverse the CFG bottom-up accumulating LRUs until we reach the Spills.
+    if (!FoundSpill) {
+      for (MachineBasicBlock *PredMBB : Reload->getParent()->predecessors())
+        DFS(PredMBB, Visited, AccumulateLRUFn);
+    }
+  }
 }
 
 void Spill2Reg::generateCode() {
@@ -293,7 +415,10 @@ void Spill2Reg::generateCode() {
   }
 }
 
-void Spill2Reg::cleanup() { StackSlotData.clear(); }
+void Spill2Reg::cleanup() {
+  StackSlotData.clear();
+  LRUs.clear();
+}
 
 bool Spill2Reg::run() {
   // Walk over each instruction in the code keeping track of the processor's
Lines changed: 177 additions & 0 deletions
@@ -0,0 +1,177 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 | FileCheck %s

; End-to-end check that Spill2Reg works with 16-bit registers.

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

@D0 = dso_local local_unnamed_addr global i16 0, align 4
@D1 = dso_local local_unnamed_addr global i16 0, align 4
@D2 = dso_local local_unnamed_addr global i16 0, align 4
@D3 = dso_local local_unnamed_addr global i16 0, align 4
@D4 = dso_local local_unnamed_addr global i16 0, align 4
@D5 = dso_local local_unnamed_addr global i16 0, align 4
@D6 = dso_local local_unnamed_addr global i16 0, align 4
@D7 = dso_local local_unnamed_addr global i16 0, align 4
@D8 = dso_local local_unnamed_addr global i16 0, align 4
@D9 = dso_local local_unnamed_addr global i16 0, align 4
@D10 = dso_local local_unnamed_addr global i16 0, align 4
@D11 = dso_local local_unnamed_addr global i16 0, align 4
@D12 = dso_local local_unnamed_addr global i16 0, align 4
@D13 = dso_local local_unnamed_addr global i16 0, align 4
@D14 = dso_local local_unnamed_addr global i16 0, align 4
@D15 = dso_local local_unnamed_addr global i16 0, align 4
@D16 = dso_local local_unnamed_addr global i16 0, align 4
@D17 = dso_local local_unnamed_addr global i16 0, align 4
@D18 = dso_local local_unnamed_addr global i16 0, align 4
@U0 = dso_local local_unnamed_addr global i16 0, align 4
@U1 = dso_local local_unnamed_addr global i16 0, align 4
@U2 = dso_local local_unnamed_addr global i16 0, align 4
@U3 = dso_local local_unnamed_addr global i16 0, align 4
@U4 = dso_local local_unnamed_addr global i16 0, align 4
@U5 = dso_local local_unnamed_addr global i16 0, align 4
@U6 = dso_local local_unnamed_addr global i16 0, align 4
@U7 = dso_local local_unnamed_addr global i16 0, align 4
@U8 = dso_local local_unnamed_addr global i16 0, align 4
@U9 = dso_local local_unnamed_addr global i16 0, align 4
@U10 = dso_local local_unnamed_addr global i16 0, align 4
@U11 = dso_local local_unnamed_addr global i16 0, align 4
@U12 = dso_local local_unnamed_addr global i16 0, align 4
@U13 = dso_local local_unnamed_addr global i16 0, align 4
@U14 = dso_local local_unnamed_addr global i16 0, align 4
@U15 = dso_local local_unnamed_addr global i16 0, align 4
@U16 = dso_local local_unnamed_addr global i16 0, align 4
@U17 = dso_local local_unnamed_addr global i16 0, align 4
@U18 = dso_local local_unnamed_addr global i16 0, align 4
; Function Attrs: mustprogress noinline nounwind uwtable
define dso_local void @_Z5spillv() local_unnamed_addr #0 {
; CHECK-LABEL: _Z5spillv:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movzwl D0(%rip), %eax
; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; CHECK-NEXT:    movzwl D1(%rip), %ecx
; CHECK-NEXT:    movzwl D2(%rip), %edx
; CHECK-NEXT:    movzwl D3(%rip), %esi
; CHECK-NEXT:    movzwl D4(%rip), %edi
; CHECK-NEXT:    movzwl D5(%rip), %r8d
; CHECK-NEXT:    movzwl D6(%rip), %r9d
; CHECK-NEXT:    movzwl D7(%rip), %r10d
; CHECK-NEXT:    movzwl D8(%rip), %r11d
; CHECK-NEXT:    movzwl D9(%rip), %ebx
; CHECK-NEXT:    movzwl D10(%rip), %ebp
; CHECK-NEXT:    movzwl D11(%rip), %r14d
; CHECK-NEXT:    movzwl D12(%rip), %r15d
; CHECK-NEXT:    movzwl D13(%rip), %r12d
; CHECK-NEXT:    movzwl D14(%rip), %r13d
; CHECK-NEXT:    movzwl D15(%rip), %eax
; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; CHECK-NEXT:    movzwl D16(%rip), %eax
; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; CHECK-NEXT:    movzwl D17(%rip), %eax
; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; CHECK-NEXT:    movzwl D18(%rip), %eax
; CHECK-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; CHECK-NEXT:    movw %ax, U0(%rip)
; CHECK-NEXT:    movw %cx, U1(%rip)
; CHECK-NEXT:    movw %dx, U2(%rip)
; CHECK-NEXT:    movw %si, U3(%rip)
; CHECK-NEXT:    movw %di, U4(%rip)
; CHECK-NEXT:    movw %r8w, U5(%rip)
; CHECK-NEXT:    movw %r9w, U6(%rip)
; CHECK-NEXT:    movw %r10w, U7(%rip)
; CHECK-NEXT:    movw %r11w, U8(%rip)
; CHECK-NEXT:    movw %bx, U9(%rip)
; CHECK-NEXT:    movw %bp, U10(%rip)
; CHECK-NEXT:    movw %r14w, U11(%rip)
; CHECK-NEXT:    movw %r15w, U12(%rip)
; CHECK-NEXT:    movw %r12w, U13(%rip)
; CHECK-NEXT:    movw %r13w, U14(%rip)
; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; CHECK-NEXT:    movw %ax, U15(%rip)
; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; CHECK-NEXT:    movw %ax, U16(%rip)
; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; CHECK-NEXT:    movw %ax, U17(%rip)
; CHECK-NEXT:    movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
; CHECK-NEXT:    movw %ax, U18(%rip)
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
entry:
134+
%0 = load i16, i16* @D0
135+
%1 = load i16, i16* @D1
136+
%2 = load i16, i16* @D2
137+
%3 = load i16, i16* @D3
138+
%4 = load i16, i16* @D4
139+
%5 = load i16, i16* @D5
140+
%6 = load i16, i16* @D6
141+
%7 = load i16, i16* @D7
142+
%8 = load i16, i16* @D8
143+
%9 = load i16, i16* @D9
144+
%10 = load i16, i16* @D10
145+
%11 = load i16, i16* @D11
146+
%12 = load i16, i16* @D12
147+
%13 = load i16, i16* @D13
148+
%14 = load i16, i16* @D14
149+
%15 = load i16, i16* @D15
150+
%16 = load i16, i16* @D16
151+
%17 = load i16, i16* @D17
152+
%18 = load i16, i16* @D18
153+
call void asm sideeffect "", "~{memory}"() #1
154+
store i16 %0, i16* @U0
155+
store i16 %1, i16* @U1
156+
store i16 %2, i16* @U2
157+
store i16 %3, i16* @U3
158+
store i16 %4, i16* @U4
159+
store i16 %5, i16* @U5
160+
store i16 %6, i16* @U6
161+
store i16 %7, i16* @U7
162+
store i16 %8, i16* @U8
163+
store i16 %9, i16* @U9
164+
store i16 %10, i16* @U10
165+
store i16 %11, i16* @U11
166+
store i16 %12, i16* @U12
167+
store i16 %13, i16* @U13
168+
store i16 %14, i16* @U14
169+
store i16 %15, i16* @U15
170+
store i16 %16, i16* @U16
171+
store i16 %17, i16* @U17
172+
store i16 %18, i16* @U18
173+
ret void
174+
}
175+
176+
attributes #0 = { mustprogress noinline nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
177+
attributes #1 = { nounwind }
