Commit 4e761b9

DanielMorsing authored and gopherbot committed
cmd/compile: optimize liveness in stackalloc
The stackalloc code needs to run a liveness pass to build the interference graph between stack slots. Because the values that we need liveness over are so sparse, we can optimize the analysis by using a path exploration algorithm rather than an iterative dataflow one.

In local testing, this cuts 74.05 ms of CPU time off a build of cmd/compile.

Change-Id: I765ace87d5e8aae177e65eb63da482e3d698bea7
Reviewed-on: https://go-review.googlesource.com/c/go/+/718540
Reviewed-by: Keith Randall <[email protected]>
Auto-Submit: Keith Randall <[email protected]>
Reviewed-by: Junyang Shao <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
1 parent 956909f commit 4e761b9
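
The commit message contrasts the old iterative dataflow pass with the new path exploration one. For context, the core idea can be sketched outside the compiler: liveness for a single value is just backwards reachability from its uses to its definition, so a pass that only cares about a sparse set of slot-needing values only pays for the blocks those walks actually touch. The toy CFG, int block IDs, and liveOutBlocks helper below are illustrative assumptions, not the ssa package's real types.

package main

import "fmt"

// A toy control-flow graph: preds[b] lists the predecessors of block b.
type cfg struct {
	preds map[int][]int
}

// liveOutBlocks computes, for a single value, the set of blocks out of which it
// is live, given its defining block and the blocks that use it. Rather than an
// iterative dataflow fixed point over every block, it walks backwards from each
// use until the walk reaches the definition (path exploration).
func liveOutBlocks(g cfg, defBlock int, useBlocks []int) map[int]bool {
	liveOut := make(map[int]bool)
	seen := make(map[int]bool)
	work := append([]int(nil), useBlocks...)
	for len(work) > 0 {
		b := work[len(work)-1]
		work = work[:len(work)-1]
		if seen[b] || b == defBlock {
			continue // already explored, or the walk hit the definition
		}
		seen[b] = true
		for _, p := range g.preds[b] {
			// The value is live into b, so it is live out of every predecessor of b.
			liveOut[p] = true
			work = append(work, p)
		}
	}
	return liveOut
}

func main() {
	// Blocks 1 -> 2 -> 3 -> 4 with a back edge 3 -> 2.
	g := cfg{preds: map[int][]int{2: {1, 3}, 3: {2}, 4: {3}}}
	// A value defined in block 1 and used in block 4 is live out of blocks 1, 2 and 3.
	fmt.Println(liveOutBlocks(g, 1, []int{4}))
}

The real pass does this same walk once per slot-needing value, using the function's cached sparse set and block slice instead of maps.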

File tree

1 file changed (+101, -66 lines)


src/cmd/compile/internal/ssa/stackalloc.go

Lines changed: 101 additions & 66 deletions
@@ -56,10 +56,35 @@ func putStackAllocState(s *stackAllocState) {
 }
 
 type stackValState struct {
-	typ      *types.Type
-	spill    *Value
-	needSlot bool
-	isArg    bool
+	typ       *types.Type
+	spill     *Value
+	needSlot  bool
+	isArg     bool
+	defBlock  ID
+	useBlocks []stackUseBlock
+}
+
+// addUseBlock adds a block to the set of blocks that use this value.
+// Note that we only loosely enforce the set property by checking the last
+// block that was appended to the list, so duplicates may occur.
+// Because we add values block by block (barring phi nodes), the number of duplicates is
+// small and we deduplicate as part of the liveness algorithm later anyway.
+func (sv *stackValState) addUseBlock(b *Block, liveout bool) {
+	entry := stackUseBlock{
+		b:       b,
+		liveout: liveout,
+	}
+	if sv.useBlocks == nil || sv.useBlocks[len(sv.useBlocks)-1] != entry {
+		sv.useBlocks = append(sv.useBlocks, stackUseBlock{
+			b:       b,
+			liveout: liveout,
+		})
+	}
+}
+
+type stackUseBlock struct {
+	b       *Block
+	liveout bool
 }
 
 // stackalloc allocates storage in the stack frame for
@@ -99,6 +124,7 @@ func (s *stackAllocState) init(f *Func, spillLive [][]ID) {
 			s.values[v.ID].typ = v.Type
 			s.values[v.ID].needSlot = !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() && f.getHome(v.ID) == nil && !v.rematerializeable() && !v.OnWasmStack
 			s.values[v.ID].isArg = hasAnyArgOp(v)
+			s.values[v.ID].defBlock = b.ID
 			if f.pass.debug > stackDebug && s.values[v.ID].needSlot {
 				fmt.Printf("%s needs a stack slot\n", v)
 			}
@@ -291,80 +317,89 @@ func (s *stackAllocState) stackalloc() {
 
 // computeLive computes a map from block ID to a list of
 // stack-slot-needing value IDs live at the end of that block.
-// TODO: this could be quadratic if lots of variables are live across lots of
-// basic blocks. Figure out a way to make this function (or, more precisely, the user
-// of this function) require only linear size & time.
 func (s *stackAllocState) computeLive(spillLive [][]ID) {
-	s.live = make([][]ID, s.f.NumBlocks())
-	var phis []*Value
-	live := s.f.newSparseSet(s.f.NumValues())
-	defer s.f.retSparseSet(live)
-	t := s.f.newSparseSet(s.f.NumValues())
-	defer s.f.retSparseSet(t)
-
-	// Instead of iterating over f.Blocks, iterate over their postordering.
-	// Liveness information flows backward, so starting at the end
-	// increases the probability that we will stabilize quickly.
-	po := s.f.postorder()
-	for {
-		changed := false
-		for _, b := range po {
-			// Start with known live values at the end of the block
-			live.clear()
-			live.addAll(s.live[b.ID])
-
-			// Propagate backwards to the start of the block
-			phis = phis[:0]
-			for i := len(b.Values) - 1; i >= 0; i-- {
-				v := b.Values[i]
-				live.remove(v.ID)
-				if v.Op == OpPhi {
-					// Save phi for later.
-					// Note: its args might need a stack slot even though
-					// the phi itself doesn't. So don't use needSlot.
-					if !v.Type.IsMemory() && !v.Type.IsVoid() {
-						phis = append(phis, v)
-					}
-					continue
-				}
-				for _, a := range v.Args {
-					if s.values[a.ID].needSlot {
-						live.add(a.ID)
-					}
-				}
-			}
 
-			// for each predecessor of b, expand its list of live-at-end values
-			// invariant: s contains the values live at the start of b (excluding phi inputs)
-			for i, e := range b.Preds {
-				p := e.b
-				t.clear()
-				t.addAll(s.live[p.ID])
-				t.addAll(live.contents())
-				t.addAll(spillLive[p.ID])
-				for _, v := range phis {
-					a := v.Args[i]
-					if s.values[a.ID].needSlot {
-						t.add(a.ID)
-					}
-					if spill := s.values[a.ID].spill; spill != nil {
+	// Because values using stack slots are few and far between
+	// (compared to the set of all values), we use a path exploration
+	// algorithm to calculate liveness here.
+	f := s.f
+	for _, b := range f.Blocks {
+		for _, spillvid := range spillLive[b.ID] {
+			val := &s.values[spillvid]
+			val.addUseBlock(b, true)
+		}
+		for _, v := range b.Values {
+			for i, a := range v.Args {
+				val := &s.values[a.ID]
+				useBlock := b
+				forceLiveout := false
+				if v.Op == OpPhi {
+					useBlock = b.Preds[i].b
+					forceLiveout = true
+					if spill := val.spill; spill != nil {
 						//TODO: remove? Subsumed by SpillUse?
-						t.add(spill.ID)
+						s.values[spill.ID].addUseBlock(useBlock, true)
 					}
 				}
-				if t.size() == len(s.live[p.ID]) {
+				if !val.needSlot {
 					continue
 				}
-				// grow p's live set
-				s.live[p.ID] = append(s.live[p.ID][:0], t.contents()...)
-				changed = true
+				val.addUseBlock(useBlock, forceLiveout)
 			}
 		}
+	}
 
-		if !changed {
-			break
+	s.live = make([][]ID, f.NumBlocks())
+	push := func(bid, vid ID) {
+		l := s.live[bid]
+		if l == nil || l[len(l)-1] != vid {
+			l = append(l, vid)
+			s.live[bid] = l
 		}
 	}
+	// TODO: If we can help along the interference graph by calculating livein sets,
+	// we can do so trivially by turning this sparse set into an array of arrays
+	// and checking the top for the current value instead of inclusion in the sparse set.
+	seen := f.newSparseSet(f.NumBlocks())
+	defer f.retSparseSet(seen)
+	// Instead of pruning out duplicate blocks when we build the useBlocks slices
+	// or when we add them to the queue, rely on the seen set to stop considering
+	// them. This is slightly faster than building the work queues as sets.
+	//
+	// However, this means that the queue can grow larger than the number of blocks,
+	// usually in very short functions. Returning a slice with values appended beyond the
+	// original allocation can corrupt the allocator state, so cap the queue and return
+	// the originally allocated slice regardless.
+	allocedBqueue := f.Cache.allocBlockSlice(f.NumBlocks())
+	defer f.Cache.freeBlockSlice(allocedBqueue)
+	bqueue := allocedBqueue[:0:f.NumBlocks()]
+
+	for vid, v := range s.values {
+		if !v.needSlot {
+			continue
+		}
+		seen.clear()
+		bqueue = bqueue[:0]
+		for _, b := range v.useBlocks {
+			if b.liveout {
+				push(b.b.ID, ID(vid))
+			}
+			bqueue = append(bqueue, b.b)
+		}
+		for len(bqueue) > 0 {
+			work := bqueue[len(bqueue)-1]
+			bqueue = bqueue[:len(bqueue)-1]
+			if seen.contains(work.ID) || work.ID == v.defBlock {
+				continue
+			}
+			seen.add(work.ID)
+			for _, e := range work.Preds {
+				push(e.b.ID, ID(vid))
+				bqueue = append(bqueue, e.b)
+			}
+		}
+	}
+
 	if s.f.pass.debug > stackDebug {
 		for _, b := range s.f.Blocks {
 			fmt.Printf("stacklive %s %v\n", b, s.live[b.ID])
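
Both addUseBlock and the push closure added in computeLive dedupe only against the most recently appended entry rather than doing a full membership test, accepting the occasional duplicate and letting later stages tolerate or filter it. A minimal standalone sketch of that pattern, using a hypothetical appendIfNotLast helper:

package main

import "fmt"

// appendIfNotLast appends id to s unless it matches the most recently appended
// element. This only loosely enforces the set property: duplicates that are not
// adjacent can still slip in and must be tolerated or filtered by the caller,
// which is exactly the trade-off the addUseBlock comment describes.
func appendIfNotLast(s []int, id int) []int {
	if len(s) == 0 || s[len(s)-1] != id {
		s = append(s, id)
	}
	return s
}

func main() {
	var ids []int
	for _, id := range []int{7, 7, 7, 3, 3, 7} {
		ids = appendIfNotLast(ids, id)
	}
	fmt.Println(ids) // [7 3 7]: adjacent repeats collapsed, one non-adjacent duplicate remains
}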

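The queue-capping comment in computeLive relies on Go's three-index slice expression: capping bqueue's capacity at the size of the cached allocation guarantees that any append past that point copies into a fresh backing array instead of continuing to grow inside the cached slice, which is the allocator corruption the comment guards against. A small self-contained demonstration of that slice behaviour (names invented for the example):

package main

import "fmt"

func main() {
	// A slice with spare capacity, standing in for a cached allocation.
	backing := make([]int, 4, 8)

	// Plain reslice: appends keep reusing the spare capacity of the same array.
	a := backing[:0]
	a = append(a, 1, 2, 3, 4, 5, 6)
	fmt.Println(&a[0] == &backing[0]) // true: still the cached backing array

	// Three-index reslice caps the capacity at 4, so growing past that forces
	// append to copy into a brand-new array and leaves the cached one alone.
	b := backing[:0:4]
	b = append(b, 1, 2, 3, 4, 5, 6)
	fmt.Println(&b[0] == &backing[0]) // false: append moved to a new array
}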