Commit 4e761b9

DanielMorsing authored and gopherbot committed
cmd/compile: optimize liveness in stackalloc
The stackalloc code needs to run a liveness pass to build the interference graph between stack slots. Because the values that we need liveness over are so sparse, we can optimize the analysis by using a path exploration algorithm rather than an iterative dataflow one.

In local testing, this cuts 74.05 ms of CPU time off a build of cmd/compile.

Change-Id: I765ace87d5e8aae177e65eb63da482e3d698bea7
Reviewed-on: https://go-review.googlesource.com/c/go/+/718540
Reviewed-by: Keith Randall <[email protected]>
Auto-Submit: Keith Randall <[email protected]>
Reviewed-by: Junyang Shao <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
1 parent 956909f commit 4e761b9
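
The commit message contrasts the old iterative dataflow pass with the new path exploration one. For context, the core idea can be sketched outside the compiler: liveness for a single value is just backwards reachability from its uses to its definition, so a pass that only cares about a sparse set of slot-needing values only pays for the blocks those walks actually touch. The toy CFG, int block IDs, and liveOutBlocks helper below are illustrative assumptions, not the ssa package's real types.

package main

import "fmt"

// A toy control-flow graph: preds[b] lists the predecessors of block b.
type cfg struct {
	preds map[int][]int
}

// liveOutBlocks computes, for a single value, the set of blocks out of which it
// is live, given its defining block and the blocks that use it. Rather than an
// iterative dataflow fixed point over every block, it walks backwards from each
// use until the walk reaches the definition (path exploration).
func liveOutBlocks(g cfg, defBlock int, useBlocks []int) map[int]bool {
	liveOut := make(map[int]bool)
	seen := make(map[int]bool)
	work := append([]int(nil), useBlocks...)
	for len(work) > 0 {
		b := work[len(work)-1]
		work = work[:len(work)-1]
		if seen[b] || b == defBlock {
			continue // already explored, or the walk hit the definition
		}
		seen[b] = true
		for _, p := range g.preds[b] {
			// The value is live into b, so it is live out of every predecessor of b.
			liveOut[p] = true
			work = append(work, p)
		}
	}
	return liveOut
}

func main() {
	// Blocks 1 -> 2 -> 3 -> 4 with a back edge 3 -> 2.
	g := cfg{preds: map[int][]int{2: {1, 3}, 3: {2}, 4: {3}}}
	// A value defined in block 1 and used in block 4 is live out of blocks 1, 2 and 3.
	fmt.Println(liveOutBlocks(g, 1, []int{4}))
}

The real pass does this same walk once per slot-needing value, using the function's cached sparse set and block slice instead of maps.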

File tree

1 file changed (+101, -66 lines)


src/cmd/compile/internal/ssa/stackalloc.go

Lines changed: 101 additions & 66 deletions
@@ -56,10 +56,35 @@ func putStackAllocState(s *stackAllocState) {
 }
 
 type stackValState struct {
-	typ      *types.Type
-	spill    *Value
-	needSlot bool
-	isArg    bool
+	typ       *types.Type
+	spill     *Value
+	needSlot  bool
+	isArg     bool
+	defBlock  ID
+	useBlocks []stackUseBlock
+}
+
+// addUseBlock adds a block to the set of blocks that use this value.
+// Note that we only loosely enforce the set property by checking the last
+// block that was appended to the list, so duplicates may occur.
+// Because we add values block by block (barring phi nodes), the number of duplicates is
+// small and we deduplicate as part of the liveness algorithm later anyway.
+func (sv *stackValState) addUseBlock(b *Block, liveout bool) {
+	entry := stackUseBlock{
+		b:       b,
+		liveout: liveout,
+	}
+	if sv.useBlocks == nil || sv.useBlocks[len(sv.useBlocks)-1] != entry {
+		sv.useBlocks = append(sv.useBlocks, stackUseBlock{
+			b:       b,
+			liveout: liveout,
+		})
+	}
+}
+
+type stackUseBlock struct {
+	b       *Block
+	liveout bool
 }
 
 // stackalloc allocates storage in the stack frame for
@@ -99,6 +124,7 @@ func (s *stackAllocState) init(f *Func, spillLive [][]ID) {
 			s.values[v.ID].typ = v.Type
 			s.values[v.ID].needSlot = !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() && f.getHome(v.ID) == nil && !v.rematerializeable() && !v.OnWasmStack
 			s.values[v.ID].isArg = hasAnyArgOp(v)
+			s.values[v.ID].defBlock = b.ID
 			if f.pass.debug > stackDebug && s.values[v.ID].needSlot {
 				fmt.Printf("%s needs a stack slot\n", v)
 			}
@@ -291,80 +317,89 @@ func (s *stackAllocState) stackalloc() {
 
 // computeLive computes a map from block ID to a list of
 // stack-slot-needing value IDs live at the end of that block.
-// TODO: this could be quadratic if lots of variables are live across lots of
-// basic blocks. Figure out a way to make this function (or, more precisely, the user
-// of this function) require only linear size & time.
 func (s *stackAllocState) computeLive(spillLive [][]ID) {
-	s.live = make([][]ID, s.f.NumBlocks())
-	var phis []*Value
-	live := s.f.newSparseSet(s.f.NumValues())
-	defer s.f.retSparseSet(live)
-	t := s.f.newSparseSet(s.f.NumValues())
-	defer s.f.retSparseSet(t)
-
-	// Instead of iterating over f.Blocks, iterate over their postordering.
-	// Liveness information flows backward, so starting at the end
-	// increases the probability that we will stabilize quickly.
-	po := s.f.postorder()
-	for {
-		changed := false
-		for _, b := range po {
-			// Start with known live values at the end of the block
-			live.clear()
-			live.addAll(s.live[b.ID])
-
-			// Propagate backwards to the start of the block
-			phis = phis[:0]
-			for i := len(b.Values) - 1; i >= 0; i-- {
-				v := b.Values[i]
-				live.remove(v.ID)
-				if v.Op == OpPhi {
-					// Save phi for later.
-					// Note: its args might need a stack slot even though
-					// the phi itself doesn't. So don't use needSlot.
-					if !v.Type.IsMemory() && !v.Type.IsVoid() {
-						phis = append(phis, v)
-					}
-					continue
-				}
-				for _, a := range v.Args {
-					if s.values[a.ID].needSlot {
-						live.add(a.ID)
-					}
-				}
-			}
 
-			// for each predecessor of b, expand its list of live-at-end values
-			// invariant: s contains the values live at the start of b (excluding phi inputs)
-			for i, e := range b.Preds {
-				p := e.b
-				t.clear()
-				t.addAll(s.live[p.ID])
-				t.addAll(live.contents())
-				t.addAll(spillLive[p.ID])
-				for _, v := range phis {
-					a := v.Args[i]
-					if s.values[a.ID].needSlot {
-						t.add(a.ID)
-					}
-					if spill := s.values[a.ID].spill; spill != nil {
+	// Because values using stack slots are few and far between
+	// (compared to the set of all values), we use a path exploration
+	// algorithm to calculate liveness here.
+	f := s.f
+	for _, b := range f.Blocks {
+		for _, spillvid := range spillLive[b.ID] {
+			val := &s.values[spillvid]
+			val.addUseBlock(b, true)
+		}
+		for _, v := range b.Values {
+			for i, a := range v.Args {
+				val := &s.values[a.ID]
+				useBlock := b
+				forceLiveout := false
+				if v.Op == OpPhi {
+					useBlock = b.Preds[i].b
+					forceLiveout = true
+					if spill := val.spill; spill != nil {
 						//TODO: remove? Subsumed by SpillUse?
-						t.add(spill.ID)
+						s.values[spill.ID].addUseBlock(useBlock, true)
 					}
 				}
-				if t.size() == len(s.live[p.ID]) {
+				if !val.needSlot {
 					continue
 				}
-				// grow p's live set
-				s.live[p.ID] = append(s.live[p.ID][:0], t.contents()...)
-				changed = true
+				val.addUseBlock(useBlock, forceLiveout)
 			}
 		}
+	}
 
-		if !changed {
-			break
+	s.live = make([][]ID, f.NumBlocks())
+	push := func(bid, vid ID) {
+		l := s.live[bid]
+		if l == nil || l[len(l)-1] != vid {
+			l = append(l, vid)
+			s.live[bid] = l
 		}
 	}
+	// TODO: If we can help along the interference graph by calculating livein sets,
+	// we can do so trivially by turning this sparse set into an array of arrays
+	// and checking the top for the current value instead of inclusion in the sparse set.
+	seen := f.newSparseSet(f.NumBlocks())
+	defer f.retSparseSet(seen)
+	// Instead of pruning out duplicate blocks when we build the useBlocks slices
+	// or when we add them to the queue, rely on the seen set to stop considering
+	// them. This is slightly faster than building the work queues as sets.
+	//
+	// However, this means that the queue can grow larger than the number of blocks,
+	// usually in very short functions. Returning a slice with values appended beyond the
+	// original allocation can corrupt the allocator state, so cap the queue and return
+	// the originally allocated slice regardless.
+	allocedBqueue := f.Cache.allocBlockSlice(f.NumBlocks())
+	defer f.Cache.freeBlockSlice(allocedBqueue)
+	bqueue := allocedBqueue[:0:f.NumBlocks()]
+
+	for vid, v := range s.values {
+		if !v.needSlot {
+			continue
+		}
+		seen.clear()
+		bqueue = bqueue[:0]
+		for _, b := range v.useBlocks {
+			if b.liveout {
+				push(b.b.ID, ID(vid))
+			}
+			bqueue = append(bqueue, b.b)
+		}
+		for len(bqueue) > 0 {
+			work := bqueue[len(bqueue)-1]
+			bqueue = bqueue[:len(bqueue)-1]
+			if seen.contains(work.ID) || work.ID == v.defBlock {
+				continue
+			}
+			seen.add(work.ID)
+			for _, e := range work.Preds {
+				push(e.b.ID, ID(vid))
+				bqueue = append(bqueue, e.b)
+			}
+		}
+	}
+
 	if s.f.pass.debug > stackDebug {
 		for _, b := range s.f.Blocks {
 			fmt.Printf("stacklive %s %v\n", b, s.live[b.ID])
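
Both addUseBlock and the push closure added in computeLive dedupe only against the most recently appended entry rather than doing a full membership test, accepting the occasional duplicate and letting later stages tolerate or filter it. A minimal standalone sketch of that pattern, using a hypothetical appendIfNotLast helper:

package main

import "fmt"

// appendIfNotLast appends id to s unless it matches the most recently appended
// element. This only loosely enforces the set property: duplicates that are not
// adjacent can still slip in and must be tolerated or filtered by the caller,
// which is exactly the trade-off the addUseBlock comment describes.
func appendIfNotLast(s []int, id int) []int {
	if len(s) == 0 || s[len(s)-1] != id {
		s = append(s, id)
	}
	return s
}

func main() {
	var ids []int
	for _, id := range []int{7, 7, 7, 3, 3, 7} {
		ids = appendIfNotLast(ids, id)
	}
	fmt.Println(ids) // [7 3 7]: adjacent repeats collapsed, one non-adjacent duplicate remains
}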

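The queue-capping comment in computeLive relies on Go's three-index slice expression: capping bqueue's capacity at the size of the cached allocation guarantees that any append past that point copies into a fresh backing array instead of continuing to grow inside the cached slice, which is the allocator corruption the comment guards against. A small self-contained demonstration of that slice behaviour (names invented for the example):

package main

import "fmt"

func main() {
	// A slice with spare capacity, standing in for a cached allocation.
	backing := make([]int, 4, 8)

	// Plain reslice: appends keep reusing the spare capacity of the same array.
	a := backing[:0]
	a = append(a, 1, 2, 3, 4, 5, 6)
	fmt.Println(&a[0] == &backing[0]) // true: still the cached backing array

	// Three-index reslice caps the capacity at 4, so growing past that forces
	// append to copy into a brand-new array and leaves the cached one alone.
	b := backing[:0:4]
	b = append(b, 1, 2, 3, 4, 5, 6)
	fmt.Println(&b[0] == &backing[0]) // false: append moved to a new array
}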