
Commit af0c4fe

aclements authored and gopherbot committed
runtime: save scalar registers off stack in amd64 async preemption
Asynchronous preemption must save all registers that could be in use by Go code. Currently, it saves all of these to the goroutine stack. As a result, the stack frame requirements of asynchronous preemption can be rather high. On amd64, this requires 368 bytes of stack space, most of which is the XMM registers. Several RISC architectures are around 0.5 KiB. As we add support for SIMD instructions, this is going to become a problem. The AVX-512 register state is 2.5 KiB. This well exceeds the nosplit limit, and even if it didn't, could constrain when we can asynchronously preempt goroutines on small stacks.

This CL fixes this by moving pure scalar state stored in non-GP registers off the stack and into an allocated "extended register state" object. To reduce space overhead, we only allocate these objects as needed. While in the theoretical limit, every G could need this register state, in practice very few do at a time. However, we can't allocate when we're in the middle of saving the register state during an asynchronous preemption, so we reserve scratch space on every P to temporarily store the register state, which can then be copied out to an allocated state object later by Go code.

This commit only implements this for amd64, since that's where we're about to add much more vector state, but it lays the groundwork for doing this on any architecture that could benefit.

This is a cherry-pick of CL 680898 plus bug fix CL 684836 from the dev.simd branch.

Change-Id: I123a95e21c11d5c10942d70e27f84d2d99bbf735
Reviewed-on: https://go-review.googlesource.com/c/go/+/669195
Auto-Submit: Austin Clements <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Cherry Mui <[email protected]>
1 parent e73afaa commit af0c4fe

13 files changed: +387 additions, -66 deletions
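To make the mechanism concrete before the per-file diffs, here is a rough sketch of the shape this state takes, inferred from the identifiers this commit uses (p_xRegs, xRegPerP_scratch, xRegPerP_cache, XRegPerG, xRegState); the real definitions live in runtime files that aren't rendered below, so the exact fields are assumptions:

// Sketch only; inferred from identifiers in this commit, not copied from the source.

// xRegs is generated per architecture by mkpreempt.go (see writeXRegs below)
// and holds the raw bytes of every non-GP register (X0 through X15 on amd64).
type xRegs struct{ /* generated fields */ }

// xRegState is the allocated "extended register state" object, presumably
// served by the xRegAlloc fixalloc that the lock-rank change introduces.
type xRegState struct {
	regs xRegs
}

// xRegPerP is embedded in each P.
type xRegPerP struct {
	// scratch is where the asyncPreempt assembly dumps registers; Go code
	// later copies it into an allocated xRegState (xRegSave).
	scratch xRegs
	// cache points at the state the assembly restores from on the way back
	// out of a preemption (published by xRegRestore).
	cache *xRegState
}

// xRegPerG is embedded in each G and holds its saved state while it is parked.
type xRegPerG struct {
	state *xRegState
}

The per-P scratch area is what lets the save path avoid allocating while registers are only half saved: the assembly spills into scratch unconditionally, and allocation happens later from ordinary Go code once the goroutine is safely on the system stack.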

src/runtime/export_test.go

Lines changed: 2 additions & 0 deletions
@@ -554,6 +554,8 @@ type G = g
 
 type Sudog = sudog
 
+type XRegPerG = xRegPerG
+
 func Getg() *G {
 	return getg()
 }

src/runtime/lockrank.go

Lines changed: 4 additions & 1 deletion
(generated file; diff not rendered)

src/runtime/mheap.go

Lines changed: 2 additions & 0 deletions
@@ -821,6 +821,8 @@ func (h *mheap) init() {
 	}
 
 	h.pages.init(&h.lock, &memstats.gcMiscSys, false)
+
+	xRegInitAlloc()
 }
 
 // reclaim sweeps and reclaims at least npage pages into the heap.
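xRegInitAlloc itself isn't shown in this diff. Given that the lock-ranking change below files xRegAlloc under "Fixallocs", its body is presumably along these lines (a sketch; the stat bucket and field names are assumptions):

// Sketch, not the actual implementation.
var xRegAlloc struct {
	lock  mutex
	alloc fixalloc // backs xRegState objects, off the GC heap
}

// xRegInitAlloc initializes the extended-register-state allocator. Called
// from mheap.init above, alongside the other allocator initialization.
func xRegInitAlloc() {
	lockInit(&xRegAlloc.lock, lockRankXRegAlloc)
	xRegAlloc.alloc.init(unsafe.Sizeof(xRegState{}), nil, nil, &memstats.other_sys)
}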

src/runtime/mklockrank.go

Lines changed: 5 additions & 1 deletion
@@ -193,6 +193,9 @@ defer,
 # Below WB is the write barrier implementation.
 < wbufSpans;
 
+# xRegState allocator
+sched < xRegAlloc;
+
 # Span allocator
 stackLarge,
 stackpool,
@@ -205,7 +208,8 @@ stackLarge,
 # an mspanSpecial lock, and they're part of the malloc implementation.
 # Pinner bits might be freed by the span allocator.
 mheap, mspanSpecial < mheapSpecial;
-mheap, mheapSpecial < globalAlloc;
+# Fixallocs
+mheap, mheapSpecial, xRegAlloc < globalAlloc;
 
 # Execution tracer events (with a P)
 hchan,
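For readers unfamiliar with the mklockrank DSL: `sched < xRegAlloc` says the scheduler lock may be held while acquiring the xRegState allocator's lock (but not vice versa), and the second rule adds xRegAlloc to the fixalloc-style locks that globalAlloc ranks above. The regenerated lockrank.go (4 additions & 1 deletion, marked "not rendered" above) presumably encodes this as a new constant plus name and partial-order entries, roughly:

// Heavily abridged sketch of the regenerated lockrank.go; the real file lists
// every rank and the complete ancestor sets.
const (
	// ... existing ranks, then somewhere above lockRankSched:
	lockRankXRegAlloc lockRank = iota
	// ... remaining ranks ...
)

var lockNames = []string{
	lockRankXRegAlloc: "xRegAlloc", // new
}

var lockPartialOrder = [][]lockRank{
	// new: xRegAlloc may be acquired with sched (and its ancestors) held
	lockRankXRegAlloc: {lockRankSched /* , ... */},
	// changed: globalAlloc now also ranks above xRegAlloc
	lockRankGlobalAlloc: {lockRankMheap, lockRankMheapSpecial, lockRankXRegAlloc /* , ... */},
}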

src/runtime/mkpreempt.go

Lines changed: 81 additions & 11 deletions
@@ -9,8 +9,10 @@
 package main
 
 import (
+	"bytes"
 	"flag"
 	"fmt"
+	"go/format"
 	"io"
 	"log"
 	"os"
@@ -122,14 +124,19 @@ type gen struct {
 	goarch string
 }
 
-func (g *gen) asmHeader() {
+func (g *gen) commonHeader() {
 	fmt.Fprintf(g.w, "// Code generated by mkpreempt.go; DO NOT EDIT.\n\n")
 	if beLe[g.goarch] {
 		base := g.goarch[:len(g.goarch)-1]
 		fmt.Fprintf(g.w, "//go:build %s || %sle\n\n", base, base)
 	}
+}
+
+func (g *gen) asmHeader() {
+	g.commonHeader()
 	fmt.Fprintf(g.w, "#include \"go_asm.h\"\n")
 	if g.goarch == "amd64" {
+		fmt.Fprintf(g.w, "#include \"go_tls.h\"\n")
 		fmt.Fprintf(g.w, "#include \"asm_amd64.h\"\n")
 	}
 	fmt.Fprintf(g.w, "#include \"textflag.h\"\n\n")
@@ -145,14 +152,51 @@ func (g *gen) label(l string) {
 	fmt.Fprintf(g.w, "%s\n", l)
 }
 
+// writeXRegs writes an architecture xregs file.
+func writeXRegs(arch string, l *layout) {
+	var code bytes.Buffer
+	g := gen{&code, arch}
+	g.commonHeader()
+	fmt.Fprintf(g.w, `
+package runtime
+
+type xRegs struct {
+`)
+	pos := 0
+	for _, reg := range l.regs {
+		if reg.pos != pos {
+			log.Fatalf("padding not implemented")
+		}
+		typ := fmt.Sprintf("[%d]byte", reg.size)
+		switch {
+		case reg.size == 4 && reg.pos%4 == 0:
+			typ = "uint32"
+		case reg.size == 8 && reg.pos%8 == 0:
+			typ = "uint64"
+		}
+		fmt.Fprintf(g.w, "\t%s %s\n", reg.reg, typ)
+		pos += reg.size
+	}
+	fmt.Fprintf(g.w, "}\n")
+
+	path := fmt.Sprintf("preempt_%s.go", arch)
+	b, err := format.Source(code.Bytes())
+	if err != nil {
+		log.Fatalf("formatting %s: %s", path, err)
+	}
+	if err := os.WriteFile(path, b, 0666); err != nil {
+		log.Fatal(err)
+	}
+}
+
 type layout struct {
 	stack int
 	regs  []regPos
 	sp    string // stack pointer register
 }
 
 type regPos struct {
-	pos int
+	pos, size int
 
 	saveOp    string
 	restoreOp string
@@ -165,17 +209,17 @@ type regPos struct {
 }
 
 func (l *layout) add(op, reg string, size int) {
-	l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack})
+	l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack, size: size})
 	l.stack += size
 }
 
 func (l *layout) add2(sop, rop, reg string, size int) {
-	l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack})
+	l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack, size: size})
 	l.stack += size
 }
 
 func (l *layout) addSpecial(save, restore string, size int) {
-	l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack})
+	l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack, size: size})
 	l.stack += size
 }
 
@@ -239,6 +283,8 @@ func gen386(g *gen) {
 }
 
 func genAMD64(g *gen) {
+	const xReg = "AX" // *xRegState
+
 	p := g.p
 
 	// Assign stack offsets.
@@ -251,12 +297,13 @@ func genAMD64(g *gen) {
 			l.add("MOVQ", reg, 8)
 		}
 	}
-	lSSE := layout{stack: l.stack, sp: "SP"}
+	lXRegs := layout{sp: xReg} // Non-GP registers
 	for _, reg := range regNamesAMD64 {
 		if strings.HasPrefix(reg, "X") {
-			lSSE.add("MOVUPS", reg, 16)
+			lXRegs.add("MOVUPS", reg, 16)
 		}
 	}
+	writeXRegs(g.goarch, &lXRegs)
 
 	// TODO: MXCSR register?
 
@@ -265,17 +312,40 @@
 	p("// Save flags before clobbering them")
 	p("PUSHFQ")
 	p("// obj doesn't understand ADD/SUB on SP, but does understand ADJSP")
-	p("ADJSP $%d", lSSE.stack)
+	p("ADJSP $%d", l.stack)
 	p("// But vet doesn't know ADJSP, so suppress vet stack checking")
 	p("NOP SP")
 
+	p("// Save GPs")
 	l.save(g)
 
-	lSSE.save(g)
+	// In general, the limitations on asynchronous preemption mean we only
+	// preempt in ABIInternal code. However, there's at least one exception to
+	// this: when we're in an open-coded transition between an ABIInternal
	// function and an ABI0 call. We could more carefully arrange unsafe points
+	// to avoid ever landing in ABI0, but it's easy to just make this code not
+	// sensitive to the ABI we're preempting. The CALL to asyncPreempt2 will
+	// ensure we're in ABIInternal register state.
+	p("// Save extended register state to p.xRegs.scratch")
+	p("// Don't make assumptions about ABI register state. See mkpreempt.go")
+	p("get_tls(CX)")
+	p("MOVQ g(CX), R14")
+	p("MOVQ g_m(R14), %s", xReg)
+	p("MOVQ m_p(%s), %s", xReg, xReg)
+	p("LEAQ (p_xRegs+xRegPerP_scratch)(%s), %s", xReg, xReg)
+	lXRegs.save(g)
+
 	p("CALL ·asyncPreempt2(SB)")
-	lSSE.restore(g)
+
+	p("// Restore non-GPs from *p.xRegs.cache")
+	p("MOVQ g_m(R14), %s", xReg)
+	p("MOVQ m_p(%s), %s", xReg, xReg)
+	p("MOVQ (p_xRegs+xRegPerP_cache)(%s), %s", xReg, xReg)
+	lXRegs.restore(g)
+
+	p("// Restore GPs")
 	l.restore(g)
-	p("ADJSP $%d", -lSSE.stack)
+	p("ADJSP $%d", -l.stack)
 	p("POPFQ")
 	p("POPQ BP")
 	p("RET")

src/runtime/preempt.go

Lines changed: 42 additions & 17 deletions
@@ -292,21 +292,52 @@ func canPreemptM(mp *m) bool {
 
 // asyncPreempt saves all user registers and calls asyncPreempt2.
 //
-// When stack scanning encounters an asyncPreempt frame, it scans that
+// It saves GP registers (anything that might contain a pointer) to the G stack.
+// Hence, when stack scanning encounters an asyncPreempt frame, it scans that
 // frame and its parent frame conservatively.
 //
+// On some platforms, it saves large additional scalar-only register state such
+// as vector registers to an "extended register state" on the P.
+//
 // asyncPreempt is implemented in assembly.
 func asyncPreempt()
 
+// asyncPreempt2 is the Go continuation of asyncPreempt.
+//
+// It must be deeply nosplit because there's untyped data on the stack from
+// asyncPreempt.
+//
+// It must not have any write barriers because we need to limit the amount of
+// stack it uses.
+//
 //go:nosplit
+//go:nowritebarrierrec
 func asyncPreempt2() {
+	// We can't grow the stack with untyped data from asyncPreempt, so switch to
+	// the system stack right away.
+	mcall(func(gp *g) {
+		gp.asyncSafePoint = true
+
+		// Move the extended register state from the P to the G. We do this now that
+		// we're on the system stack to avoid stack splits.
+		xRegSave(gp)
+
+		if gp.preemptStop {
+			preemptPark(gp)
+		} else {
+			gopreempt_m(gp)
+		}
+		// The above functions never return.
+	})
+
+	// Do not grow the stack below here!
+
 	gp := getg()
-	gp.asyncSafePoint = true
-	if gp.preemptStop {
-		mcall(preemptPark)
-	} else {
-		mcall(gopreempt_m)
-	}
+
+	// Put the extended register state back on the M so resumption can find it.
+	// We can't do this in asyncPreemptM because the park calls never return.
+	xRegRestore(gp)
+
 	gp.asyncSafePoint = false
 }
 
@@ -319,19 +350,13 @@ func init() {
 	total := funcMaxSPDelta(f)
 	f = findfunc(abi.FuncPCABIInternal(asyncPreempt2))
 	total += funcMaxSPDelta(f)
+	f = findfunc(abi.FuncPCABIInternal(xRegRestore))
+	total += funcMaxSPDelta(f)
 	// Add some overhead for return PCs, etc.
 	asyncPreemptStack = uintptr(total) + 8*goarch.PtrSize
 	if asyncPreemptStack > stackNosplit {
-		// We need more than the nosplit limit. This isn't
-		// unsafe, but it may limit asynchronous preemption.
-		//
-		// This may be a problem if we start using more
-		// registers. In that case, we should store registers
-		// in a context object. If we pre-allocate one per P,
-		// asyncPreempt can spill just a few registers to the
-		// stack, then grab its context object and spill into
-		// it. When it enters the runtime, it would allocate a
-		// new context for the P.
+		// We need more than the nosplit limit. This isn't unsafe, but it may
+		// limit asynchronous preemption. Consider moving state into xRegState.
 		print("runtime: asyncPreemptStack=", asyncPreemptStack, "\n")
 		throw("async stack too large")
 	}
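xRegSave and xRegRestore are defined in a file not rendered in this diff. From the comments above (and the scratch/cache fields the assembly uses), their roles are roughly as sketched here; the field names and the allocator helper are assumptions:

// Sketch only; the real implementations also deal with allocation,
// recycling, and write-barrier restrictions.

// xRegSave moves the register state the assembly dumped into p.xRegs.scratch
// into an xRegState owned by gp, so the P's scratch area is free for the next
// preemption. Called on the system stack from asyncPreempt2.
func xRegSave(gp *g) {
	state := xRegStateAlloc() // assumed helper backed by the xRegAlloc fixalloc
	state.regs = gp.m.p.ptr().xRegs.scratch
	gp.xRegs.state = state
}

// xRegRestore publishes gp's saved state as p.xRegs.cache so the assembly tail
// of asyncPreempt can restore the registers from it just before returning to
// the preempted code. (How the cached object is eventually recycled is not
// shown in this diff.)
//
// Presumably nosplit, since its frame is accounted for in asyncPreemptStack above.
//
//go:nosplit
func xRegRestore(gp *g) {
	gp.m.p.ptr().xRegs.cache = gp.xRegs.state
	gp.xRegs.state = nil
}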

src/runtime/preempt_amd64.go

Lines changed: 22 additions & 0 deletions
(generated file; diff not rendered)
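This is the file writeXRegs generates for amd64. Since each of X0 through X15 is saved with MOVUPS as 16 bytes, its 22 added lines presumably look like this (a reconstruction, not the rendered file):

// Code generated by mkpreempt.go; DO NOT EDIT.

package runtime

type xRegs struct {
	X0  [16]byte
	X1  [16]byte
	X2  [16]byte
	X3  [16]byte
	X4  [16]byte
	X5  [16]byte
	X6  [16]byte
	X7  [16]byte
	X8  [16]byte
	X9  [16]byte
	X10 [16]byte
	X11 [16]byte
	X12 [16]byte
	X13 [16]byte
	X14 [16]byte
	X15 [16]byte
}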
