Skip to content

Commit 55d961b

Browse files
aclements authored and gopherbot committed
runtime: save AVX2 and AVX-512 state on asynchronous preemption
Based on CL 669415 by [email protected].

This is a cherry-pick of CL 680900 from the dev.simd branch.

Change-Id: I574f15c3b18a7179a1573aaf567caf18d8602ef1
Reviewed-on: https://go-review.googlesource.com/c/go/+/693397
Reviewed-by: Cherry Mui <[email protected]>
Auto-Submit: Austin Clements <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
1 parent af0c4fe commit 55d961b

File tree

4 files changed

+227
-54
lines changed

4 files changed

+227
-54
lines changed

src/runtime/cpuflags.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
const (
1414
offsetX86HasAVX = unsafe.Offsetof(cpu.X86.HasAVX)
1515
offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
16+
offsetX86HasAVX512 = unsafe.Offsetof(cpu.X86.HasAVX512) // F+CD+BW+DQ+VL
1617
offsetX86HasERMS = unsafe.Offsetof(cpu.X86.HasERMS)
1718
offsetX86HasRDTSCP = unsafe.Offsetof(cpu.X86.HasRDTSCP)
1819

src/runtime/mkpreempt.go

Lines changed: 66 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ func gen386(g *gen) {
285285
func genAMD64(g *gen) {
286286
const xReg = "AX" // *xRegState
287287

288-
p := g.p
288+
p, label := g.p, g.label
289289

290290
// Assign stack offsets.
291291
var l = layout{sp: "SP"}
@@ -297,15 +297,33 @@ func genAMD64(g *gen) {
297297
l.add("MOVQ", reg, 8)
298298
}
299299
}
300-
lXRegs := layout{sp: xReg} // Non-GP registers
301-
for _, reg := range regNamesAMD64 {
302-
if strings.HasPrefix(reg, "X") {
303-
lXRegs.add("MOVUPS", reg, 16)
300+
// Create layouts for X, Y, and Z registers.
301+
const (
302+
numXRegs = 16
303+
numZRegs = 16 // TODO: If we start using upper registers, change to 32
304+
numKRegs = 8
305+
)
306+
lZRegs := layout{sp: xReg} // Non-GP registers
307+
lXRegs, lYRegs := lZRegs, lZRegs
308+
for i := range numZRegs {
309+
lZRegs.add("VMOVDQU64", fmt.Sprintf("Z%d", i), 512/8)
310+
if i < numXRegs {
311+
// Use SSE-only instructions for X registers.
312+
lXRegs.add("MOVUPS", fmt.Sprintf("X%d", i), 128/8)
313+
lYRegs.add("VMOVDQU", fmt.Sprintf("Y%d", i), 256/8)
304314
}
305315
}
306-
writeXRegs(g.goarch, &lXRegs)
307-
308-
// TODO: MXCSR register?
316+
for i := range numKRegs {
317+
lZRegs.add("KMOVQ", fmt.Sprintf("K%d", i), 8)
318+
}
319+
// The Z layout is the most general, so we line up the others with that one.
320+
// We don't have to do this, but it results in a nice Go type. If we split
321+
// this into multiple types, we probably should stop doing this.
322+
for i := range lXRegs.regs {
323+
lXRegs.regs[i].pos = lZRegs.regs[i].pos
324+
lYRegs.regs[i].pos = lZRegs.regs[i].pos
325+
}
326+
writeXRegs(g.goarch, &lZRegs)
309327

310328
p("PUSHQ BP")
311329
p("MOVQ SP, BP")
@@ -333,16 +351,56 @@ func genAMD64(g *gen) {
333351
p("MOVQ g_m(R14), %s", xReg)
334352
p("MOVQ m_p(%s), %s", xReg, xReg)
335353
p("LEAQ (p_xRegs+xRegPerP_scratch)(%s), %s", xReg, xReg)
354+
355+
// Which registers do we need to save?
356+
p("#ifdef GOEXPERIMENT_simd")
357+
p("CMPB internal∕cpu·X86+const_offsetX86HasAVX512(SB), $1")
358+
p("JE saveAVX512")
359+
p("CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1")
360+
p("JE saveAVX2")
361+
p("#endif")
362+
363+
// No features. Assume only SSE.
364+
label("saveSSE:")
336365
lXRegs.save(g)
366+
p("JMP preempt")
337367

368+
label("saveAVX2:")
369+
lYRegs.save(g)
370+
p("JMP preempt")
371+
372+
label("saveAVX512:")
373+
lZRegs.save(g)
374+
p("JMP preempt")
375+
376+
label("preempt:")
338377
p("CALL ·asyncPreempt2(SB)")
339378

340379
p("// Restore non-GPs from *p.xRegs.cache")
341380
p("MOVQ g_m(R14), %s", xReg)
342381
p("MOVQ m_p(%s), %s", xReg, xReg)
343382
p("MOVQ (p_xRegs+xRegPerP_cache)(%s), %s", xReg, xReg)
383+
384+
p("#ifdef GOEXPERIMENT_simd")
385+
p("CMPB internal∕cpu·X86+const_offsetX86HasAVX512(SB), $1")
386+
p("JE restoreAVX512")
387+
p("CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1")
388+
p("JE restoreAVX2")
389+
p("#endif")
390+
391+
label("restoreSSE:")
344392
lXRegs.restore(g)
393+
p("JMP restoreGPs")
394+
395+
label("restoreAVX2:")
396+
lYRegs.restore(g)
397+
p("JMP restoreGPs")
398+
399+
label("restoreAVX512:")
400+
lZRegs.restore(g)
401+
p("JMP restoreGPs")
345402

403+
label("restoreGPs:")
346404
p("// Restore GPs")
347405
l.restore(g)
348406
p("ADJSP $%d", -l.stack)

src/runtime/preempt_amd64.go

Lines changed: 24 additions & 16 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/runtime/preempt_amd64.s

Lines changed: 136 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -36,43 +36,149 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
3636
MOVQ g_m(R14), AX
3737
MOVQ m_p(AX), AX
3838
LEAQ (p_xRegs+xRegPerP_scratch)(AX), AX
39+
#ifdef GOEXPERIMENT_simd
40+
CMPB internal∕cpu·X86+const_offsetX86HasAVX512(SB), $1
41+
JE saveAVX512
42+
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
43+
JE saveAVX2
44+
#endif
45+
saveSSE:
3946
MOVUPS X0, 0(AX)
40-
MOVUPS X1, 16(AX)
41-
MOVUPS X2, 32(AX)
42-
MOVUPS X3, 48(AX)
43-
MOVUPS X4, 64(AX)
44-
MOVUPS X5, 80(AX)
45-
MOVUPS X6, 96(AX)
46-
MOVUPS X7, 112(AX)
47-
MOVUPS X8, 128(AX)
48-
MOVUPS X9, 144(AX)
49-
MOVUPS X10, 160(AX)
50-
MOVUPS X11, 176(AX)
51-
MOVUPS X12, 192(AX)
52-
MOVUPS X13, 208(AX)
53-
MOVUPS X14, 224(AX)
54-
MOVUPS X15, 240(AX)
47+
MOVUPS X1, 64(AX)
48+
MOVUPS X2, 128(AX)
49+
MOVUPS X3, 192(AX)
50+
MOVUPS X4, 256(AX)
51+
MOVUPS X5, 320(AX)
52+
MOVUPS X6, 384(AX)
53+
MOVUPS X7, 448(AX)
54+
MOVUPS X8, 512(AX)
55+
MOVUPS X9, 576(AX)
56+
MOVUPS X10, 640(AX)
57+
MOVUPS X11, 704(AX)
58+
MOVUPS X12, 768(AX)
59+
MOVUPS X13, 832(AX)
60+
MOVUPS X14, 896(AX)
61+
MOVUPS X15, 960(AX)
62+
JMP preempt
63+
saveAVX2:
64+
VMOVDQU Y0, 0(AX)
65+
VMOVDQU Y1, 64(AX)
66+
VMOVDQU Y2, 128(AX)
67+
VMOVDQU Y3, 192(AX)
68+
VMOVDQU Y4, 256(AX)
69+
VMOVDQU Y5, 320(AX)
70+
VMOVDQU Y6, 384(AX)
71+
VMOVDQU Y7, 448(AX)
72+
VMOVDQU Y8, 512(AX)
73+
VMOVDQU Y9, 576(AX)
74+
VMOVDQU Y10, 640(AX)
75+
VMOVDQU Y11, 704(AX)
76+
VMOVDQU Y12, 768(AX)
77+
VMOVDQU Y13, 832(AX)
78+
VMOVDQU Y14, 896(AX)
79+
VMOVDQU Y15, 960(AX)
80+
JMP preempt
81+
saveAVX512:
82+
VMOVDQU64 Z0, 0(AX)
83+
VMOVDQU64 Z1, 64(AX)
84+
VMOVDQU64 Z2, 128(AX)
85+
VMOVDQU64 Z3, 192(AX)
86+
VMOVDQU64 Z4, 256(AX)
87+
VMOVDQU64 Z5, 320(AX)
88+
VMOVDQU64 Z6, 384(AX)
89+
VMOVDQU64 Z7, 448(AX)
90+
VMOVDQU64 Z8, 512(AX)
91+
VMOVDQU64 Z9, 576(AX)
92+
VMOVDQU64 Z10, 640(AX)
93+
VMOVDQU64 Z11, 704(AX)
94+
VMOVDQU64 Z12, 768(AX)
95+
VMOVDQU64 Z13, 832(AX)
96+
VMOVDQU64 Z14, 896(AX)
97+
VMOVDQU64 Z15, 960(AX)
98+
KMOVQ K0, 1024(AX)
99+
KMOVQ K1, 1032(AX)
100+
KMOVQ K2, 1040(AX)
101+
KMOVQ K3, 1048(AX)
102+
KMOVQ K4, 1056(AX)
103+
KMOVQ K5, 1064(AX)
104+
KMOVQ K6, 1072(AX)
105+
KMOVQ K7, 1080(AX)
106+
JMP preempt
107+
preempt:
55108
CALL ·asyncPreempt2(SB)
56109
// Restore non-GPs from *p.xRegs.cache
57110
MOVQ g_m(R14), AX
58111
MOVQ m_p(AX), AX
59112
MOVQ (p_xRegs+xRegPerP_cache)(AX), AX
60-
MOVUPS 240(AX), X15
61-
MOVUPS 224(AX), X14
62-
MOVUPS 208(AX), X13
63-
MOVUPS 192(AX), X12
64-
MOVUPS 176(AX), X11
65-
MOVUPS 160(AX), X10
66-
MOVUPS 144(AX), X9
67-
MOVUPS 128(AX), X8
68-
MOVUPS 112(AX), X7
69-
MOVUPS 96(AX), X6
70-
MOVUPS 80(AX), X5
71-
MOVUPS 64(AX), X4
72-
MOVUPS 48(AX), X3
73-
MOVUPS 32(AX), X2
74-
MOVUPS 16(AX), X1
113+
#ifdef GOEXPERIMENT_simd
114+
CMPB internal∕cpu·X86+const_offsetX86HasAVX512(SB), $1
115+
JE restoreAVX512
116+
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
117+
JE restoreAVX2
118+
#endif
119+
restoreSSE:
120+
MOVUPS 960(AX), X15
121+
MOVUPS 896(AX), X14
122+
MOVUPS 832(AX), X13
123+
MOVUPS 768(AX), X12
124+
MOVUPS 704(AX), X11
125+
MOVUPS 640(AX), X10
126+
MOVUPS 576(AX), X9
127+
MOVUPS 512(AX), X8
128+
MOVUPS 448(AX), X7
129+
MOVUPS 384(AX), X6
130+
MOVUPS 320(AX), X5
131+
MOVUPS 256(AX), X4
132+
MOVUPS 192(AX), X3
133+
MOVUPS 128(AX), X2
134+
MOVUPS 64(AX), X1
75135
MOVUPS 0(AX), X0
136+
JMP restoreGPs
137+
restoreAVX2:
138+
VMOVDQU 960(AX), Y15
139+
VMOVDQU 896(AX), Y14
140+
VMOVDQU 832(AX), Y13
141+
VMOVDQU 768(AX), Y12
142+
VMOVDQU 704(AX), Y11
143+
VMOVDQU 640(AX), Y10
144+
VMOVDQU 576(AX), Y9
145+
VMOVDQU 512(AX), Y8
146+
VMOVDQU 448(AX), Y7
147+
VMOVDQU 384(AX), Y6
148+
VMOVDQU 320(AX), Y5
149+
VMOVDQU 256(AX), Y4
150+
VMOVDQU 192(AX), Y3
151+
VMOVDQU 128(AX), Y2
152+
VMOVDQU 64(AX), Y1
153+
VMOVDQU 0(AX), Y0
154+
JMP restoreGPs
155+
restoreAVX512:
156+
KMOVQ 1080(AX), K7
157+
KMOVQ 1072(AX), K6
158+
KMOVQ 1064(AX), K5
159+
KMOVQ 1056(AX), K4
160+
KMOVQ 1048(AX), K3
161+
KMOVQ 1040(AX), K2
162+
KMOVQ 1032(AX), K1
163+
KMOVQ 1024(AX), K0
164+
VMOVDQU64 960(AX), Z15
165+
VMOVDQU64 896(AX), Z14
166+
VMOVDQU64 832(AX), Z13
167+
VMOVDQU64 768(AX), Z12
168+
VMOVDQU64 704(AX), Z11
169+
VMOVDQU64 640(AX), Z10
170+
VMOVDQU64 576(AX), Z9
171+
VMOVDQU64 512(AX), Z8
172+
VMOVDQU64 448(AX), Z7
173+
VMOVDQU64 384(AX), Z6
174+
VMOVDQU64 320(AX), Z5
175+
VMOVDQU64 256(AX), Z4
176+
VMOVDQU64 192(AX), Z3
177+
VMOVDQU64 128(AX), Z2
178+
VMOVDQU64 64(AX), Z1
179+
VMOVDQU64 0(AX), Z0
180+
JMP restoreGPs
181+
restoreGPs:
76182
// Restore GPs
77183
MOVQ 104(SP), R15
78184
MOVQ 96(SP), R14

0 commit comments

Comments (0)