Skip to content

Commit b10eb1d

Browse files
committed
cmd/compile: simplify zerorange on amd64
Get rid of duffzero and large zeroing cases. We only use this code for small things now. Change-Id: Idcf330d0ac6433448efa8e32be7eb7f988e10122 Reviewed-on: https://go-review.googlesource.com/c/go/+/678619 Reviewed-by: Jorropo <[email protected]> LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: Michael Knyszek <[email protected]> Reviewed-by: Keith Randall <[email protected]>
1 parent f8eae7a commit b10eb1d

File tree

4 files changed

+25
-101
lines changed

4 files changed

+25
-101
lines changed

src/cmd/compile/internal/amd64/ggen.go

Lines changed: 8 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -5,113 +5,23 @@
55
package amd64
66

77
import (
8-
"cmd/compile/internal/ir"
98
"cmd/compile/internal/objw"
10-
"cmd/compile/internal/types"
119
"cmd/internal/obj"
1210
"cmd/internal/obj/x86"
1311
)
1412

15-
// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
16-
// See runtime/mkduff.go.
17-
const (
18-
dzBlocks = 16 // number of MOV/ADD blocks
19-
dzBlockLen = 4 // number of clears per block
20-
dzBlockSize = 23 // size of instructions in a single block
21-
dzMovSize = 5 // size of single MOV instruction w/ offset
22-
dzLeaqSize = 4 // size of single LEAQ instruction
23-
dzClearStep = 16 // number of bytes cleared by each MOV instruction
24-
25-
dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block
26-
dzSize = dzBlocks * dzBlockSize
27-
)
28-
29-
// dzOff returns the offset for a jump into DUFFZERO.
30-
// b is the number of bytes to zero.
31-
func dzOff(b int64) int64 {
32-
off := int64(dzSize)
33-
off -= b / dzClearLen * dzBlockSize
34-
tailLen := b % dzClearLen
35-
if tailLen >= dzClearStep {
36-
off -= dzLeaqSize + dzMovSize*(tailLen/dzClearStep)
37-
}
38-
return off
39-
}
40-
41-
// dzDI returns the pre-adjustment to DI for a call to DUFFZERO.
42-
// b is the number of bytes to zero.
43-
func dzDI(b int64) int64 {
44-
tailLen := b % dzClearLen
45-
if tailLen < dzClearStep {
46-
return 0
47-
}
48-
tailSteps := tailLen / dzClearStep
49-
return -dzClearStep * (dzBlockLen - tailSteps)
50-
}
51-
5213
func zerorange(pp *objw.Progs, p *obj.Prog, off, cnt int64, state *uint32) *obj.Prog {
53-
const (
54-
r13 = 1 << iota // if R13 is already zeroed.
55-
)
56-
57-
if cnt == 0 {
58-
return p
14+
if cnt%8 != 0 {
15+
panic("zeroed region not aligned")
5916
}
60-
61-
if cnt == 8 {
17+
for cnt >= 16 {
18+
p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off)
19+
off += 16
20+
cnt -= 16
21+
}
22+
if cnt != 0 {
6223
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off)
63-
} else if cnt <= int64(8*types.RegSize) {
64-
for i := int64(0); i < cnt/16; i++ {
65-
p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+i*16)
66-
}
67-
68-
if cnt%16 != 0 {
69-
p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+cnt-int64(16))
70-
}
71-
} else if cnt <= int64(128*types.RegSize) {
72-
// Save DI to r12. With the amd64 Go register abi, DI can contain
73-
// an incoming parameter, whereas R12 is always scratch.
74-
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0)
75-
// Emit duffzero call
76-
p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off+dzDI(cnt), obj.TYPE_REG, x86.REG_DI, 0)
77-
p = pp.Append(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(cnt))
78-
p.To.Sym = ir.Syms.Duffzero
79-
if cnt%16 != 0 {
80-
p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_DI, -int64(8))
81-
}
82-
// Restore DI from r12
83-
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0)
84-
85-
} else {
86-
// When the register ABI is in effect, at this point in the
87-
// prolog we may have live values in all of RAX,RDI,RCX. Save
88-
// them off to registers before the REPSTOSQ below, then
89-
// restore. Note that R12 and R13 are always available as
90-
// scratch regs; here we also use R15 (this is safe to do
91-
// since there won't be any globals accessed in the prolog).
92-
// See rewriteToUseGot() in obj6.go for more on r15 use.
93-
94-
// Save rax/rdi/rcx
95-
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0)
96-
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_R13, 0)
97-
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_CX, 0, obj.TYPE_REG, x86.REG_R15, 0)
98-
99-
// Set up the REPSTOSQ and kick it off.
100-
p = pp.Append(p, x86.AXORL, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_AX, 0)
101-
p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(types.RegSize), obj.TYPE_REG, x86.REG_CX, 0)
102-
p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off, obj.TYPE_REG, x86.REG_DI, 0)
103-
p = pp.Append(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
104-
p = pp.Append(p, x86.ASTOSQ, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
105-
106-
// Restore rax/rdi/rcx
107-
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0)
108-
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R13, 0, obj.TYPE_REG, x86.REG_AX, 0)
109-
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R15, 0, obj.TYPE_REG, x86.REG_CX, 0)
110-
111-
// Record the fact that r13 is no longer zero.
112-
*state &= ^uint32(r13)
11324
}
114-
11525
return p
11626
}
11727

src/cmd/compile/internal/amd64/ssa.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,15 @@ func memIdx(a *obj.Addr, v *ssa.Value) {
144144

145145
// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
146146
// See runtime/mkduff.go.
147+
const (
148+
dzBlocks = 16 // number of MOV/ADD blocks
149+
dzBlockLen = 4 // number of clears per block
150+
dzBlockSize = 23 // size of instructions in a single block
151+
dzMovSize = 5 // size of single MOV instruction w/ offset
152+
dzLeaqSize = 4 // size of single LEAQ instruction
153+
dzClearStep = 16 // number of bytes cleared by each MOV instruction
154+
)
155+
147156
func duffStart(size int64) int64 {
148157
x, _ := duff(size)
149158
return x

src/cmd/compile/internal/liveness/plive.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -769,7 +769,7 @@ func (lv *Liveness) epilogue() {
769769
// its stack copy is not live.
770770
continue
771771
}
772-
// Note: zeroing is handled by zeroResults in walk.go.
772+
// Note: zeroing is handled by zeroResults in ../ssagen/ssa.go.
773773
livedefer.Set(int32(i))
774774
}
775775
if n.IsOutputParamHeapAddr() {

src/cmd/compile/internal/ssagen/arch.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,13 @@ type ArchInfo struct {
2525

2626
PadFrame func(int64) int64
2727

28-
// ZeroRange zeroes a range of memory on stack. It is only inserted
29-
// at function entry, and it is ok to clobber registers.
28+
// ZeroRange zeroes a range of memory on the stack.
29+
// - it is only called at function entry
30+
// - it is ok to clobber (non-arg) registers.
31+
// - currently used only for small things, so it can be simple.
32+
// - pointers to heap-allocated return values
33+
// - open-coded deferred functions
34+
// (Max size in make.bash is 40 bytes.)
3035
ZeroRange func(*objw.Progs, *obj.Prog, int64, int64, *uint32) *obj.Prog
3136

3237
Ginsnop func(*objw.Progs) *obj.Prog

0 commit comments

Comments
 (0)