@@ -5,113 +5,23 @@
 package amd64
 
 import (
-	"cmd/compile/internal/ir"
 	"cmd/compile/internal/objw"
-	"cmd/compile/internal/types"
 	"cmd/internal/obj"
 	"cmd/internal/obj/x86"
 )
 
-// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
-// See runtime/mkduff.go.
-const (
-	dzBlocks    = 16 // number of MOV/ADD blocks
-	dzBlockLen  = 4  // number of clears per block
-	dzBlockSize = 23 // size of instructions in a single block
-	dzMovSize   = 5  // size of single MOV instruction w/ offset
-	dzLeaqSize  = 4  // size of single LEAQ instruction
-	dzClearStep = 16 // number of bytes cleared by each MOV instruction
-
-	dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block
-	dzSize     = dzBlocks * dzBlockSize
-)
-
-// dzOff returns the offset for a jump into DUFFZERO.
-// b is the number of bytes to zero.
-func dzOff(b int64) int64 {
-	off := int64(dzSize)
-	off -= b / dzClearLen * dzBlockSize
-	tailLen := b % dzClearLen
-	if tailLen >= dzClearStep {
-		off -= dzLeaqSize + dzMovSize*(tailLen/dzClearStep)
-	}
-	return off
-}
-
-// duffzeroDI returns the pre-adjustment to DI for a call to DUFFZERO.
-// b is the number of bytes to zero.
-func dzDI(b int64) int64 {
-	tailLen := b % dzClearLen
-	if tailLen < dzClearStep {
-		return 0
-	}
-	tailSteps := tailLen / dzClearStep
-	return -dzClearStep * (dzBlockLen - tailSteps)
-}
-
 func zerorange(pp *objw.Progs, p *obj.Prog, off, cnt int64, state *uint32) *obj.Prog {
-	const (
-		r13 = 1 << iota // if R13 is already zeroed.
-	)
-
-	if cnt == 0 {
-		return p
+	if cnt%8 != 0 {
+		panic("zeroed region not aligned")
 	}
-
-	if cnt == 8 {
+	for cnt >= 16 {
+		p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off)
+		off += 16
+		cnt -= 16
+	}
+	if cnt != 0 {
 		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off)
-	} else if cnt <= int64(8*types.RegSize) {
-		for i := int64(0); i < cnt/16; i++ {
-			p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+i*16)
-		}
-
-		if cnt%16 != 0 {
-			p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+cnt-int64(16))
-		}
-	} else if cnt <= int64(128*types.RegSize) {
-		// Save DI to r12. With the amd64 Go register abi, DI can contain
-		// an incoming parameter, whereas R12 is always scratch.
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0)
-		// Emit duffzero call
-		p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off+dzDI(cnt), obj.TYPE_REG, x86.REG_DI, 0)
-		p = pp.Append(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(cnt))
-		p.To.Sym = ir.Syms.Duffzero
-		if cnt%16 != 0 {
-			p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_DI, -int64(8))
-		}
-		// Restore DI from r12
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0)
-
-	} else {
-		// When the register ABI is in effect, at this point in the
-		// prolog we may have live values in all of RAX,RDI,RCX. Save
-		// them off to registers before the REPSTOSQ below, then
-		// restore. Note that R12 and R13 are always available as
-		// scratch regs; here we also use R15 (this is safe to do
-		// since there won't be any globals accessed in the prolog).
-		// See rewriteToUseGot() in obj6.go for more on r15 use.
-
-		// Save rax/rdi/rcx
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0)
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_R13, 0)
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_CX, 0, obj.TYPE_REG, x86.REG_R15, 0)
-
-		// Set up the REPSTOSQ and kick it off.
-		p = pp.Append(p, x86.AXORL, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_AX, 0)
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(types.RegSize), obj.TYPE_REG, x86.REG_CX, 0)
-		p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off, obj.TYPE_REG, x86.REG_DI, 0)
-		p = pp.Append(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
-		p = pp.Append(p, x86.ASTOSQ, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
-
-		// Restore rax/rdi/rcx
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0)
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R13, 0, obj.TYPE_REG, x86.REG_AX, 0)
-		p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R15, 0, obj.TYPE_REG, x86.REG_CX, 0)
-
-		// Record the fact that r13 is no longer zero.
-		*state &= ^uint32(r13)
 	}
-
 	return p
 }
 
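After this change, zerorange needs neither DUFFZERO nor REP STOSQ: it stores 16 zero bytes at a time directly from X15, which the amd64 register ABI keeps permanently zeroed, and finishes any 8-byte tail with a single MOVQ. Since no general-purpose scratch registers are touched, the old DI/AX/CX save-and-restore dance and the r13 state tracking can all go. As a minimal standalone sketch of the store sequence the new loop produces (zeroPlan is a hypothetical helper written for illustration, not part of the commit):

package main

import "fmt"

// zeroPlan mirrors the new zerorange loop: 16-byte MOVUPS stores from
// the always-zero X15 register, then one MOVQ for an 8-byte tail.
// Like zerorange, it assumes cnt is a multiple of 8.
// NOTE: illustrative helper only; it is not part of the commit above.
func zeroPlan(off, cnt int64) []string {
	if cnt%8 != 0 {
		panic("zeroed region not aligned")
	}
	var seq []string
	for cnt >= 16 {
		seq = append(seq, fmt.Sprintf("MOVUPS X15, %d(SP)", off))
		off += 16
		cnt -= 16
	}
	if cnt != 0 {
		seq = append(seq, fmt.Sprintf("MOVQ X15, %d(SP)", off))
	}
	return seq
}

func main() {
	// Zeroing 40 bytes at offset 8 yields two MOVUPS stores and one MOVQ:
	//   MOVUPS X15, 8(SP)
	//   MOVUPS X15, 24(SP)
	//   MOVQ X15, 40(SP)
	for _, ins := range zeroPlan(8, 40) {
		fmt.Println(ins)
	}
}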