@@ -142,6 +142,45 @@ func memIdx(a *obj.Addr, v *ssa.Value) {
142
142
a .Index = i
143
143
}
144
144
145
// The duffzero routine is a sequence of identical blocks, each made of
// 4 MOVUPS instructions followed by one LEAQ.
// See runtime/mkduff.go.
const (
	dzClearStep = 16 // bytes cleared by each MOV instruction
	dzBlockLen  = 4  // clears (MOVs) per block
	dzBlocks    = 16 // number of MOV/LEAQ blocks
	dzMovSize   = 5  // encoded size, in bytes, of one MOV with an offset
	dzLeaqSize  = 4  // encoded size, in bytes, of one LEAQ
	dzBlockSize = 23 // encoded size, in bytes, of one whole block
)
155
+
156
+ func duffStart (size int64 ) int64 {
157
+ x , _ := duff (size )
158
+ return x
159
+ }
160
+ func duffAdj (size int64 ) int64 {
161
+ _ , x := duff (size )
162
+ return x
163
+ }
164
+
165
+ // duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
166
+ // required to use the duffzero mechanism for a block of the given size.
167
+ func duff (size int64 ) (int64 , int64 ) {
168
+ if size < 32 || size > 1024 || size % dzClearStep != 0 {
169
+ panic ("bad duffzero size" )
170
+ }
171
+ steps := size / dzClearStep
172
+ blocks := steps / dzBlockLen
173
+ steps %= dzBlockLen
174
+ off := dzBlockSize * (dzBlocks - blocks )
175
+ var adj int64
176
+ if steps != 0 {
177
+ off -= dzLeaqSize
178
+ off -= dzMovSize * steps
179
+ adj -= dzClearStep * (dzBlockLen - steps )
180
+ }
181
+ return off , adj
182
+ }
183
+
145
184
func getgFromTLS (s * ssagen.State , r int16 ) {
146
185
// See the comments in cmd/internal/obj/x86/obj6.go
147
186
// near CanUse1InsnTLS for a detailed explanation of these instructions.
@@ -1065,110 +1104,20 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
1065
1104
zero16 (off + n - 16 )
1066
1105
}
1067
1106
1068
- case ssa .OpAMD64LoweredMove :
1069
- dstReg := v .Args [0 ].Reg ()
1070
- srcReg := v .Args [1 ].Reg ()
1071
- if dstReg == srcReg {
1072
- break
1073
- }
1074
- tmpReg := int16 (x86 .REG_X14 )
1075
- n := v .AuxInt
1076
- if n < 16 {
1077
- v .Fatalf ("Move too small %d" , n )
1078
- }
1079
- // move 16 bytes from srcReg+off to dstReg+off.
1080
- move16 := func (off int64 ) {
1081
- move16 (s , srcReg , dstReg , tmpReg , off )
1082
- }
1083
-
1084
- // Generate copying instructions.
1085
- var off int64
1086
- for n >= 16 {
1087
- move16 (off )
1088
- off += 16
1089
- n -= 16
1090
- }
1091
- if n != 0 {
1092
- // use partially overlapped read/write.
1093
- // TODO: use smaller operations when we can?
1094
- move16 (off + n - 16 )
1095
- }
1096
-
1097
- case ssa .OpAMD64LoweredMoveLoop :
1098
- dstReg := v .Args [0 ].Reg ()
1099
- srcReg := v .Args [1 ].Reg ()
1100
- if dstReg == srcReg {
1101
- break
1102
- }
1103
- countReg := v .RegTmp ()
1104
- tmpReg := int16 (x86 .REG_X14 )
1105
- n := v .AuxInt
1106
- loopSize := int64 (64 )
1107
- if n < 3 * loopSize {
1108
- // - a loop count of 0 won't work.
1109
- // - a loop count of 1 is useless.
1110
- // - a loop count of 2 is a code size ~tie
1111
- // 4 instructions to implement the loop
1112
- // 4 instructions in the loop body
1113
- // vs
1114
- // 8 instructions in the straightline code
1115
- // Might as well use straightline code.
1116
- v .Fatalf ("ZeroLoop size too small %d" , n )
1117
- }
1118
- // move 16 bytes from srcReg+off to dstReg+off.
1119
- move16 := func (off int64 ) {
1120
- move16 (s , srcReg , dstReg , tmpReg , off )
1121
- }
1122
-
1123
- // Put iteration count in a register.
1124
- // MOVL $n, countReg
1125
- p := s .Prog (x86 .AMOVL )
1126
- p .From .Type = obj .TYPE_CONST
1127
- p .From .Offset = n / loopSize
1128
- p .To .Type = obj .TYPE_REG
1129
- p .To .Reg = countReg
1130
- cntInit := p
1131
-
1132
- // Copy loopSize bytes starting at srcReg to dstReg.
1133
- for i := range loopSize / 16 {
1134
- move16 (i * 16 )
1135
- }
1136
- // ADDQ $loopSize, srcReg
1137
- p = s .Prog (x86 .AADDQ )
1138
- p .From .Type = obj .TYPE_CONST
1139
- p .From .Offset = loopSize
1140
- p .To .Type = obj .TYPE_REG
1141
- p .To .Reg = srcReg
1142
- // ADDQ $loopSize, dstReg
1143
- p = s .Prog (x86 .AADDQ )
1144
- p .From .Type = obj .TYPE_CONST
1145
- p .From .Offset = loopSize
1146
- p .To .Type = obj .TYPE_REG
1147
- p .To .Reg = dstReg
1148
- // DECL countReg
1149
- p = s .Prog (x86 .ADECL )
1150
- p .To .Type = obj .TYPE_REG
1151
- p .To .Reg = countReg
1152
- // Jump to loop header if we're not done yet.
1153
- // JNE head
1154
- p = s .Prog (x86 .AJNE )
1155
- p .To .Type = obj .TYPE_BRANCH
1156
- p .To .SetTarget (cntInit .Link )
1157
-
1158
- // Multiples of the loop size are now done.
1159
- n %= loopSize
1160
-
1161
- // Copy any fractional portion.
1162
- var off int64
1163
- for n >= 16 {
1164
- move16 (off )
1165
- off += 16
1166
- n -= 16
1167
- }
1168
- if n != 0 {
1169
- // Use partially-overlapping copy.
1170
- move16 (off + n - 16 )
1107
+ case ssa .OpAMD64DUFFCOPY :
1108
+ p := s .Prog (obj .ADUFFCOPY )
1109
+ p .To .Type = obj .TYPE_ADDR
1110
+ p .To .Sym = ir .Syms .Duffcopy
1111
+ if v .AuxInt % 16 != 0 {
1112
+ v .Fatalf ("bad DUFFCOPY AuxInt %v" , v .AuxInt )
1171
1113
}
1114
+ p .To .Offset = 14 * (64 - v .AuxInt / 16 )
1115
+ // 14 and 64 are magic constants. 14 is the number of bytes to encode:
1116
+ // MOVUPS (SI), X0
1117
+ // ADDQ $16, SI
1118
+ // MOVUPS X0, (DI)
1119
+ // ADDQ $16, DI
1120
+ // and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.
1172
1121
1173
1122
case ssa .OpCopy : // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
1174
1123
if v .Type .IsMemory () {
@@ -1760,21 +1709,3 @@ func zero16(s *ssagen.State, reg int16, off int64) {
1760
1709
p .To .Reg = reg
1761
1710
p .To .Offset = off
1762
1711
}
1763
-
1764
- // move 16 bytes from src+off to dst+off using temporary register tmp.
1765
- func move16 (s * ssagen.State , src , dst , tmp int16 , off int64 ) {
1766
- // MOVUPS off(srcReg), tmpReg
1767
- // MOVUPS tmpReg, off(dstReg)
1768
- p := s .Prog (x86 .AMOVUPS )
1769
- p .From .Type = obj .TYPE_MEM
1770
- p .From .Reg = src
1771
- p .From .Offset = off
1772
- p .To .Type = obj .TYPE_REG
1773
- p .To .Reg = tmp
1774
- p = s .Prog (x86 .AMOVUPS )
1775
- p .From .Type = obj .TYPE_REG
1776
- p .From .Reg = tmp
1777
- p .To .Type = obj .TYPE_MEM
1778
- p .To .Reg = dst
1779
- p .To .Offset = off
1780
- }
0 commit comments