@@ -142,45 +142,6 @@ func memIdx(a *obj.Addr, v *ssa.Value) {
142
142
a .Index = i
143
143
}
144
144
145
// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ.
// See runtime/mkduff.go.
const (
	dzBlocks    = 16 // number of MOV/ADD blocks
	dzBlockLen  = 4  // number of clears per block
	dzBlockSize = 23 // size of instructions in a single block
	dzMovSize   = 5  // size of single MOV instruction w/ offset
	dzLeaqSize  = 4  // size of single LEAQ instruction
	dzClearStep = 16 // number of bytes cleared by each MOV instruction
)

// duffStart returns only the first result of duff: the offset
// (from duffzero, in bytes) to use for a block of the given size.
func duffStart(size int64) int64 {
	entry, _ := duff(size)
	return entry
}

// duffAdj returns only the second result of duff: the pointer
// adjustment (in bytes) to use for a block of the given size.
func duffAdj(size int64) int64 {
	_, adjust := duff(size)
	return adjust
}

// duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
// required to use the duffzero mechanism for a block of the given size.
//
// It panics with "bad duffzero size" unless size lies in [32, 1024] and is a
// multiple of dzClearStep.
func duff(size int64) (int64, int64) {
	switch {
	case size < 32, size > 1024, size%dzClearStep != 0:
		panic("bad duffzero size")
	}

	// Split the number of 16-byte clears into whole blocks plus a remainder.
	clears := size / dzClearStep
	fullBlocks, partial := clears/dzBlockLen, clears%dzBlockLen

	// Skip every block that is not needed at all.
	entry := dzBlockSize * (dzBlocks - fullBlocks)
	if partial == 0 {
		return entry, 0
	}

	// A remainder moves the offset back over one LEAQ and `partial` MOVs,
	// presumably so that execution starts partway through the previous
	// block. The pointer adjust is minus the bytes that the skipped
	// dzBlockLen-partial clears of that block would have covered.
	entry -= dzLeaqSize + dzMovSize*partial
	return entry, -dzClearStep * (dzBlockLen - partial)
}
183
-
184
145
func getgFromTLS (s * ssagen.State , r int16 ) {
185
146
// See the comments in cmd/internal/obj/x86/obj6.go
186
147
// near CanUse1InsnTLS for a detailed explanation of these instructions.
@@ -1104,20 +1065,110 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
1104
1065
zero16 (off + n - 16 )
1105
1066
}
1106
1067
1107
- case ssa .OpAMD64DUFFCOPY :
1108
- p := s .Prog (obj .ADUFFCOPY )
1109
- p .To .Type = obj .TYPE_ADDR
1110
- p .To .Sym = ir .Syms .Duffcopy
1111
- if v .AuxInt % 16 != 0 {
1112
- v .Fatalf ("bad DUFFCOPY AuxInt %v" , v .AuxInt )
1068
+ case ssa .OpAMD64LoweredMove :
1069
+ dstReg := v .Args [0 ].Reg ()
1070
+ srcReg := v .Args [1 ].Reg ()
1071
+ if dstReg == srcReg {
1072
+ break
1073
+ }
1074
+ tmpReg := int16 (x86 .REG_X14 )
1075
+ n := v .AuxInt
1076
+ if n < 16 {
1077
+ v .Fatalf ("Move too small %d" , n )
1078
+ }
1079
+ // move 16 bytes from srcReg+off to dstReg+off.
1080
+ move16 := func (off int64 ) {
1081
+ move16 (s , srcReg , dstReg , tmpReg , off )
1082
+ }
1083
+
1084
+ // Generate copying instructions.
1085
+ var off int64
1086
+ for n >= 16 {
1087
+ move16 (off )
1088
+ off += 16
1089
+ n -= 16
1090
+ }
1091
+ if n != 0 {
1092
+ // use partially overlapped read/write.
1093
+ // TODO: use smaller operations when we can?
1094
+ move16 (off + n - 16 )
1095
+ }
1096
+
1097
+ case ssa .OpAMD64LoweredMoveLoop :
1098
+ dstReg := v .Args [0 ].Reg ()
1099
+ srcReg := v .Args [1 ].Reg ()
1100
+ if dstReg == srcReg {
1101
+ break
1102
+ }
1103
+ countReg := v .RegTmp ()
1104
+ tmpReg := int16 (x86 .REG_X14 )
1105
+ n := v .AuxInt
1106
+ loopSize := int64 (64 )
1107
+ if n < 3 * loopSize {
1108
+ // - a loop count of 0 won't work.
1109
+ // - a loop count of 1 is useless.
1110
+ // - a loop count of 2 is a code size ~tie
1111
+ // 4 instructions to implement the loop
1112
+ // 4 instructions in the loop body
1113
+ // vs
1114
+ // 8 instructions in the straightline code
1115
+ // Might as well use straightline code.
1116
+ v .Fatalf ("ZeroLoop size too small %d" , n )
1117
+ }
1118
+ // move 16 bytes from srcReg+off to dstReg+off.
1119
+ move16 := func (off int64 ) {
1120
+ move16 (s , srcReg , dstReg , tmpReg , off )
1121
+ }
1122
+
1123
+ // Put iteration count in a register.
1124
+ // MOVL $n, countReg
1125
+ p := s .Prog (x86 .AMOVL )
1126
+ p .From .Type = obj .TYPE_CONST
1127
+ p .From .Offset = n / loopSize
1128
+ p .To .Type = obj .TYPE_REG
1129
+ p .To .Reg = countReg
1130
+ cntInit := p
1131
+
1132
+ // Copy loopSize bytes starting at srcReg to dstReg.
1133
+ for i := range loopSize / 16 {
1134
+ move16 (i * 16 )
1135
+ }
1136
+ // ADDQ $loopSize, srcReg
1137
+ p = s .Prog (x86 .AADDQ )
1138
+ p .From .Type = obj .TYPE_CONST
1139
+ p .From .Offset = loopSize
1140
+ p .To .Type = obj .TYPE_REG
1141
+ p .To .Reg = srcReg
1142
+ // ADDQ $loopSize, dstReg
1143
+ p = s .Prog (x86 .AADDQ )
1144
+ p .From .Type = obj .TYPE_CONST
1145
+ p .From .Offset = loopSize
1146
+ p .To .Type = obj .TYPE_REG
1147
+ p .To .Reg = dstReg
1148
+ // DECL countReg
1149
+ p = s .Prog (x86 .ADECL )
1150
+ p .To .Type = obj .TYPE_REG
1151
+ p .To .Reg = countReg
1152
+ // Jump to loop header if we're not done yet.
1153
+ // JNE head
1154
+ p = s .Prog (x86 .AJNE )
1155
+ p .To .Type = obj .TYPE_BRANCH
1156
+ p .To .SetTarget (cntInit .Link )
1157
+
1158
+ // Multiples of the loop size are now done.
1159
+ n %= loopSize
1160
+
1161
+ // Copy any fractional portion.
1162
+ var off int64
1163
+ for n >= 16 {
1164
+ move16 (off )
1165
+ off += 16
1166
+ n -= 16
1167
+ }
1168
+ if n != 0 {
1169
+ // Use partially-overlapping copy.
1170
+ move16 (off + n - 16 )
1113
1171
}
1114
- p .To .Offset = 14 * (64 - v .AuxInt / 16 )
1115
- // 14 and 64 are magic constants. 14 is the number of bytes to encode:
1116
- // MOVUPS (SI), X0
1117
- // ADDQ $16, SI
1118
- // MOVUPS X0, (DI)
1119
- // ADDQ $16, DI
1120
- // and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.
1121
1172
1122
1173
case ssa .OpCopy : // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
1123
1174
if v .Type .IsMemory () {
@@ -1709,3 +1760,21 @@ func zero16(s *ssagen.State, reg int16, off int64) {
1709
1760
p .To .Reg = reg
1710
1761
p .To .Offset = off
1711
1762
}
1763
+
1764
+ // move 16 bytes from src+off to dst+off using temporary register tmp.
1765
+ func move16 (s * ssagen.State , src , dst , tmp int16 , off int64 ) {
1766
+ // MOVUPS off(srcReg), tmpReg
1767
+ // MOVUPS tmpReg, off(dstReg)
1768
+ p := s .Prog (x86 .AMOVUPS )
1769
+ p .From .Type = obj .TYPE_MEM
1770
+ p .From .Reg = src
1771
+ p .From .Offset = off
1772
+ p .To .Type = obj .TYPE_REG
1773
+ p .To .Reg = tmp
1774
+ p = s .Prog (x86 .AMOVUPS )
1775
+ p .From .Type = obj .TYPE_REG
1776
+ p .From .Reg = tmp
1777
+ p .To .Type = obj .TYPE_MEM
1778
+ p .To .Reg = dst
1779
+ p .To .Offset = off
1780
+ }
0 commit comments