
Commit 4e182db

randall77 authored and gopherbot committed

Revert "cmd/compile: use generated loops instead of DUFFCOPY on amd64"

This reverts commit ec9e117 (CL 678620).

Reason for revert: causing regalloc to get into an infinite loop.

Change-Id: Ie53c58c6126804af6d6883ea4acdcfb632a172bd
Reviewed-on: https://go-review.googlesource.com/c/go/+/695196
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Dmitri Shuralyov <[email protected]>
Auto-Submit: Keith Randall <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
Reviewed-by: Dmitri Shuralyov <[email protected]>

1 parent d2b3c1a commit 4e182db

File tree

8 files changed: +217 -249 lines changed


src/cmd/compile/internal/amd64/ssa.go

Lines changed: 52 additions & 121 deletions
@@ -142,6 +142,45 @@ func memIdx(a *obj.Addr, v *ssa.Value) {
 	a.Index = i
 }
 
+// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
+// See runtime/mkduff.go.
+const (
+	dzBlocks    = 16 // number of MOV/ADD blocks
+	dzBlockLen  = 4  // number of clears per block
+	dzBlockSize = 23 // size of instructions in a single block
+	dzMovSize   = 5  // size of single MOV instruction w/ offset
+	dzLeaqSize  = 4  // size of single LEAQ instruction
+	dzClearStep = 16 // number of bytes cleared by each MOV instruction
+)
+
+func duffStart(size int64) int64 {
+	x, _ := duff(size)
+	return x
+}
+func duffAdj(size int64) int64 {
+	_, x := duff(size)
+	return x
+}
+
+// duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
+// required to use the duffzero mechanism for a block of the given size.
+func duff(size int64) (int64, int64) {
+	if size < 32 || size > 1024 || size%dzClearStep != 0 {
+		panic("bad duffzero size")
+	}
+	steps := size / dzClearStep
+	blocks := steps / dzBlockLen
+	steps %= dzBlockLen
+	off := dzBlockSize * (dzBlocks - blocks)
+	var adj int64
+	if steps != 0 {
+		off -= dzLeaqSize
+		off -= dzMovSize * steps
+		adj -= dzClearStep * (dzBlockLen - steps)
+	}
+	return off, adj
+}
+
 func getgFromTLS(s *ssagen.State, r int16) {
 	// See the comments in cmd/internal/obj/x86/obj6.go
 	// near CanUse1InsnTLS for a detailed explanation of these instructions.
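
The restored duff helper turns a zeroing size into two numbers: how far into the duffzero body to jump, and how much to pre-adjust the data pointer when entry lands in the middle of a block. Below is a minimal standalone sketch of that arithmetic, with the constants copied from the hunk above; it is illustrative only, not the compiler's code.

package main

import "fmt"

// Constants copied from the diff above; see runtime/mkduff.go.
const (
	dzBlocks    = 16 // number of MOV/ADD blocks
	dzBlockLen  = 4  // number of clears per block
	dzBlockSize = 23 // bytes of instructions in a single block
	dzMovSize   = 5  // bytes of a single MOV instruction w/ offset
	dzLeaqSize  = 4  // bytes of a single LEAQ instruction
	dzClearStep = 16 // bytes cleared by each MOV instruction
)

// duff restates the helper above: it returns the entry offset into
// duffzero and the pointer pre-adjustment for clearing size bytes.
func duff(size int64) (off, adj int64) {
	if size < 32 || size > 1024 || size%dzClearStep != 0 {
		panic("bad duffzero size")
	}
	steps := size / dzClearStep
	blocks := steps / dzBlockLen
	steps %= dzBlockLen
	off = dzBlockSize * (dzBlocks - blocks)
	if steps != 0 {
		// Entry lands mid-block: back up over the preceding block's
		// LEAQ and the MOVs that should run, and pre-adjust the
		// pointer to account for the MOVs that are skipped.
		off -= dzLeaqSize
		off -= dzMovSize * steps
		adj -= dzClearStep * (dzBlockLen - steps)
	}
	return off, adj
}

func main() {
	fmt.Println(duff(1024)) // 0 0: enter at the top and run all 16 blocks
	fmt.Println(duff(240))  // 280 -16: 3 full blocks plus 3 of a block's 4 MOVs
}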
@@ -1065,110 +1104,20 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 			zero16(off + n - 16)
 		}
 
-	case ssa.OpAMD64LoweredMove:
-		dstReg := v.Args[0].Reg()
-		srcReg := v.Args[1].Reg()
-		if dstReg == srcReg {
-			break
-		}
-		tmpReg := int16(x86.REG_X14)
-		n := v.AuxInt
-		if n < 16 {
-			v.Fatalf("Move too small %d", n)
-		}
-		// move 16 bytes from srcReg+off to dstReg+off.
-		move16 := func(off int64) {
-			move16(s, srcReg, dstReg, tmpReg, off)
-		}
-
-		// Generate copying instructions.
-		var off int64
-		for n >= 16 {
-			move16(off)
-			off += 16
-			n -= 16
-		}
-		if n != 0 {
-			// use partially overlapped read/write.
-			// TODO: use smaller operations when we can?
-			move16(off + n - 16)
-		}
-
-	case ssa.OpAMD64LoweredMoveLoop:
-		dstReg := v.Args[0].Reg()
-		srcReg := v.Args[1].Reg()
-		if dstReg == srcReg {
-			break
-		}
-		countReg := v.RegTmp()
-		tmpReg := int16(x86.REG_X14)
-		n := v.AuxInt
-		loopSize := int64(64)
-		if n < 3*loopSize {
-			// - a loop count of 0 won't work.
-			// - a loop count of 1 is useless.
-			// - a loop count of 2 is a code size ~tie
-			//     4 instructions to implement the loop
-			//     4 instructions in the loop body
-			//   vs
-			//     8 instructions in the straightline code
-			//   Might as well use straightline code.
-			v.Fatalf("ZeroLoop size too small %d", n)
-		}
-		// move 16 bytes from srcReg+off to dstReg+off.
-		move16 := func(off int64) {
-			move16(s, srcReg, dstReg, tmpReg, off)
-		}
-
-		// Put iteration count in a register.
-		//   MOVL $n, countReg
-		p := s.Prog(x86.AMOVL)
-		p.From.Type = obj.TYPE_CONST
-		p.From.Offset = n / loopSize
-		p.To.Type = obj.TYPE_REG
-		p.To.Reg = countReg
-		cntInit := p
-
-		// Copy loopSize bytes starting at srcReg to dstReg.
-		for i := range loopSize / 16 {
-			move16(i * 16)
-		}
-		//   ADDQ $loopSize, srcReg
-		p = s.Prog(x86.AADDQ)
-		p.From.Type = obj.TYPE_CONST
-		p.From.Offset = loopSize
-		p.To.Type = obj.TYPE_REG
-		p.To.Reg = srcReg
-		//   ADDQ $loopSize, dstReg
-		p = s.Prog(x86.AADDQ)
-		p.From.Type = obj.TYPE_CONST
-		p.From.Offset = loopSize
-		p.To.Type = obj.TYPE_REG
-		p.To.Reg = dstReg
-		//   DECL countReg
-		p = s.Prog(x86.ADECL)
-		p.To.Type = obj.TYPE_REG
-		p.To.Reg = countReg
-		// Jump to loop header if we're not done yet.
-		//   JNE head
-		p = s.Prog(x86.AJNE)
-		p.To.Type = obj.TYPE_BRANCH
-		p.To.SetTarget(cntInit.Link)
-
-		// Multiples of the loop size are now done.
-		n %= loopSize
-
-		// Copy any fractional portion.
-		var off int64
-		for n >= 16 {
-			move16(off)
-			off += 16
-			n -= 16
-		}
-		if n != 0 {
-			// Use partially-overlapping copy.
-			move16(off + n - 16)
+	case ssa.OpAMD64DUFFCOPY:
+		p := s.Prog(obj.ADUFFCOPY)
+		p.To.Type = obj.TYPE_ADDR
+		p.To.Sym = ir.Syms.Duffcopy
+		if v.AuxInt%16 != 0 {
+			v.Fatalf("bad DUFFCOPY AuxInt %v", v.AuxInt)
 		}
+		p.To.Offset = 14 * (64 - v.AuxInt/16)
+		// 14 and 64 are magic constants. 14 is the number of bytes to encode:
+		//	MOVUPS	(SI), X0
+		//	ADDQ	$16, SI
+		//	MOVUPS	X0, (DI)
+		//	ADDQ	$16, DI
+		// and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.
 
 	case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
 		if v.Type.IsMemory() {
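
The DUFFCOPY case, by contrast, emits no per-size copy instructions: it jumps into runtime's duffcopy at a distance from the end proportional to the copy size, using the 14-byte block size and 64-block count noted above. A short sketch of just that offset arithmetic; duffcopyOffset is an illustrative name, not a compiler function.

package main

import "fmt"

// duffcopyOffset mirrors p.To.Offset in the DUFFCOPY case above: a copy
// of n bytes needs n/16 blocks, so it enters 64 - n/16 blocks (of 14
// bytes each) before the end of duffcopy.
func duffcopyOffset(n int64) int64 {
	if n%16 != 0 || n <= 64 || n > 16*64 {
		panic("size outside the DUFFCOPY range") // range per AMD64.rules below
	}
	return 14 * (64 - n/16)
}

func main() {
	fmt.Println(duffcopyOffset(1024)) // 0: run all 64 blocks
	fmt.Println(duffcopyOffset(80))   // 826: run only the last 5 blocks
}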
@@ -1760,21 +1709,3 @@ func zero16(s *ssagen.State, reg int16, off int64) {
 	p.To.Reg = reg
 	p.To.Offset = off
 }
-
-// move 16 bytes from src+off to dst+off using temporary register tmp.
-func move16(s *ssagen.State, src, dst, tmp int16, off int64) {
-	// MOVUPS off(srcReg), tmpReg
-	// MOVUPS tmpReg, off(dstReg)
-	p := s.Prog(x86.AMOVUPS)
-	p.From.Type = obj.TYPE_MEM
-	p.From.Reg = src
-	p.From.Offset = off
-	p.To.Type = obj.TYPE_REG
-	p.To.Reg = tmp
-	p = s.Prog(x86.AMOVUPS)
-	p.From.Type = obj.TYPE_REG
-	p.From.Reg = tmp
-	p.To.Type = obj.TYPE_MEM
-	p.To.Reg = dst
-	p.To.Offset = off
-}

src/cmd/compile/internal/ssa/_gen/AMD64.rules

Lines changed: 38 additions & 11 deletions
@@ -264,6 +264,24 @@
 (Move [8] dst src mem) => (MOVQstore dst (MOVQload src mem) mem)
 (Move [16] dst src mem) => (MOVOstore dst (MOVOload src mem) mem)
 
+(Move [32] dst src mem) =>
+	(Move [16]
+		(OffPtr <dst.Type> dst [16])
+		(OffPtr <src.Type> src [16])
+		(Move [16] dst src mem))
+
+(Move [48] dst src mem) =>
+	(Move [32]
+		(OffPtr <dst.Type> dst [16])
+		(OffPtr <src.Type> src [16])
+		(Move [16] dst src mem))
+
+(Move [64] dst src mem) =>
+	(Move [32]
+		(OffPtr <dst.Type> dst [32])
+		(OffPtr <src.Type> src [32])
+		(Move [32] dst src mem))
+
 (Move [3] dst src mem) =>
 	(MOVBstore [2] dst (MOVBload [2] src mem)
 		(MOVWstore dst (MOVWload src mem) mem))
@@ -292,19 +310,28 @@
 		(MOVQstore [int32(s-8)] dst (MOVQload [int32(s-8)] src mem)
 			(MOVQstore dst (MOVQload src mem) mem))
 
-// Copying up to 192 bytes uses straightline code.
-(Move [s] dst src mem) && s > 16 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem)
-
-// Copying up to ~1KB uses a small loop.
-(Move [s] dst src mem) && s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem)
+// Adjust moves to be a multiple of 16 bytes.
+(Move [s] dst src mem)
+	&& s > 16 && s%16 != 0 && s%16 <= 8 =>
+	(Move [s-s%16]
+		(OffPtr <dst.Type> dst [s%16])
+		(OffPtr <src.Type> src [s%16])
+		(MOVQstore dst (MOVQload src mem) mem))
+(Move [s] dst src mem)
+	&& s > 16 && s%16 != 0 && s%16 > 8 =>
+	(Move [s-s%16]
+		(OffPtr <dst.Type> dst [s%16])
+		(OffPtr <src.Type> src [s%16])
+		(MOVOstore dst (MOVOload src mem) mem))
+
+// Medium copying uses a duff device.
+(Move [s] dst src mem)
+	&& s > 64 && s <= 16*64 && s%16 == 0
+	&& logLargeCopy(v, s) =>
+	(DUFFCOPY [s] dst src mem)
 
 // Large copying uses REP MOVSQ.
-(Move [s] dst src mem) && s > repMoveThreshold && s%8 != 0 =>
-	(Move [s-s%8]
-		(OffPtr <dst.Type> dst [s%8])
-		(OffPtr <src.Type> src [s%8])
-		(MOVQstore dst (MOVQload src mem) mem))
-(Move [s] dst src mem) && s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s) =>
+(Move [s] dst src mem) && s > 16*64 && s%8 == 0 && logLargeCopy(v, s) =>
 	(REPMOVSQ dst src (MOVQconst [s/8]) mem)
 
 // Lowering Zero instructions
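
Taken together, the restored rules choose a strategy by size: small fixed sizes get dedicated rules, a non-multiple-of-16 size is first trimmed with one overlapping 8- or 16-byte load/store pair, 16-byte multiples up to 1 KiB go through the duff device, and anything larger uses REP MOVSQ. A rough sketch of that classification over sizes alone; the real rules match SSA values, and 32/48/64-byte moves are also unrolled by the dedicated rules above.

package main

import "fmt"

// classifyMove reports which of the restored rewrite rules would fire
// for an amd64 Move of s bytes. Purely illustrative.
func classifyMove(s int64) string {
	switch {
	case s > 16 && s%16 != 0 && s%16 <= 8:
		return fmt.Sprintf("overlapping MOVQ prefix, then Move[%d] at offset %d", s-s%16, s%16)
	case s > 16 && s%16 != 0:
		return fmt.Sprintf("overlapping MOVO prefix, then Move[%d] at offset %d", s-s%16, s%16)
	case s > 64 && s <= 16*64:
		return "DUFFCOPY"
	case s > 16*64 && s%8 == 0:
		return fmt.Sprintf("REP MOVSQ with count %d", s/8)
	default:
		return "handled by the fixed-size rules"
	}
}

func main() {
	fmt.Println(classifyMove(100))  // overlapping MOVQ prefix, then Move[96] at offset 4
	fmt.Println(classifyMove(200))  // overlapping MOVQ prefix, then Move[192] at offset 8
	fmt.Println(classifyMove(512))  // DUFFCOPY
	fmt.Println(classifyMove(4096)) // REP MOVSQ with count 512
}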

src/cmd/compile/internal/ssa/_gen/AMD64Ops.go

Lines changed: 8 additions & 26 deletions
@@ -939,38 +939,20 @@ func init() {
 		// arg0 = destination pointer
 		// arg1 = source pointer
 		// arg2 = mem
-		// auxint = # of bytes to copy
+		// auxint = # of bytes to copy, must be multiple of 16
 		// returns memory
 		{
-			name:      "LoweredMove",
+			name:      "DUFFCOPY",
 			aux:       "Int64",
 			argLength: 3,
 			reg: regInfo{
-				inputs:   []regMask{gp, gp},
-				clobbers: buildReg("X14"), // uses X14 as a temporary
+				inputs:   []regMask{buildReg("DI"), buildReg("SI")},
+				clobbers: buildReg("DI SI X0"), // uses X0 as a temporary
 			},
-			faultOnNilArg0: true,
-			faultOnNilArg1: true,
-		},
-		// arg0 = destination pointer
-		// arg1 = source pointer
-		// arg2 = mem
-		// auxint = # of bytes to copy
-		// returns memory
-		{
-			name:      "LoweredMoveLoop",
-			aux:       "Int64",
-			argLength: 3,
-			reg: regInfo{
-				inputs:       []regMask{gp, gp},
-				clobbers:     buildReg("X14"), // uses X14 as a temporary
-				clobbersArg0: true,
-				clobbersArg1: true,
-			},
-			clobberFlags:   true,
-			faultOnNilArg0: true,
-			faultOnNilArg1: true,
-			needIntTemp:    true,
+			clobberFlags: true,
+			//faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
+			//faultOnNilArg1: true,
+			unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
 		},
 
 		// arg0 = destination pointer
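
The fixed DI/SI inputs and the DI/SI/X0 clobbers mirror the body of runtime's duffcopy, which is generated as 64 identical copy blocks. A sketch in the spirit of runtime/mkduff.go, printing the assembly directly rather than using the real generator's buffer and helpers (the TEXT header here is an assumption, simplified from the runtime's):

package main

import "fmt"

func main() {
	// Emit the 64 copy blocks DUFFCOPY jumps into; each block is the
	// 14-byte MOVUPS/ADDQ/MOVUPS/ADDQ sequence quoted in ssa.go above.
	fmt.Println("TEXT runtime·duffcopy(SB), NOSPLIT, $0-0")
	for i := 0; i < 64; i++ {
		fmt.Println("\tMOVUPS\t(SI), X0")
		fmt.Println("\tADDQ\t$16, SI")
		fmt.Println("\tMOVUPS\tX0, (DI)")
		fmt.Println("\tADDQ\t$16, DI")
	}
	fmt.Println("\tRET")
}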

src/cmd/compile/internal/ssa/opGen.go

Lines changed: 9 additions & 28 deletions
Some generated files are not rendered by default.

src/cmd/compile/internal/ssa/regalloc.go

Lines changed: 1 addition & 8 deletions
@@ -561,14 +561,7 @@ func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, pos
 	pos = pos.WithNotStmt()
 	// Check if v is already in a requested register.
 	if mask&vi.regs != 0 {
-		mask &= vi.regs
-		r := pickReg(mask)
-		if mask.contains(s.SPReg) {
-			// Prefer the stack pointer if it is allowed.
-			// (Needed because the op might have an Aux symbol
-			// that needs SP as its base.)
-			r = s.SPReg
-		}
+		r := pickReg(mask & vi.regs)
 		if !s.allocatable.contains(r) {
 			return v // v is in a fixed register
 		}
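
This hunk restores allocValToReg to picking any allowed register the value already occupies, dropping the stack-pointer preference that CL 678620 added and that the commit message ties to the regalloc infinite loop. A toy sketch contrasting the two selection behaviors, with a simplified regMask and made-up register numbering:

package main

import "fmt"

// regMask in the style of the allocator: bit i set means register i is
// a candidate. Numbering here is invented for illustration.
type regMask uint64

func (m regMask) contains(r int8) bool { return m>>uint(r)&1 != 0 }

// pickReg-style choice: take the lowest-numbered register in the mask.
func pickReg(m regMask) int8 {
	if m == 0 {
		panic("can't pick a register from an empty set")
	}
	var r int8
	for m&1 == 0 {
		m >>= 1
		r++
	}
	return r
}

func main() {
	const spReg = int8(4)        // hypothetical SP register number
	mask := regMask(1<<2 | 1<<4) // value already lives in registers 2 and SP

	// Restored behavior: just pick a register from the mask.
	fmt.Println(pickReg(mask)) // 2

	// Reverted behavior: prefer SP whenever it is allowed.
	r := pickReg(mask)
	if mask.contains(spReg) {
		r = spReg
	}
	fmt.Println(r) // 4
}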
