Skip to content

Commit ec9e117

Browse files
committed
cmd/compile: use generated loops instead of DUFFCOPY on amd64
goarch: amd64
cpu: 12th Gen Intel(R) Core(TM) i7-12700
                         │    base     │                 exp                 │
                         │   sec/op    │   sec/op     vs base                │
MemmoveKnownSize112-20     1.764n ± 0%   1.247n ± 0%  -29.31% (p=0.000 n=10)
MemmoveKnownSize128-20     1.891n ± 0%   1.405n ± 1%  -25.72% (p=0.000 n=10)
MemmoveKnownSize192-20     2.521n ± 0%   2.114n ± 3%  -16.16% (p=0.000 n=10)
MemmoveKnownSize248-20     4.028n ± 0%   3.877n ± 1%   -3.75% (p=0.000 n=10)
MemmoveKnownSize256-20     3.272n ± 0%   2.961n ± 2%   -9.53% (p=0.000 n=10)
MemmoveKnownSize512-20     6.733n ± 3%   5.936n ± 4%  -11.83% (p=0.000 n=10)
MemmoveKnownSize1024-20   13.905n ± 5%   9.798n ± 9%  -29.54% (p=0.000 n=10)

Change-Id: Icc01cec0d8b072300d749a5ce76f53b3725b5c65
Reviewed-on: https://go-review.googlesource.com/c/go/+/678620
Reviewed-by: Jorropo <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
Reviewed-by: Dmitri Shuralyov <[email protected]>
Reviewed-by: Jakub Ciolek <[email protected]>
1 parent d0a64f7 commit ec9e117

File tree

8 files changed

+249
-217
lines changed

8 files changed

+249
-217
lines changed

src/cmd/compile/internal/amd64/ssa.go

Lines changed: 121 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -142,45 +142,6 @@ func memIdx(a *obj.Addr, v *ssa.Value) {
142142
a.Index = i
143143
}
144144

145-
// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
146-
// See runtime/mkduff.go.
147-
const (
148-
dzBlocks = 16 // number of MOV/ADD blocks
149-
dzBlockLen = 4 // number of clears per block
150-
dzBlockSize = 23 // size of instructions in a single block
151-
dzMovSize = 5 // size of single MOV instruction w/ offset
152-
dzLeaqSize = 4 // size of single LEAQ instruction
153-
dzClearStep = 16 // number of bytes cleared by each MOV instruction
154-
)
155-
156-
func duffStart(size int64) int64 {
157-
x, _ := duff(size)
158-
return x
159-
}
160-
func duffAdj(size int64) int64 {
161-
_, x := duff(size)
162-
return x
163-
}
164-
165-
// duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
166-
// required to use the duffzero mechanism for a block of the given size.
167-
func duff(size int64) (int64, int64) {
168-
if size < 32 || size > 1024 || size%dzClearStep != 0 {
169-
panic("bad duffzero size")
170-
}
171-
steps := size / dzClearStep
172-
blocks := steps / dzBlockLen
173-
steps %= dzBlockLen
174-
off := dzBlockSize * (dzBlocks - blocks)
175-
var adj int64
176-
if steps != 0 {
177-
off -= dzLeaqSize
178-
off -= dzMovSize * steps
179-
adj -= dzClearStep * (dzBlockLen - steps)
180-
}
181-
return off, adj
182-
}
183-
184145
func getgFromTLS(s *ssagen.State, r int16) {
185146
// See the comments in cmd/internal/obj/x86/obj6.go
186147
// near CanUse1InsnTLS for a detailed explanation of these instructions.
@@ -1104,20 +1065,110 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
11041065
zero16(off + n - 16)
11051066
}
11061067

1107-
case ssa.OpAMD64DUFFCOPY:
1108-
p := s.Prog(obj.ADUFFCOPY)
1109-
p.To.Type = obj.TYPE_ADDR
1110-
p.To.Sym = ir.Syms.Duffcopy
1111-
if v.AuxInt%16 != 0 {
1112-
v.Fatalf("bad DUFFCOPY AuxInt %v", v.AuxInt)
1068+
case ssa.OpAMD64LoweredMove:
1069+
dstReg := v.Args[0].Reg()
1070+
srcReg := v.Args[1].Reg()
1071+
if dstReg == srcReg {
1072+
break
1073+
}
1074+
tmpReg := int16(x86.REG_X14)
1075+
n := v.AuxInt
1076+
if n < 16 {
1077+
v.Fatalf("Move too small %d", n)
1078+
}
1079+
// move 16 bytes from srcReg+off to dstReg+off.
1080+
move16 := func(off int64) {
1081+
move16(s, srcReg, dstReg, tmpReg, off)
1082+
}
1083+
1084+
// Generate copying instructions.
1085+
var off int64
1086+
for n >= 16 {
1087+
move16(off)
1088+
off += 16
1089+
n -= 16
1090+
}
1091+
if n != 0 {
1092+
// use partially overlapped read/write.
1093+
// TODO: use smaller operations when we can?
1094+
move16(off + n - 16)
1095+
}
1096+
1097+
case ssa.OpAMD64LoweredMoveLoop:
1098+
dstReg := v.Args[0].Reg()
1099+
srcReg := v.Args[1].Reg()
1100+
if dstReg == srcReg {
1101+
break
1102+
}
1103+
countReg := v.RegTmp()
1104+
tmpReg := int16(x86.REG_X14)
1105+
n := v.AuxInt
1106+
loopSize := int64(64)
1107+
if n < 3*loopSize {
1108+
// - a loop count of 0 won't work.
1109+
// - a loop count of 1 is useless.
1110+
// - a loop count of 2 is a code size ~tie
1111+
// 4 instructions to implement the loop
1112+
// 4 instructions in the loop body
1113+
// vs
1114+
// 8 instructions in the straightline code
1115+
// Might as well use straightline code.
1116+
v.Fatalf("ZeroLoop size too small %d", n)
1117+
}
1118+
// move 16 bytes from srcReg+off to dstReg+off.
1119+
move16 := func(off int64) {
1120+
move16(s, srcReg, dstReg, tmpReg, off)
1121+
}
1122+
1123+
// Put iteration count in a register.
1124+
// MOVL $n, countReg
1125+
p := s.Prog(x86.AMOVL)
1126+
p.From.Type = obj.TYPE_CONST
1127+
p.From.Offset = n / loopSize
1128+
p.To.Type = obj.TYPE_REG
1129+
p.To.Reg = countReg
1130+
cntInit := p
1131+
1132+
// Copy loopSize bytes starting at srcReg to dstReg.
1133+
for i := range loopSize / 16 {
1134+
move16(i * 16)
1135+
}
1136+
// ADDQ $loopSize, srcReg
1137+
p = s.Prog(x86.AADDQ)
1138+
p.From.Type = obj.TYPE_CONST
1139+
p.From.Offset = loopSize
1140+
p.To.Type = obj.TYPE_REG
1141+
p.To.Reg = srcReg
1142+
// ADDQ $loopSize, dstReg
1143+
p = s.Prog(x86.AADDQ)
1144+
p.From.Type = obj.TYPE_CONST
1145+
p.From.Offset = loopSize
1146+
p.To.Type = obj.TYPE_REG
1147+
p.To.Reg = dstReg
1148+
// DECL countReg
1149+
p = s.Prog(x86.ADECL)
1150+
p.To.Type = obj.TYPE_REG
1151+
p.To.Reg = countReg
1152+
// Jump to loop header if we're not done yet.
1153+
// JNE head
1154+
p = s.Prog(x86.AJNE)
1155+
p.To.Type = obj.TYPE_BRANCH
1156+
p.To.SetTarget(cntInit.Link)
1157+
1158+
// Multiples of the loop size are now done.
1159+
n %= loopSize
1160+
1161+
// Copy any fractional portion.
1162+
var off int64
1163+
for n >= 16 {
1164+
move16(off)
1165+
off += 16
1166+
n -= 16
1167+
}
1168+
if n != 0 {
1169+
// Use partially-overlapping copy.
1170+
move16(off + n - 16)
11131171
}
1114-
p.To.Offset = 14 * (64 - v.AuxInt/16)
1115-
// 14 and 64 are magic constants. 14 is the number of bytes to encode:
1116-
// MOVUPS (SI), X0
1117-
// ADDQ $16, SI
1118-
// MOVUPS X0, (DI)
1119-
// ADDQ $16, DI
1120-
// and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.
11211172

11221173
case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
11231174
if v.Type.IsMemory() {
@@ -1709,3 +1760,21 @@ func zero16(s *ssagen.State, reg int16, off int64) {
17091760
p.To.Reg = reg
17101761
p.To.Offset = off
17111762
}
1763+
1764+
// move 16 bytes from src+off to dst+off using temporary register tmp.
1765+
func move16(s *ssagen.State, src, dst, tmp int16, off int64) {
1766+
// MOVUPS off(srcReg), tmpReg
1767+
// MOVUPS tmpReg, off(dstReg)
1768+
p := s.Prog(x86.AMOVUPS)
1769+
p.From.Type = obj.TYPE_MEM
1770+
p.From.Reg = src
1771+
p.From.Offset = off
1772+
p.To.Type = obj.TYPE_REG
1773+
p.To.Reg = tmp
1774+
p = s.Prog(x86.AMOVUPS)
1775+
p.From.Type = obj.TYPE_REG
1776+
p.From.Reg = tmp
1777+
p.To.Type = obj.TYPE_MEM
1778+
p.To.Reg = dst
1779+
p.To.Offset = off
1780+
}

src/cmd/compile/internal/ssa/_gen/AMD64.rules

Lines changed: 11 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -264,24 +264,6 @@
264264
(Move [8] dst src mem) => (MOVQstore dst (MOVQload src mem) mem)
265265
(Move [16] dst src mem) => (MOVOstore dst (MOVOload src mem) mem)
266266

267-
(Move [32] dst src mem) =>
268-
(Move [16]
269-
(OffPtr <dst.Type> dst [16])
270-
(OffPtr <src.Type> src [16])
271-
(Move [16] dst src mem))
272-
273-
(Move [48] dst src mem) =>
274-
(Move [32]
275-
(OffPtr <dst.Type> dst [16])
276-
(OffPtr <src.Type> src [16])
277-
(Move [16] dst src mem))
278-
279-
(Move [64] dst src mem) =>
280-
(Move [32]
281-
(OffPtr <dst.Type> dst [32])
282-
(OffPtr <src.Type> src [32])
283-
(Move [32] dst src mem))
284-
285267
(Move [3] dst src mem) =>
286268
(MOVBstore [2] dst (MOVBload [2] src mem)
287269
(MOVWstore dst (MOVWload src mem) mem))
@@ -310,28 +292,19 @@
310292
(MOVQstore [int32(s-8)] dst (MOVQload [int32(s-8)] src mem)
311293
(MOVQstore dst (MOVQload src mem) mem))
312294

313-
// Adjust moves to be a multiple of 16 bytes.
314-
(Move [s] dst src mem)
315-
&& s > 16 && s%16 != 0 && s%16 <= 8 =>
316-
(Move [s-s%16]
317-
(OffPtr <dst.Type> dst [s%16])
318-
(OffPtr <src.Type> src [s%16])
319-
(MOVQstore dst (MOVQload src mem) mem))
320-
(Move [s] dst src mem)
321-
&& s > 16 && s%16 != 0 && s%16 > 8 =>
322-
(Move [s-s%16]
323-
(OffPtr <dst.Type> dst [s%16])
324-
(OffPtr <src.Type> src [s%16])
325-
(MOVOstore dst (MOVOload src mem) mem))
326-
327-
// Medium copying uses a duff device.
328-
(Move [s] dst src mem)
329-
&& s > 64 && s <= 16*64 && s%16 == 0
330-
&& logLargeCopy(v, s) =>
331-
(DUFFCOPY [s] dst src mem)
295+
// Copying up to 192 bytes uses straightline code.
296+
(Move [s] dst src mem) && s > 16 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem)
297+
298+
// Copying up to ~1KB uses a small loop.
299+
(Move [s] dst src mem) && s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem)
332300

333301
// Large copying uses REP MOVSQ.
334-
(Move [s] dst src mem) && s > 16*64 && s%8 == 0 && logLargeCopy(v, s) =>
302+
(Move [s] dst src mem) && s > repMoveThreshold && s%8 != 0 =>
303+
(Move [s-s%8]
304+
(OffPtr <dst.Type> dst [s%8])
305+
(OffPtr <src.Type> src [s%8])
306+
(MOVQstore dst (MOVQload src mem) mem))
307+
(Move [s] dst src mem) && s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s) =>
335308
(REPMOVSQ dst src (MOVQconst [s/8]) mem)
336309

337310
// Lowering Zero instructions

src/cmd/compile/internal/ssa/_gen/AMD64Ops.go

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -939,20 +939,38 @@ func init() {
939939
// arg0 = destination pointer
940940
// arg1 = source pointer
941941
// arg2 = mem
942-
// auxint = # of bytes to copy, must be multiple of 16
942+
// auxint = # of bytes to copy
943943
// returns memory
944944
{
945-
name: "DUFFCOPY",
945+
name: "LoweredMove",
946946
aux: "Int64",
947947
argLength: 3,
948948
reg: regInfo{
949-
inputs: []regMask{buildReg("DI"), buildReg("SI")},
950-
clobbers: buildReg("DI SI X0"), // uses X0 as a temporary
949+
inputs: []regMask{gp, gp},
950+
clobbers: buildReg("X14"), // uses X14 as a temporary
951951
},
952-
clobberFlags: true,
953-
//faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
954-
//faultOnNilArg1: true,
955-
unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
952+
faultOnNilArg0: true,
953+
faultOnNilArg1: true,
954+
},
955+
// arg0 = destination pointer
956+
// arg1 = source pointer
957+
// arg2 = mem
958+
// auxint = # of bytes to copy
959+
// returns memory
960+
{
961+
name: "LoweredMoveLoop",
962+
aux: "Int64",
963+
argLength: 3,
964+
reg: regInfo{
965+
inputs: []regMask{gp, gp},
966+
clobbers: buildReg("X14"), // uses X14 as a temporary
967+
clobbersArg0: true,
968+
clobbersArg1: true,
969+
},
970+
clobberFlags: true,
971+
faultOnNilArg0: true,
972+
faultOnNilArg1: true,
973+
needIntTemp: true,
956974
},
957975

958976
// arg0 = destination pointer

src/cmd/compile/internal/ssa/opGen.go

Lines changed: 28 additions & 9 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/cmd/compile/internal/ssa/regalloc.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,14 @@ func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, pos
561561
pos = pos.WithNotStmt()
562562
// Check if v is already in a requested register.
563563
if mask&vi.regs != 0 {
564-
r := pickReg(mask & vi.regs)
564+
mask &= vi.regs
565+
r := pickReg(mask)
566+
if mask.contains(s.SPReg) {
567+
// Prefer the stack pointer if it is allowed.
568+
// (Needed because the op might have an Aux symbol
569+
// that needs SP as its base.)
570+
r = s.SPReg
571+
}
565572
if !s.allocatable.contains(r) {
566573
return v // v is in a fixed register
567574
}

0 commit comments

Comments
 (0)