Skip to content

Commit eb7f515

Browse files
committed
cmd/compile: use generated loops instead of DUFFZERO on amd64
goarch: amd64
cpu: 12th Gen Intel(R) Core(TM) i7-12700
                         │     base     │                 exp                 │
                         │    sec/op    │    sec/op     vs base               │
MemclrKnownSize112-20      1.270n ± 14%   1.006n ± 0%  -20.72% (p=0.000 n=10)
MemclrKnownSize128-20      1.266n ±  0%   1.005n ± 0%  -20.58% (p=0.000 n=10)
MemclrKnownSize192-20      1.771n ±  0%   1.579n ± 1%  -10.84% (p=0.000 n=10)
MemclrKnownSize248-20      4.034n ±  0%   3.520n ± 0%  -12.75% (p=0.000 n=10)
MemclrKnownSize256-20      2.269n ±  0%   2.014n ± 0%  -11.26% (p=0.000 n=10)
MemclrKnownSize512-20      4.280n ±  0%   4.030n ± 0%   -5.84% (p=0.000 n=10)
MemclrKnownSize1024-20     8.309n ±  1%   8.057n ± 0%   -3.03% (p=0.000 n=10)

Change-Id: I8f1627e2a1e981ff351dc7178932b32a2627f765
Reviewed-on: https://go-review.googlesource.com/c/go/+/678937
Reviewed-by: Keith Randall <[email protected]>
Reviewed-by: Cherry Mui <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
1 parent c0ee2fd commit eb7f515

File tree

8 files changed

+216
-141
lines changed

8 files changed

+216
-141
lines changed

src/cmd/compile/internal/amd64/ssa.go

Lines changed: 103 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1007,26 +1007,103 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
10071007
ssagen.AddAux(&p.From, v)
10081008
p.To.Type = obj.TYPE_REG
10091009
p.To.Reg = v.Reg()
1010-
case ssa.OpAMD64DUFFZERO:
1010+
1011+
case ssa.OpAMD64LoweredZero:
10111012
if s.ABI != obj.ABIInternal {
10121013
// zero X15 manually
10131014
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
10141015
}
1015-
off := duffStart(v.AuxInt)
1016-
adj := duffAdj(v.AuxInt)
1017-
var p *obj.Prog
1018-
if adj != 0 {
1019-
p = s.Prog(x86.ALEAQ)
1020-
p.From.Type = obj.TYPE_MEM
1021-
p.From.Offset = adj
1022-
p.From.Reg = x86.REG_DI
1023-
p.To.Type = obj.TYPE_REG
1024-
p.To.Reg = x86.REG_DI
1016+
ptrReg := v.Args[0].Reg()
1017+
n := v.AuxInt
1018+
if n < 16 {
1019+
v.Fatalf("Zero too small %d", n)
10251020
}
1026-
p = s.Prog(obj.ADUFFZERO)
1027-
p.To.Type = obj.TYPE_ADDR
1028-
p.To.Sym = ir.Syms.Duffzero
1029-
p.To.Offset = off
1021+
zero16 := func(off int64) {
1022+
zero16(s, ptrReg, off)
1023+
}
1024+
1025+
// Generate zeroing instructions.
1026+
var off int64
1027+
for n >= 16 {
1028+
zero16(off)
1029+
off += 16
1030+
n -= 16
1031+
}
1032+
if n != 0 {
1033+
// use partially overlapped write.
1034+
// TODO: n <= 8, use smaller write?
1035+
zero16(off + n - 16)
1036+
}
1037+
1038+
case ssa.OpAMD64LoweredZeroLoop:
1039+
if s.ABI != obj.ABIInternal {
1040+
// zero X15 manually
1041+
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
1042+
}
1043+
ptrReg := v.Args[0].Reg()
1044+
countReg := v.RegTmp()
1045+
n := v.AuxInt
1046+
loopSize := int64(64)
1047+
if n < 3*loopSize {
1048+
// - a loop count of 0 won't work.
1049+
// - a loop count of 1 is useless.
1050+
// - a loop count of 2 is a code size ~tie
1051+
// 4 instructions to implement the loop
1052+
// 4 instructions in the loop body
1053+
// vs
1054+
// 8 instructions in the straightline code
1055+
// Might as well use straightline code.
1056+
v.Fatalf("ZeroLoop size too small %d", n)
1057+
}
1058+
zero16 := func(off int64) {
1059+
zero16(s, ptrReg, off)
1060+
}
1061+
1062+
// Put iteration count in a register.
1063+
// MOVL $(n/loopSize), countReg
1064+
p := s.Prog(x86.AMOVL)
1065+
p.From.Type = obj.TYPE_CONST
1066+
p.From.Offset = n / loopSize
1067+
p.To.Type = obj.TYPE_REG
1068+
p.To.Reg = countReg
1069+
cntInit := p
1070+
1071+
// Zero loopSize bytes starting at ptrReg.
1072+
for i := range loopSize / 16 {
1073+
zero16(i * 16)
1074+
}
1075+
// ADDQ $loopSize, ptrReg
1076+
p = s.Prog(x86.AADDQ)
1077+
p.From.Type = obj.TYPE_CONST
1078+
p.From.Offset = loopSize
1079+
p.To.Type = obj.TYPE_REG
1080+
p.To.Reg = ptrReg
1081+
// DECL countReg
1082+
p = s.Prog(x86.ADECL)
1083+
p.To.Type = obj.TYPE_REG
1084+
p.To.Reg = countReg
1085+
// Jump to first instruction in loop if we're not done yet.
1086+
// JNE head
1087+
p = s.Prog(x86.AJNE)
1088+
p.To.Type = obj.TYPE_BRANCH
1089+
p.To.SetTarget(cntInit.Link)
1090+
1091+
// Multiples of the loop size are now done.
1092+
n %= loopSize
1093+
1094+
// Write any fractional portion.
1095+
var off int64
1096+
for n >= 16 {
1097+
zero16(off)
1098+
off += 16
1099+
n -= 16
1100+
}
1101+
if n != 0 {
1102+
// Use partially-overlapping write.
1103+
// TODO: n <= 8, use smaller write?
1104+
zero16(off + n - 16)
1105+
}
1106+
10301107
case ssa.OpAMD64DUFFCOPY:
10311108
p := s.Prog(obj.ADUFFCOPY)
10321109
p.To.Type = obj.TYPE_ADDR
@@ -1621,3 +1698,14 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
16211698
p.Pos = p.Pos.WithNotStmt()
16221699
return p
16231700
}
1701+
1702+
// zero16 emits a MOVUPS store of X15 to reg+off, zeroing 16 bytes there (assumes X15 holds zero).
1703+
func zero16(s *ssagen.State, reg int16, off int64) {
1704+
// MOVUPS X15, off(reg)
1705+
p := s.Prog(x86.AMOVUPS)
1706+
p.From.Type = obj.TYPE_REG
1707+
p.From.Reg = x86.REG_X15
1708+
p.To.Type = obj.TYPE_MEM
1709+
p.To.Reg = reg
1710+
p.To.Offset = off
1711+
}

src/cmd/compile/internal/ssa/_gen/AMD64.rules

Lines changed: 9 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -375,34 +375,17 @@
375375
(MOVQstoreconst [makeValAndOff(0,int32(s-8))] destptr
376376
(MOVQstoreconst [makeValAndOff(0,0)] destptr mem))
377377

378-
// Adjust zeros to be a multiple of 16 bytes.
379-
(Zero [s] destptr mem) && s%16 != 0 && s > 16 =>
380-
(Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16])
381-
(MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
382-
383-
(Zero [16] destptr mem) =>
384-
(MOVOstoreconst [makeValAndOff(0,0)] destptr mem)
385-
(Zero [32] destptr mem) =>
386-
(MOVOstoreconst [makeValAndOff(0,16)] destptr
387-
(MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
388-
(Zero [48] destptr mem) =>
389-
(MOVOstoreconst [makeValAndOff(0,32)] destptr
390-
(MOVOstoreconst [makeValAndOff(0,16)] destptr
391-
(MOVOstoreconst [makeValAndOff(0,0)] destptr mem)))
392-
(Zero [64] destptr mem) =>
393-
(MOVOstoreconst [makeValAndOff(0,48)] destptr
394-
(MOVOstoreconst [makeValAndOff(0,32)] destptr
395-
(MOVOstoreconst [makeValAndOff(0,16)] destptr
396-
(MOVOstoreconst [makeValAndOff(0,0)] destptr mem))))
397-
398-
// Medium zeroing uses a duff device.
399-
(Zero [s] destptr mem)
400-
&& s > 64 && s <= 1024 && s%16 == 0 =>
401-
(DUFFZERO [s] destptr mem)
378+
// Zeroing up to 192 bytes uses straightline code.
379+
(Zero [s] destptr mem) && s >= 16 && s < 192 => (LoweredZero [s] destptr mem)
380+
381+
// Zeroing up to ~1KB uses a small loop.
382+
(Zero [s] destptr mem) && s >= 192 && s <= repZeroThreshold => (LoweredZeroLoop [s] destptr mem)
402383

403384
// Large zeroing uses REP STOSQ.
404-
(Zero [s] destptr mem)
405-
&& s > 1024 && s%8 == 0 =>
385+
(Zero [s] destptr mem) && s > repZeroThreshold && s%8 != 0 =>
386+
(Zero [s-s%8] (OffPtr <destptr.Type> destptr [s%8])
387+
(MOVOstoreconst [makeValAndOff(0,0)] destptr mem))
388+
(Zero [s] destptr mem) && s > repZeroThreshold && s%8 == 0 =>
406389
(REPSTOSQ destptr (MOVQconst [s/8]) (MOVQconst [0]) mem)
407390

408391
// Lowering constants

src/cmd/compile/internal/ssa/_gen/AMD64Ops.go

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -889,15 +889,30 @@ func init() {
889889
// auxint = # of bytes to zero
890890
// returns mem
891891
{
892-
name: "DUFFZERO",
892+
name: "LoweredZero",
893893
aux: "Int64",
894894
argLength: 2,
895895
reg: regInfo{
896-
inputs: []regMask{buildReg("DI")},
897-
clobbers: buildReg("DI"),
896+
inputs: []regMask{gp},
898897
},
899-
//faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
900-
unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
898+
faultOnNilArg0: true,
899+
},
900+
901+
// arg0 = pointer to start of memory to zero
902+
// arg1 = mem
903+
// auxint = # of bytes to zero
904+
// returns mem
905+
{
906+
name: "LoweredZeroLoop",
907+
aux: "Int64",
908+
argLength: 2,
909+
reg: regInfo{
910+
inputs: []regMask{gp},
911+
clobbersArg0: true,
912+
},
913+
clobberFlags: true,
914+
faultOnNilArg0: true,
915+
needIntTemp: true,
901916
},
902917

903918
// arg0 = address of memory to zero

src/cmd/compile/internal/ssa/opGen.go

Lines changed: 21 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/cmd/compile/internal/ssa/regalloc_test.go

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package ssa
66

77
import (
88
"cmd/compile/internal/types"
9+
"fmt"
910
"testing"
1011
)
1112

@@ -218,10 +219,37 @@ func TestSpillMove2(t *testing.T) {
218219

219220
}
220221

222+
// TestClobbersArg0 checks that regalloc makes exactly one copy of "ptr" when it is
// passed to LoweredZeroLoop (which clobbers its argument) and is used again afterwards.
func TestClobbersArg0(t *testing.T) {
223+
c := testConfig(t)
224+
f := c.Fun("entry",
225+
Bloc("entry",
226+
Valu("mem", OpInitMem, types.TypeMem, 0, nil),
227+
Valu("ptr", OpArg, c.config.Types.Int64.PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo())),
228+
Valu("dst", OpArg, c.config.Types.Int64.PtrTo().PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo().PtrTo())),
229+
Valu("zero", OpAMD64LoweredZeroLoop, types.TypeMem, 256, nil, "ptr", "mem"),
230+
Valu("store", OpAMD64MOVQstore, types.TypeMem, 0, nil, "dst", "ptr", "zero"),
231+
Exit("store")))
232+
flagalloc(f.f)
233+
regalloc(f.f)
234+
checkFunc(f.f)
235+
// LoweredZeroLoop clobbers its argument, so there must be a copy of "ptr" somewhere
236+
// so we still have that value available at "store".
237+
if n := numCopies(f.blocks["entry"]); n != 1 {
238+
fmt.Printf("%s\n", f.f.String())
239+
t.Errorf("got %d copies, want 1", n)
240+
}
241+
}
242+
221243
// numSpills returns the number of OpStoreReg values in b.
func numSpills(b *Block) int {
244+
return numOps(b, OpStoreReg)
245+
}
246+
// numCopies returns the number of OpCopy values in b.
func numCopies(b *Block) int {
247+
return numOps(b, OpCopy)
248+
}
249+
func numOps(b *Block, op Op) int {
222250
n := 0
223251
for _, v := range b.Values {
224-
if v.Op == OpStoreReg {
252+
if v.Op == op {
225253
n++
226254
}
227255
}

src/cmd/compile/internal/ssa/rewrite.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ type deadValueChoice bool
2929
const (
3030
leaveDeadValues deadValueChoice = false
3131
removeDeadValues = true
32+
33+
repZeroThreshold = 1408 // size beyond which we use REP STOS for zeroing
3234
)
3335

3436
// deadcode indicates whether rewrite should try to remove any values that become dead.

0 commit comments

Comments
 (0)