Skip to content

Commit 28aa529

Browse files
committed
cmd/compile: use generated loops instead of DUFFZERO on arm64
Change-Id: Ie0c8263f36d1bcfd0edfc4ea6710ae6c113c4d48 Reviewed-on: https://go-review.googlesource.com/c/go/+/678995 Reviewed-by: Keith Randall <[email protected]> Reviewed-by: Jorropo <[email protected]> LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: Michael Knyszek <[email protected]>
1 parent ec9e117 commit 28aa529

File tree

5 files changed

+182
-217
lines changed

5 files changed

+182
-217
lines changed

src/cmd/compile/internal/arm64/ssa.go

Lines changed: 143 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1050,33 +1050,118 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
10501050
p.From.Offset = int64(condCode)
10511051
p.To.Type = obj.TYPE_REG
10521052
p.To.Reg = v.Reg()
1053-
case ssa.OpARM64DUFFZERO:
1054-
// runtime.duffzero expects start address in R20
1055-
p := s.Prog(obj.ADUFFZERO)
1056-
p.To.Type = obj.TYPE_MEM
1057-
p.To.Name = obj.NAME_EXTERN
1058-
p.To.Sym = ir.Syms.Duffzero
1059-
p.To.Offset = v.AuxInt
10601053
case ssa.OpARM64LoweredZero:
1061-
// STP.P (ZR,ZR), 16(R16)
1062-
// CMP Rarg1, R16
1063-
// BLE -2(PC)
1064-
// arg1 is the address of the last 16-byte unit to zero
1065-
p := s.Prog(arm64.ASTP)
1066-
p.Scond = arm64.C_XPOST
1067-
p.From.Type = obj.TYPE_REGREG
1068-
p.From.Reg = arm64.REGZERO
1069-
p.From.Offset = int64(arm64.REGZERO)
1070-
p.To.Type = obj.TYPE_MEM
1071-
p.To.Reg = arm64.REG_R16
1072-
p.To.Offset = 16
1073-
p2 := s.Prog(arm64.ACMP)
1074-
p2.From.Type = obj.TYPE_REG
1075-
p2.From.Reg = v.Args[1].Reg()
1076-
p2.Reg = arm64.REG_R16
1077-
p3 := s.Prog(arm64.ABLE)
1078-
p3.To.Type = obj.TYPE_BRANCH
1079-
p3.To.SetTarget(p)
1054+
ptrReg := v.Args[0].Reg()
1055+
n := v.AuxInt
1056+
if n < 16 {
1057+
v.Fatalf("Zero too small %d", n)
1058+
}
1059+
1060+
// Generate zeroing instructions.
1061+
var off int64
1062+
for n >= 16 {
1063+
// STP (ZR, ZR), off(ptrReg)
1064+
zero16(s, ptrReg, off, false)
1065+
off += 16
1066+
n -= 16
1067+
}
1068+
// Write any fractional portion.
1069+
// An overlapping 16-byte write can't be used here
1070+
// because STP's offsets must be a multiple of 8.
1071+
if n > 8 {
1072+
// MOVD ZR, off(ptrReg)
1073+
zero8(s, ptrReg, off)
1074+
off += 8
1075+
n -= 8
1076+
}
1077+
if n != 0 {
1078+
// MOVD ZR, off+n-8(ptrReg)
1079+
// TODO: for n<=4 we could use a smaller write.
1080+
zero8(s, ptrReg, off+n-8)
1081+
}
1082+
case ssa.OpARM64LoweredZeroLoop:
1083+
ptrReg := v.Args[0].Reg()
1084+
countReg := v.RegTmp()
1085+
n := v.AuxInt
1086+
loopSize := int64(64)
1087+
if n < 3*loopSize {
1088+
// - a loop count of 0 won't work.
1089+
// - a loop count of 1 is useless.
1090+
// - a loop count of 2 is a code size ~tie
1091+
// 3 instructions to implement the loop
1092+
// 4 instructions in the loop body
1093+
// vs
1094+
// 8 instructions in the straightline code
1095+
// Might as well use straightline code.
1096+
v.Fatalf("ZeroLoop size too small %d", n)
1097+
}
1098+
1099+
// Put iteration count in a register.
1100+
// MOVD $(n/loopSize), countReg
1101+
p := s.Prog(arm64.AMOVD)
1102+
p.From.Type = obj.TYPE_CONST
1103+
p.From.Offset = n / loopSize
1104+
p.To.Type = obj.TYPE_REG
1105+
p.To.Reg = countReg
1106+
cntInit := p
1107+
1108+
// Zero loopSize bytes starting at ptrReg.
1109+
// Increment ptrReg by loopSize as a side effect.
1110+
for range loopSize / 16 {
1111+
// STP.P (ZR, ZR), 16(ptrReg)
1112+
zero16(s, ptrReg, 0, true)
1113+
// TODO: should we use the postincrement form,
1114+
// or use a separate += 64 instruction?
1115+
// postincrement saves an instruction, but maybe
1116+
// it requires more integer units to do the +=16s.
1117+
}
1118+
// Decrement loop count.
1119+
// SUB $1, countReg
1120+
p = s.Prog(arm64.ASUB)
1121+
p.From.Type = obj.TYPE_CONST
1122+
p.From.Offset = 1
1123+
p.To.Type = obj.TYPE_REG
1124+
p.To.Reg = countReg
1125+
// Jump to loop header if we're not done yet.
1126+
// CBNZ countReg, head
1127+
p = s.Prog(arm64.ACBNZ)
1128+
p.From.Type = obj.TYPE_REG
1129+
p.From.Reg = countReg
1130+
p.To.Type = obj.TYPE_BRANCH
1131+
p.To.SetTarget(cntInit.Link)
1132+
1133+
// Multiples of the loop size are now done.
1134+
n %= loopSize
1135+
1136+
// Write any fractional portion.
1137+
var off int64
1138+
for n >= 16 {
1139+
// STP (ZR, ZR), off(ptrReg)
1140+
zero16(s, ptrReg, off, false)
1141+
off += 16
1142+
n -= 16
1143+
}
1144+
if n > 8 {
1145+
// Note: an overlapping 16-byte write can't be used
1146+
// here because STP's offsets must be a multiple of 8.
1147+
// MOVD ZR, off(ptrReg)
1148+
zero8(s, ptrReg, off)
1149+
off += 8
1150+
n -= 8
1151+
}
1152+
if n != 0 {
1153+
// MOVD ZR, off+n-8(ptrReg)
1154+
// TODO: for n<=4 we could use a smaller write.
1155+
zero8(s, ptrReg, off+n-8)
1156+
}
1157+
// TODO: maybe we should use the count register to instead
1158+
// hold an end pointer and compare against that?
1159+
// ADD $n, ptrReg, endReg
1160+
// then
1161+
// CMP ptrReg, endReg
1162+
// BNE loop
1163+
// There's a past-the-end pointer here, any problem with that?
1164+
10801165
case ssa.OpARM64DUFFCOPY:
10811166
p := s.Prog(obj.ADUFFCOPY)
10821167
p.To.Type = obj.TYPE_MEM
@@ -1482,3 +1567,35 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
14821567
p.Pos = p.Pos.WithNotStmt()
14831568
return p
14841569
}
1570+
1571+
// zero16 emits one STP instruction that stores the zero-register pair (ZR, ZR) to the 16 bytes at reg+off.
1572+
// If postInc is true, the post-indexed form (STP.P) is emitted instead, which also increments reg by 16; off must be 0 in that case, otherwise zero16 panics.
1573+
func zero16(s *ssagen.State, reg int16, off int64, postInc bool) {
1574+
// STP (ZR, ZR), off(reg)
1575+
p := s.Prog(arm64.ASTP)
1576+
p.From.Type = obj.TYPE_REGREG
1577+
p.From.Reg = arm64.REGZERO
// The second register of the (ZR, ZR) pair is passed in From.Offset.
1578+
p.From.Offset = int64(arm64.REGZERO)
1579+
p.To.Type = obj.TYPE_MEM
1580+
p.To.Reg = reg
1581+
p.To.Offset = off
1582+
if postInc {
1583+
// The post-indexed form stores at reg itself, so a non-zero starting offset cannot be expressed.
if off != 0 {
1584+
panic("can't postinc with non-zero offset")
1585+
}
1586+
// STP.P (ZR, ZR), 16(reg)
// In post-indexed mode the offset field is the amount added to reg after the store.
1587+
p.Scond = arm64.C_XPOST
1588+
p.To.Offset = 16
1589+
}
1590+
}
1591+
1592+
// zero8 emits one MOVD instruction that stores the zero register (ZR) to the 8 bytes at reg+off. reg is not modified.
1593+
func zero8(s *ssagen.State, reg int16, off int64) {
1594+
// MOVD ZR, off(reg)
1595+
p := s.Prog(arm64.AMOVD)
1596+
// Source operand: the zero register.
p.From.Type = obj.TYPE_REG
1597+
p.From.Reg = arm64.REGZERO
1598+
// Destination operand: memory at reg+off.
p.To.Type = obj.TYPE_MEM
1599+
p.To.Reg = reg
1600+
p.To.Offset = off
1601+
}

src/cmd/compile/internal/ssa/_gen/ARM64.rules

Lines changed: 2 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -392,44 +392,8 @@
392392
(Zero [16] ptr mem) =>
393393
(STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)
394394

395-
(Zero [32] ptr mem) =>
396-
(STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
397-
(STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))
398-
399-
(Zero [48] ptr mem) =>
400-
(STP [32] ptr (MOVDconst [0]) (MOVDconst [0])
401-
(STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
402-
(STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)))
403-
404-
(Zero [64] ptr mem) =>
405-
(STP [48] ptr (MOVDconst [0]) (MOVDconst [0])
406-
(STP [32] ptr (MOVDconst [0]) (MOVDconst [0])
407-
(STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
408-
(STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))))
409-
410-
// strip off fractional word zeroing
411-
(Zero [s] ptr mem) && s%16 != 0 && s%16 <= 8 && s > 16 =>
412-
(Zero [8]
413-
(OffPtr <ptr.Type> ptr [s-8])
414-
(Zero [s-s%16] ptr mem))
415-
(Zero [s] ptr mem) && s%16 != 0 && s%16 > 8 && s > 16 =>
416-
(Zero [16]
417-
(OffPtr <ptr.Type> ptr [s-16])
418-
(Zero [s-s%16] ptr mem))
419-
420-
// medium zeroing uses a duff device
421-
// 4, 16, and 64 are magic constants, see runtime/mkduff.go
422-
(Zero [s] ptr mem)
423-
&& s%16 == 0 && s > 64 && s <= 16*64 =>
424-
(DUFFZERO [4 * (64 - s/16)] ptr mem)
425-
426-
// large zeroing uses a loop
427-
(Zero [s] ptr mem)
428-
&& s%16 == 0 && s > 16*64 =>
429-
(LoweredZero
430-
ptr
431-
(ADDconst <ptr.Type> [s-16] ptr)
432-
mem)
395+
(Zero [s] ptr mem) && s > 16 && s < 192 => (LoweredZero [s] ptr mem)
396+
(Zero [s] ptr mem) && s >= 192 => (LoweredZeroLoop [s] ptr mem)
433397

434398
// moves
435399
(Move [0] _ _ mem) => mem

src/cmd/compile/internal/ssa/_gen/ARM64Ops.go

Lines changed: 14 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -536,44 +536,36 @@ func init() {
536536
{name: "LessThanNoov", argLength: 1, reg: readflags}, // bool, true flags encode signed x<y but without honoring overflow, false otherwise.
537537
{name: "GreaterEqualNoov", argLength: 1, reg: readflags}, // bool, true flags encode signed x>=y but without honoring overflow, false otherwise.
538538

539-
// duffzero
539+
// medium zeroing
540540
// arg0 = address of memory to zero
541541
// arg1 = mem
542-
// auxint = offset into duffzero code to start executing
542+
// auxint = # of bytes to zero
543543
// returns mem
544-
// R20 changed as side effect
545-
// R16 and R17 may be clobbered by linker trampoline.
546544
{
547-
name: "DUFFZERO",
545+
name: "LoweredZero",
548546
aux: "Int64",
549547
argLength: 2,
550548
reg: regInfo{
551-
inputs: []regMask{buildReg("R20")},
552-
clobbers: buildReg("R16 R17 R20 R30"),
549+
inputs: []regMask{gp},
553550
},
554-
//faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
555-
unsafePoint: true, // FP maintenance around DUFFZERO can be clobbered by interrupts
551+
faultOnNilArg0: true,
556552
},
557553

558554
// large zeroing
559-
// arg0 = address of memory to zero (in R16 aka arm64.REGRT1, changed as side effect)
560-
// arg1 = address of the last 16-byte unit to zero
561-
// arg2 = mem
555+
// arg0 = address of memory to zero
556+
// arg1 = mem
557+
// auxint = # of bytes to zero
562558
// returns mem
563-
// STP.P (ZR,ZR), 16(R16)
564-
// CMP Rarg1, R16
565-
// BLE -2(PC)
566-
// Note: the-end-of-the-memory may be not a valid pointer. it's a problem if it is spilled.
567-
// the-end-of-the-memory - 16 is with the area to zero, ok to spill.
568559
{
569-
name: "LoweredZero",
570-
argLength: 3,
560+
name: "LoweredZeroLoop",
561+
aux: "Int64",
562+
argLength: 2,
571563
reg: regInfo{
572-
inputs: []regMask{buildReg("R16"), gp},
573-
clobbers: buildReg("R16"),
564+
inputs: []regMask{gp},
565+
clobbersArg0: true,
574566
},
575-
clobberFlags: true,
576567
faultOnNilArg0: true,
568+
needIntTemp: true,
577569
},
578570

579571
// duffcopy

src/cmd/compile/internal/ssa/opGen.go

Lines changed: 12 additions & 13 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)