Commit 15d6dbc
cmd/compile: use generated loops instead of DUFFCOPY on arm64
Change-Id: Ic2aa8959b7fc594b86def70b6c2be38badf7970c
Reviewed-on: https://go-review.googlesource.com/c/go/+/679015
Reviewed-by: Keith Randall <[email protected]>
Reviewed-by: David Chase <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Jorropo <[email protected]>
1 parent bca3e98 commit 15d6dbc
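For a sense of the output, here is a sketch of what the new LoweredMoveLoop lowering emits for a 200-byte copy (my reconstruction from the lowering code below, not actual compiler output; register choices are illustrative, with dst in R0 and src in R1, while R23-R25 are the count and temporary registers fixed by the op):

	MOVD	$3, R23              // iteration count: 200 / 64
loop:
	LDP.P	16(R1), (R24, R25)   // each LDP.P/STP.P pair moves 16 bytes and
	STP.P	(R24, R25), 16(R0)   // post-increments src and dst; 4 pairs = 64 bytes
	...                          // (three more LDP.P/STP.P pairs)
	SUB	$1, R23
	CBNZ	R23, loop
	MOVD	(R1), R24            // 8-byte tail: 200 % 64 = 8
	MOVD	R24, (R0)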

File tree

5 files changed: +215 −173 lines

src/cmd/compile/internal/arm64/ssa.go

Lines changed: 162 additions & 34 deletions
@@ -1162,41 +1162,119 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 	// BNE	loop
 	// There's a past-the-end pointer here, any problem with that?
 
-	case ssa.OpARM64DUFFCOPY:
-		p := s.Prog(obj.ADUFFCOPY)
-		p.To.Type = obj.TYPE_MEM
-		p.To.Name = obj.NAME_EXTERN
-		p.To.Sym = ir.Syms.Duffcopy
-		p.To.Offset = v.AuxInt
 	case ssa.OpARM64LoweredMove:
-		// LDP.P	16(R16), (R25, Rtmp)
-		// STP.P	(R25, Rtmp), 16(R17)
-		// CMP	Rarg2, R16
-		// BLE	-3(PC)
-		// arg2 is the address of the last element of src
-		p := s.Prog(arm64.ALDP)
-		p.Scond = arm64.C_XPOST
-		p.From.Type = obj.TYPE_MEM
-		p.From.Reg = arm64.REG_R16
-		p.From.Offset = 16
-		p.To.Type = obj.TYPE_REGREG
-		p.To.Reg = arm64.REG_R25
-		p.To.Offset = int64(arm64.REGTMP)
-		p2 := s.Prog(arm64.ASTP)
-		p2.Scond = arm64.C_XPOST
-		p2.From.Type = obj.TYPE_REGREG
-		p2.From.Reg = arm64.REG_R25
-		p2.From.Offset = int64(arm64.REGTMP)
-		p2.To.Type = obj.TYPE_MEM
-		p2.To.Reg = arm64.REG_R17
-		p2.To.Offset = 16
-		p3 := s.Prog(arm64.ACMP)
-		p3.From.Type = obj.TYPE_REG
-		p3.From.Reg = v.Args[2].Reg()
-		p3.Reg = arm64.REG_R16
-		p4 := s.Prog(arm64.ABLE)
-		p4.To.Type = obj.TYPE_BRANCH
-		p4.To.SetTarget(p)
+		dstReg := v.Args[0].Reg()
+		srcReg := v.Args[1].Reg()
+		if dstReg == srcReg {
+			break
+		}
+		tmpReg1 := int16(arm64.REG_R24)
+		tmpReg2 := int16(arm64.REG_R25)
+		n := v.AuxInt
+		if n < 16 {
+			v.Fatalf("Move too small %d", n)
+		}
+
+		// Generate copying instructions.
+		var off int64
+		for n >= 16 {
+			// LDP	off(srcReg), (tmpReg1, tmpReg2)
+			// STP	(tmpReg1, tmpReg2), off(dstReg)
+			move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
+			off += 16
+			n -= 16
+		}
+		if n > 8 {
+			// MOVD	off(srcReg), tmpReg1
+			// MOVD	tmpReg1, off(dstReg)
+			move8(s, srcReg, dstReg, tmpReg1, off)
+			off += 8
+			n -= 8
+		}
+		if n != 0 {
+			// MOVD	off+n-8(srcReg), tmpReg1
+			// MOVD	tmpReg1, off+n-8(dstReg)
+			move8(s, srcReg, dstReg, tmpReg1, off+n-8)
+		}
+	case ssa.OpARM64LoweredMoveLoop:
+		dstReg := v.Args[0].Reg()
+		srcReg := v.Args[1].Reg()
+		if dstReg == srcReg {
+			break
+		}
+		countReg := int16(arm64.REG_R23)
+		tmpReg1 := int16(arm64.REG_R24)
+		tmpReg2 := int16(arm64.REG_R25)
+		n := v.AuxInt
+		loopSize := int64(64)
+		if n < 3*loopSize {
+			// - a loop count of 0 won't work.
+			// - a loop count of 1 is useless.
+			// - a loop count of 2 is a code size ~tie
+			//     3 instructions to implement the loop
+			//     4 instructions in the loop body
+			//   vs
+			//     8 instructions in the straightline code
+			//   Might as well use straightline code.
+			v.Fatalf("ZeroLoop size too small %d", n)
+		}
+
+		// Put iteration count in a register.
+		//   MOVD	$n, countReg
+		p := s.Prog(arm64.AMOVD)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = n / loopSize
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = countReg
+		cntInit := p
+
+		// Move loopSize bytes starting at srcReg to dstReg.
+		// Increment srcReg and destReg by loopSize as a side effect.
+		for range loopSize / 16 {
+			// LDP.P	16(srcReg), (tmpReg1, tmpReg2)
+			// STP.P	(tmpReg1, tmpReg2), 16(dstReg)
+			move16(s, srcReg, dstReg, tmpReg1, tmpReg2, 0, true)
+		}
+		// Decrement loop count.
+		//   SUB	$1, countReg
+		p = s.Prog(arm64.ASUB)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = 1
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = countReg
+		// Jump to loop header if we're not done yet.
+		//   CBNZ	head
+		p = s.Prog(arm64.ACBNZ)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = countReg
+		p.To.Type = obj.TYPE_BRANCH
+		p.To.SetTarget(cntInit.Link)
+
+		// Multiples of the loop size are now done.
+		n %= loopSize
+
+		// Copy any fractional portion.
+		var off int64
+		for n >= 16 {
+			// LDP	off(srcReg), (tmpReg1, tmpReg2)
+			// STP	(tmpReg1, tmpReg2), off(dstReg)
+			move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
+			off += 16
+			n -= 16
+		}
+		if n > 8 {
+			// MOVD	off(srcReg), tmpReg1
+			// MOVD	tmpReg1, off(dstReg)
+			move8(s, srcReg, dstReg, tmpReg1, off)
+			off += 8
+			n -= 8
+		}
+		if n != 0 {
+			// MOVD	off+n-8(srcReg), tmpReg1
+			// MOVD	tmpReg1, off+n-8(dstReg)
+			move8(s, srcReg, dstReg, tmpReg1, off+n-8)
+		}
+
 	case ssa.OpARM64CALLstatic, ssa.OpARM64CALLclosure, ssa.OpARM64CALLinter:
 		s.Call(v)
 	case ssa.OpARM64CALLtail:
@@ -1599,3 +1677,53 @@ func zero8(s *ssagen.State, reg int16, off int64) {
 	p.To.Reg = reg
 	p.To.Offset = off
 }
+
+// move16 copies 16 bytes at src+off to dst+off.
+// Uses registers tmp1 and tmp2.
+// If postInc is true, increment src and dst by 16.
+func move16(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) {
+	// LDP	off(src), (tmp1, tmp2)
+	ld := s.Prog(arm64.ALDP)
+	ld.From.Type = obj.TYPE_MEM
+	ld.From.Reg = src
+	ld.From.Offset = off
+	ld.To.Type = obj.TYPE_REGREG
+	ld.To.Reg = tmp1
+	ld.To.Offset = int64(tmp2)
+	// STP	(tmp1, tmp2), off(dst)
+	st := s.Prog(arm64.ASTP)
+	st.From.Type = obj.TYPE_REGREG
+	st.From.Reg = tmp1
+	st.From.Offset = int64(tmp2)
+	st.To.Type = obj.TYPE_MEM
+	st.To.Reg = dst
+	st.To.Offset = off
+	if postInc {
+		if off != 0 {
+			panic("can't postinc with non-zero offset")
+		}
+		ld.Scond = arm64.C_XPOST
+		st.Scond = arm64.C_XPOST
+		ld.From.Offset = 16
+		st.To.Offset = 16
+	}
+}
+
+// move8 copies 8 bytes at src+off to dst+off.
+// Uses register tmp.
+func move8(s *ssagen.State, src, dst, tmp int16, off int64) {
+	// MOVD	off(src), tmp
+	ld := s.Prog(arm64.AMOVD)
+	ld.From.Type = obj.TYPE_MEM
+	ld.From.Reg = src
+	ld.From.Offset = off
+	ld.To.Type = obj.TYPE_REG
+	ld.To.Reg = tmp
+	// MOVD	tmp, off(dst)
+	st := s.Prog(arm64.AMOVD)
+	st.From.Type = obj.TYPE_REG
+	st.From.Reg = tmp
+	st.To.Type = obj.TYPE_MEM
+	st.To.Reg = dst
+	st.To.Offset = off
+}
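As a reading aid (not part of the commit): the two addressing modes of move16 differ only in the post-increment. A call like move16(s, src, dst, tmp1, tmp2, 32, false) emits roughly

	LDP	32(src), (tmp1, tmp2)
	STP	(tmp1, tmp2), 32(dst)

while move16(s, src, dst, tmp1, tmp2, 0, true) emits

	LDP.P	16(src), (tmp1, tmp2)
	STP.P	(tmp1, tmp2), 16(dst)

which is why the helper panics on a non-zero offset combined with postInc: LDP.P/STP.P bake the increment into the offset field.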

src/cmd/compile/internal/ssa/_gen/ARM64.rules

Lines changed: 2 additions & 33 deletions
@@ -462,39 +462,8 @@
 		(STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem))
 		(STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem))))
 
-// strip off fractional word move
-(Move [s] dst src mem) && s%16 != 0 && s%16 <= 8 && s > 64 =>
-	(Move [8]
-		(OffPtr <dst.Type> dst [s-8])
-		(OffPtr <src.Type> src [s-8])
-		(Move [s-s%16] dst src mem))
-(Move [s] dst src mem) && s%16 != 0 && s%16 > 8 && s > 64 =>
-	(Move [16]
-		(OffPtr <dst.Type> dst [s-16])
-		(OffPtr <src.Type> src [s-16])
-		(Move [s-s%16] dst src mem))
-
-// medium move uses a duff device
-(Move [s] dst src mem)
-	&& s > 64 && s <= 16*64 && s%16 == 0
-	&& logLargeCopy(v, s) =>
-	(DUFFCOPY [8 * (64 - s/16)] dst src mem)
-// 8 is the number of bytes to encode:
-//
-//	LDP.P	16(R16), (R26, R27)
-//	STP.P	(R26, R27), 16(R17)
-//
-// 64 is number of these blocks. See runtime/duff_arm64.s:duffcopy
-
-// large move uses a loop
-(Move [s] dst src mem)
-	&& s%16 == 0 && s > 16*64
-	&& logLargeCopy(v, s) =>
-	(LoweredMove
-		dst
-		src
-		(ADDconst <src.Type> src [s-16])
-		mem)
+(Move [s] dst src mem) && s > 64 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem)
+(Move [s] dst src mem) && s >= 192 && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem)
 
 // calls
 (StaticCall ...) => (CALLstatic ...)
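To make the new size classes concrete, here is a minimal Go sketch of the selection the two rules encode (the function name and packaging are mine, not the compiler's; copies of at most 64 bytes continue to be handled by the inline Move rules above these):

	// moveLowering reports which lowering a Move of s bytes now gets on arm64.
	func moveLowering(s int64) string {
		switch {
		case s > 64 && s < 192:
			return "LoweredMove" // straight-line LDP/STP pairs, no loop
		case s >= 192:
			return "LoweredMoveLoop" // 64-byte loop body plus unrolled tail
		default:
			return "inline Move rules" // existing small-copy rewrites
		}
	}

The 192-byte crossover mirrors the n < 3*loopSize check in ssa.go: below three iterations of the 64-byte body, straight-line code is no larger than the loop.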

src/cmd/compile/internal/ssa/_gen/ARM64Ops.go

Lines changed: 23 additions & 28 deletions
@@ -144,6 +144,8 @@ func init() {
 		gpspsbg    = gpspg | buildReg("SB")
 		fp         = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
 		callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
+		r24to25    = buildReg("R24 R25")
+		r23to25    = buildReg("R23 R24 R25")
 		rz         = buildReg("ZERO")
 		first16    = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15")
 	)
@@ -568,47 +570,40 @@ func init() {
 			needIntTemp: true,
 		},
 
-		// duffcopy
-		// arg0 = address of dst memory (in R21, changed as side effect)
-		// arg1 = address of src memory (in R20, changed as side effect)
+		// medium copying
+		// arg0 = address of dst memory
+		// arg1 = address of src memory
 		// arg2 = mem
-		// auxint = offset into duffcopy code to start executing
+		// auxint = # of bytes to copy
 		// returns mem
-		// R20, R21 changed as side effect
-		// R16 and R17 may be clobbered by linker trampoline.
 		{
-			name:      "DUFFCOPY",
+			name:      "LoweredMove",
 			aux:       "Int64",
 			argLength: 3,
 			reg: regInfo{
-				inputs:   []regMask{buildReg("R21"), buildReg("R20")},
-				clobbers: buildReg("R16 R17 R20 R21 R26 R30"),
+				inputs:   []regMask{gp &^ r24to25, gp &^ r24to25},
+				clobbers: r24to25, // TODO: figure out needIntTemp x2
 			},
-			//faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
-			//faultOnNilArg1: true,
-			unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
+			faultOnNilArg0: true,
+			faultOnNilArg1: true,
 		},
 
-		// large move
-		// arg0 = address of dst memory (in R17 aka arm64.REGRT2, changed as side effect)
-		// arg1 = address of src memory (in R16 aka arm64.REGRT1, changed as side effect)
-		// arg2 = address of the last element of src
-		// arg3 = mem
+		// large copying
+		// arg0 = address of dst memory
+		// arg1 = address of src memory
+		// arg2 = mem
+		// auxint = # of bytes to copy
 		// returns mem
-		// LDP.P	16(R16), (R25, Rtmp)
-		// STP.P	(R25, Rtmp), 16(R17)
-		// CMP	Rarg2, R16
-		// BLE	-3(PC)
-		// Note: the-end-of-src may be not a valid pointer. it's a problem if it is spilled.
-		// the-end-of-src - 16 is within the area to copy, ok to spill.
 		{
-			name:      "LoweredMove",
-			argLength: 4,
+			name:      "LoweredMoveLoop",
+			aux:       "Int64",
+			argLength: 3,
 			reg: regInfo{
-				inputs:   []regMask{buildReg("R17"), buildReg("R16"), gp &^ buildReg("R25")},
-				clobbers: buildReg("R16 R17 R25"),
+				inputs:       []regMask{gp &^ r23to25, gp &^ r23to25},
+				clobbers:     r23to25, // TODO: figure out needIntTemp x3
+				clobbersArg0: true,
+				clobbersArg1: true,
 			},
-			clobberFlags:   true,
 			faultOnNilArg0: true,
 			faultOnNilArg1: true,
 		},
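A note on the mask arithmetic (my gloss, not from the commit): buildReg returns a bitmask with one bit per register, so gp &^ r24to25 means "any general-purpose register except R24/R25", keeping the op's inputs clear of the registers it clobbers as temporaries. A toy model of the same idea:

	// Hypothetical bit layout: bit i stands for Ri.
	gp := uint64(1)<<30 - 1       // stand-in for the general-purpose set
	r24to25 := uint64(0b11) << 24 // R24 and R25
	inputs := gp &^ r24to25       // any GP register except the temps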

src/cmd/compile/internal/ssa/opGen.go

Lines changed: 17 additions & 15 deletions
Some generated files are not rendered by default.
