Skip to content

Commit 4dac9e0

Browse files
committed
cmd/compile: use generated loops instead of DUFFCOPY on riscv64
MemmoveKnownSize112-4     632.1Mi ± 1%   1288.5Mi ± 0%  +103.85% (p=0.000 n=10)
MemmoveKnownSize128-4     636.1Mi ± 0%   1280.9Mi ± 1%  +101.36% (p=0.000 n=10)
MemmoveKnownSize192-4     645.3Mi ± 0%   1306.9Mi ± 1%  +102.53% (p=0.000 n=10)
MemmoveKnownSize248-4     650.2Mi ± 2%   1312.5Mi ± 1%  +101.87% (p=0.000 n=10)
MemmoveKnownSize256-4     650.7Mi ± 0%   1303.6Mi ± 1%  +100.33% (p=0.000 n=10)
MemmoveKnownSize512-4     658.2Mi ± 1%   1293.9Mi ± 0%   +96.60% (p=0.000 n=10)
MemmoveKnownSize1024-4    662.1Mi ± 0%   1312.6Mi ± 0%   +98.26% (p=0.000 n=10)

Change-Id: I43681ca029880025558b33ddc4295da3947c9b28
Reviewed-on: https://go-review.googlesource.com/c/go/+/700537
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
Reviewed-by: Mark Freeman <[email protected]>
1 parent 879ff73 commit 4dac9e0

File tree

5 files changed

+187
-223
lines changed

5 files changed

+187
-223
lines changed

src/cmd/compile/internal/riscv64/ssa.go

Lines changed: 106 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -822,44 +822,99 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
822822
}
823823

824824
case ssa.OpRISCV64LoweredMove:
825-
mov, sz := largestMove(v.AuxInt)
825+
dst := v.Args[0].Reg()
826+
src := v.Args[1].Reg()
827+
if dst == src {
828+
break
829+
}
826830

827-
// mov (Rarg1), T2
828-
// mov T2, (Rarg0)
829-
// ADD $sz, Rarg0
830-
// ADD $sz, Rarg1
831-
// BGEU Rarg2, Rarg0, -4(PC)
831+
sa := v.AuxValAndOff()
832+
n := sa.Val64()
833+
mov, sz := largestMove(sa.Off64())
832834

833-
p := s.Prog(mov)
834-
p.From.Type = obj.TYPE_MEM
835-
p.From.Reg = v.Args[1].Reg()
835+
var off int64
836+
tmp := int16(riscv.REG_X5)
837+
for n >= sz {
838+
moveOp(s, mov, dst, src, tmp, off)
839+
off += sz
840+
n -= sz
841+
}
842+
843+
for i := len(fracMovOps) - 1; i >= 0; i-- {
844+
tsz := int64(1 << i)
845+
if n < tsz {
846+
continue
847+
}
848+
moveOp(s, fracMovOps[i], dst, src, tmp, off)
849+
off += tsz
850+
n -= tsz
851+
}
852+
853+
case ssa.OpRISCV64LoweredMoveLoop:
854+
dst := v.Args[0].Reg()
855+
src := v.Args[1].Reg()
856+
if dst == src {
857+
break
858+
}
859+
860+
sc := v.AuxValAndOff()
861+
n := sc.Val64()
862+
mov, sz := largestMove(sc.Off64())
863+
chunk := 8 * sz
864+
865+
if n <= 3*chunk {
866+
v.Fatalf("MoveLoop too small:%d, expect:%d", n, 3*chunk)
867+
}
868+
tmp := int16(riscv.REG_X5)
869+
870+
p := s.Prog(riscv.AADD)
871+
p.From.Type = obj.TYPE_CONST
872+
p.From.Offset = n - n%chunk
873+
p.Reg = src
836874
p.To.Type = obj.TYPE_REG
837-
p.To.Reg = riscv.REG_T2
875+
p.To.Reg = riscv.REG_X6
838876

839-
p2 := s.Prog(mov)
840-
p2.From.Type = obj.TYPE_REG
841-
p2.From.Reg = riscv.REG_T2
842-
p2.To.Type = obj.TYPE_MEM
843-
p2.To.Reg = v.Args[0].Reg()
844-
845-
p3 := s.Prog(riscv.AADD)
846-
p3.From.Type = obj.TYPE_CONST
847-
p3.From.Offset = sz
848-
p3.To.Type = obj.TYPE_REG
849-
p3.To.Reg = v.Args[0].Reg()
850-
851-
p4 := s.Prog(riscv.AADD)
852-
p4.From.Type = obj.TYPE_CONST
853-
p4.From.Offset = sz
854-
p4.To.Type = obj.TYPE_REG
855-
p4.To.Reg = v.Args[1].Reg()
877+
for i := int64(0); i < 8; i++ {
878+
moveOp(s, mov, dst, src, tmp, sz*i)
879+
}
856880

857-
p5 := s.Prog(riscv.ABGEU)
858-
p5.To.Type = obj.TYPE_BRANCH
859-
p5.Reg = v.Args[1].Reg()
860-
p5.From.Type = obj.TYPE_REG
861-
p5.From.Reg = v.Args[2].Reg()
862-
p5.To.SetTarget(p)
881+
p1 := s.Prog(riscv.AADD)
882+
p1.From.Type = obj.TYPE_CONST
883+
p1.From.Offset = chunk
884+
p1.To.Type = obj.TYPE_REG
885+
p1.To.Reg = src
886+
887+
p2 := s.Prog(riscv.AADD)
888+
p2.From.Type = obj.TYPE_CONST
889+
p2.From.Offset = chunk
890+
p2.To.Type = obj.TYPE_REG
891+
p2.To.Reg = dst
892+
893+
p3 := s.Prog(riscv.ABNE)
894+
p3.From.Reg = riscv.REG_X6
895+
p3.From.Type = obj.TYPE_REG
896+
p3.Reg = src
897+
p3.To.Type = obj.TYPE_BRANCH
898+
p3.To.SetTarget(p.Link)
899+
900+
n %= chunk
901+
902+
var off int64
903+
for n >= sz {
904+
moveOp(s, mov, dst, src, tmp, off)
905+
off += sz
906+
n -= sz
907+
}
908+
909+
for i := len(fracMovOps) - 1; i >= 0; i-- {
910+
tsz := int64(1 << i)
911+
if n < tsz {
912+
continue
913+
}
914+
moveOp(s, fracMovOps[i], dst, src, tmp, off)
915+
off += tsz
916+
n -= tsz
917+
}
863918

864919
case ssa.OpRISCV64LoweredNilCheck:
865920
// Issue a load which will fault if arg is nil.
@@ -1023,3 +1078,21 @@ func zeroOp(s *ssagen.State, mov obj.As, reg int16, off int64) {
10231078
p.To.Offset = off
10241079
return
10251080
}
1081+
1082+
func moveOp(s *ssagen.State, mov obj.As, dst int16, src int16, tmp int16, off int64) {
1083+
p := s.Prog(mov)
1084+
p.From.Type = obj.TYPE_MEM
1085+
p.From.Reg = src
1086+
p.From.Offset = off
1087+
p.To.Type = obj.TYPE_REG
1088+
p.To.Reg = tmp
1089+
1090+
p1 := s.Prog(mov)
1091+
p1.From.Type = obj.TYPE_REG
1092+
p1.From.Reg = tmp
1093+
p1.To.Type = obj.TYPE_MEM
1094+
p1.To.Reg = dst
1095+
p1.To.Offset = off
1096+
1097+
return
1098+
}

src/cmd/compile/internal/ssa/_gen/RISCV64.rules

Lines changed: 7 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -442,37 +442,16 @@
442442
(MOVHstore [4] dst (MOVHload [4] src mem)
443443
(MOVHstore [2] dst (MOVHload [2] src mem)
444444
(MOVHstore dst (MOVHload src mem) mem)))
445-
(Move [12] {t} dst src mem) && t.Alignment()%4 == 0 =>
446-
(MOVWstore [8] dst (MOVWload [8] src mem)
447-
(MOVWstore [4] dst (MOVWload [4] src mem)
448-
(MOVWstore dst (MOVWload src mem) mem)))
449-
(Move [16] {t} dst src mem) && t.Alignment()%8 == 0 =>
450-
(MOVDstore [8] dst (MOVDload [8] src mem)
451-
(MOVDstore dst (MOVDload src mem) mem))
452-
(Move [24] {t} dst src mem) && t.Alignment()%8 == 0 =>
453-
(MOVDstore [16] dst (MOVDload [16] src mem)
454-
(MOVDstore [8] dst (MOVDload [8] src mem)
455-
(MOVDstore dst (MOVDload src mem) mem)))
456-
(Move [32] {t} dst src mem) && t.Alignment()%8 == 0 =>
457-
(MOVDstore [24] dst (MOVDload [24] src mem)
458-
(MOVDstore [16] dst (MOVDload [16] src mem)
459-
(MOVDstore [8] dst (MOVDload [8] src mem)
460-
(MOVDstore dst (MOVDload src mem) mem))))
461-
462-
// Medium 8-aligned move uses a Duff's device
463-
// 16 and 128 are magic constants, see runtime/mkduff.go
464-
(Move [s] {t} dst src mem)
465-
&& s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0
445+
446+
// Generic move
447+
(Move [s] {t} dst src mem) && s > 0 && s <= 3*8*moveSize(t.Alignment(), config)
466448
&& logLargeCopy(v, s) =>
467-
(DUFFCOPY [16 * (128 - s/8)] dst src mem)
449+
(LoweredMove [makeValAndOff(int32(s),int32(t.Alignment()))] dst src mem)
468450

469451
// Generic move uses a loop
470-
(Move [s] {t} dst src mem) && (s <= 16 || logLargeCopy(v, s)) =>
471-
(LoweredMove [t.Alignment()]
472-
dst
473-
src
474-
(ADDI <src.Type> [s-moveSize(t.Alignment(), config)] src)
475-
mem)
452+
(Move [s] {t} dst src mem) && s > 3*8*moveSize(t.Alignment(), config)
453+
&& logLargeCopy(v, s) =>
454+
(LoweredMoveLoop [makeValAndOff(int32(s),int32(t.Alignment()))] dst src mem)
476455

477456
// Boolean ops; 0=false, 1=true
478457
(AndB ...) => (AND ...)

src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go

Lines changed: 39 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ func init() {
117117

118118
regCtxt := regNamed["X26"]
119119
callerSave := gpMask | fpMask | regNamed["g"]
120+
r5toR6 := regNamed["X5"] | regNamed["X6"]
120121

121122
var (
122123
gpstore = regInfo{inputs: []regMask{gpspsbMask, gpspMask, 0}} // SB in first input so we can load from a global, but not in second to avoid using SB as a temporary register
@@ -354,27 +355,51 @@ func init() {
354355
},
355356

356357
// general unaligned move
357-
// arg0 = address of dst memory (in X5, changed as side effect)
358-
// arg1 = address of src memory (in X6, changed as side effect)
359-
// arg2 = address of the last element of src (can't be X7 as we clobber it before using arg2)
358+
// arg0 = address of dst memory (clobber)
359+
// arg1 = address of src memory (clobber)
360+
// arg2 = mem
361+
// auxint = size and type alignment
362+
// returns mem
363+
// mov (offset)(Rarg1), TMP
364+
// mov TMP, (offset)(Rarg0)
365+
{
366+
name: "LoweredMove",
367+
aux: "SymValAndOff",
368+
symEffect: "Write",
369+
argLength: 3,
370+
reg: regInfo{
371+
inputs: []regMask{gpMask &^ regNamed["X5"], gpMask &^ regNamed["X5"]},
372+
clobbers: regNamed["X5"],
373+
},
374+
faultOnNilArg0: true,
375+
faultOnNilArg1: true,
376+
},
377+
378+
// general unaligned move
379+
// arg0 = address of dst memory (clobber)
380+
// arg1 = address of src memory (clobber)
360381
// arg2 = mem
361382
// auxint = size and type alignment
362-
// clobbers X7 as a tmp register.
363383
// returns mem
364-
// mov (X6), X7
365-
// mov X7, (X5)
366-
// ADD $sz, X5
367384
// ADD $(n - n%(8*sz)), Rarg1, X6
368-
// BGEU Rarg2, X5, -4(PC)
385+
//loop:
386+
// mov (Rarg1), X5
387+
// mov X5, (Rarg0)
388+
// ...rest 7 mov...
389+
// ADD $(8*sz), Rarg0
390+
// ADD $(8*sz), Rarg1
391+
// BNE X6, Rarg1, loop
369392
{
370-
name: "LoweredMove",
371-
aux: "Int64",
372-
argLength: 4,
393+
name: "LoweredMoveLoop",
394+
aux: "SymValAndOff",
395+
argLength: 3,
396+
symEffect: "Write",
373397
reg: regInfo{
374-
inputs: []regMask{regNamed["X5"], regNamed["X6"], gpMask &^ regNamed["X7"]},
375-
clobbers: regNamed["X5"] | regNamed["X6"] | regNamed["X7"],
398+
inputs: []regMask{gpMask &^ r5toR6, gpMask &^ r5toR6},
399+
clobbers: r5toR6,
400+
clobbersArg0: true,
401+
clobbersArg1: true,
376402
},
377-
typ: "Mem",
378403
faultOnNilArg0: true,
379404
faultOnNilArg1: true,
380405
},

src/cmd/compile/internal/ssa/opGen.go

Lines changed: 24 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)