Skip to content

Commit 879ff73

Browse files
committed
cmd/compile: use generated loops instead of DUFFZERO on riscv64
MemclrKnownSize112-4 5.602Gi ± 0% 5.601Gi ± 0% ~ (p=0.363 n=10) MemclrKnownSize128-4 6.933Gi ± 1% 6.545Gi ± 1% -5.59% (p=0.000 n=10) MemclrKnownSize192-4 8.055Gi ± 1% 7.804Gi ± 0% -3.12% (p=0.000 n=10) MemclrKnownSize248-4 8.489Gi ± 0% 8.718Gi ± 0% +2.69% (p=0.000 n=10) MemclrKnownSize256-4 8.762Gi ± 0% 8.763Gi ± 0% ~ (p=0.494 n=10) MemclrKnownSize512-4 9.514Gi ± 1% 9.514Gi ± 0% ~ (p=0.529 n=10) MemclrKnownSize1024-4 9.940Gi ± 0% 9.939Gi ± 1% ~ (p=0.989 n=10) ClearFat3-4 1.300Gi ± 0% 1.301Gi ± 0% ~ (p=0.447 n=10) ClearFat4-4 3.902Gi ± 0% 3.902Gi ± 0% ~ (p=0.971 n=10) ClearFat5-4 665.8Mi ± 0% 1331.5Mi ± 0% +100.01% (p=0.000 n=10) ClearFat6-4 665.8Mi ± 0% 1330.5Mi ± 0% +99.82% (p=0.000 n=10) ClearFat7-4 490.7Mi ± 0% 1331.9Mi ± 0% +171.45% (p=0.000 n=10) ClearFat8-4 5.201Gi ± 0% 5.202Gi ± 0% ~ (p=0.123 n=10) ClearFat9-4 856.1Mi ± 0% 1331.6Mi ± 0% +55.54% (p=0.000 n=10) ClearFat10-4 887.8Mi ± 0% 1331.9Mi ± 0% +50.03% (p=0.000 n=10) ClearFat11-4 915.3Mi ± 0% 1331.1Mi ± 0% +45.42% (p=0.000 n=10) ClearFat12-4 5.202Gi ± 0% 5.202Gi ± 0% ~ (p=0.481 n=10) ClearFat13-4 961.5Mi ± 0% 1331.8Mi ± 0% +38.50% (p=0.000 n=10) ClearFat14-4 981.0Mi ± 0% 1331.8Mi ± 0% +35.76% (p=0.000 n=10) ClearFat15-4 951.3Mi ± 0% 1331.4Mi ± 0% +39.96% (p=0.000 n=10) ClearFat16-4 1.600Gi ± 0% 5.202Gi ± 0% +225.10% (p=0.000 n=10) ClearFat18-4 1.018Gi ± 0% 1.300Gi ± 0% +27.77% (p=0.000 n=10) ClearFat20-4 2.601Gi ± 0% 4.938Gi ± 12% +89.87% (p=0.000 n=10) ClearFat24-4 2.601Gi ± 0% 5.201Gi ± 0% +99.96% (p=0.000 n=10) ClearFat32-4 1.982Gi ± 0% 5.203Gi ± 0% +162.55% (p=0.000 n=10) ClearFat40-4 3.467Gi ± 0% 4.338Gi ± 0% +25.11% (p=0.000 n=10) ClearFat48-4 3.671Gi ± 0% 5.201Gi ± 0% +41.69% (p=0.000 n=10) ClearFat56-4 3.640Gi ± 0% 5.201Gi ± 0% +42.88% (p=0.000 n=10) ClearFat64-4 2.250Gi ± 0% 5.202Gi ± 0% +131.25% (p=0.000 n=10) ClearFat72-4 4.064Gi ± 0% 5.201Gi ± 0% +27.97% (p=0.000 n=10) ClearFat128-4 4.496Gi ± 0% 5.203Gi ± 0% +15.71% (p=0.000 n=10) ClearFat256-4 4.756Gi ± 0% 5.201Gi ± 0% +9.36% (p=0.000 n=10) 
ClearFat512-4 2.512Gi ± 0% 5.201Gi ± 0% +107.03% (p=0.000 n=10) ClearFat1024-4 4.255Gi ± 0% 5.202Gi ± 0% +22.26% (p=0.000 n=10) ClearFat1032-4 4.260Gi ± 0% 5.201Gi ± 0% +22.09% (p=0.000 n=10) ClearFat1040-4 4.285Gi ± 1% 5.203Gi ± 0% +21.41% (p=0.000 n=10) geomean 2.005Gi 3.020Gi +50.58% Change-Id: Iea1da734ff8eaf1b5a2822ae2bdb7f4fd9b65651 Reviewed-on: https://go-review.googlesource.com/c/go/+/699635 Reviewed-by: Mark Ryan <[email protected]> Reviewed-by: Keith Randall <[email protected]> Reviewed-by: Keith Randall <[email protected]> LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: Mark Freeman <[email protected]>
1 parent 77643dc commit 879ff73

File tree

5 files changed

+151
-175
lines changed

5 files changed

+151
-175
lines changed

src/cmd/compile/internal/riscv64/ssa.go

Lines changed: 84 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,8 @@ func largestMove(alignment int64) (obj.As, int64) {
181181
}
182182
}
183183

184+
// fracMovOps lists the move instructions used for the tail of a zeroing
// sequence, indexed by log2 of the access width: fracMovOps[i] is paired
// with a size of 1<<i bytes by its users.
var fracMovOps = []obj.As{riscv.AMOVB, riscv.AMOVH, riscv.AMOVW, riscv.AMOV}
185+
184186
// ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
185187
// RISC-V has no flags, so this is a no-op.
186188
func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {}
@@ -738,30 +740,86 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
738740
p.RegTo2 = riscv.REG_ZERO
739741

740742
case ssa.OpRISCV64LoweredZero:
741-
mov, sz := largestMove(v.AuxInt)
743+
ptr := v.Args[0].Reg()
744+
sc := v.AuxValAndOff()
745+
n := sc.Val64()
746+
747+
mov, sz := largestMove(sc.Off64())
748+
749+
// mov ZERO, (offset)(Rarg0)
750+
var off int64
751+
for n >= sz {
752+
zeroOp(s, mov, ptr, off)
753+
off += sz
754+
n -= sz
755+
}
742756

743-
// mov ZERO, (Rarg0)
744-
// ADD $sz, Rarg0
745-
// BGEU Rarg1, Rarg0, -2(PC)
757+
for i := len(fracMovOps) - 1; i >= 0; i-- {
758+
tsz := int64(1 << i)
759+
if n < tsz {
760+
continue
761+
}
762+
zeroOp(s, fracMovOps[i], ptr, off)
763+
off += tsz
764+
n -= tsz
765+
}
746766

747-
p := s.Prog(mov)
748-
p.From.Type = obj.TYPE_REG
749-
p.From.Reg = riscv.REG_ZERO
750-
p.To.Type = obj.TYPE_MEM
751-
p.To.Reg = v.Args[0].Reg()
767+
case ssa.OpRISCV64LoweredZeroLoop:
768+
ptr := v.Args[0].Reg()
769+
sc := v.AuxValAndOff()
770+
n := sc.Val64()
771+
mov, sz := largestMove(sc.Off64())
772+
chunk := 8 * sz
773+
774+
if n <= 3*chunk {
775+
v.Fatalf("ZeroLoop too small:%d, expect:%d", n, 3*chunk)
776+
}
777+
778+
tmp := v.RegTmp()
779+
780+
p := s.Prog(riscv.AADD)
781+
p.From.Type = obj.TYPE_CONST
782+
p.From.Offset = n - n%chunk
783+
p.Reg = ptr
784+
p.To.Type = obj.TYPE_REG
785+
p.To.Reg = tmp
786+
787+
for i := int64(0); i < 8; i++ {
788+
zeroOp(s, mov, ptr, sz*i)
789+
}
752790

753791
p2 := s.Prog(riscv.AADD)
754792
p2.From.Type = obj.TYPE_CONST
755-
p2.From.Offset = sz
793+
p2.From.Offset = chunk
756794
p2.To.Type = obj.TYPE_REG
757-
p2.To.Reg = v.Args[0].Reg()
795+
p2.To.Reg = ptr
758796

759-
p3 := s.Prog(riscv.ABGEU)
760-
p3.To.Type = obj.TYPE_BRANCH
761-
p3.Reg = v.Args[0].Reg()
797+
p3 := s.Prog(riscv.ABNE)
798+
p3.From.Reg = tmp
762799
p3.From.Type = obj.TYPE_REG
763-
p3.From.Reg = v.Args[1].Reg()
764-
p3.To.SetTarget(p)
800+
p3.Reg = ptr
801+
p3.To.Type = obj.TYPE_BRANCH
802+
p3.To.SetTarget(p.Link)
803+
804+
n %= chunk
805+
806+
// mov ZERO, (offset)(Rarg0)
807+
var off int64
808+
for n >= sz {
809+
zeroOp(s, mov, ptr, off)
810+
off += sz
811+
n -= sz
812+
}
813+
814+
for i := len(fracMovOps) - 1; i >= 0; i-- {
815+
tsz := int64(1 << i)
816+
if n < tsz {
817+
continue
818+
}
819+
zeroOp(s, fracMovOps[i], ptr, off)
820+
off += tsz
821+
n -= tsz
822+
}
765823

766824
case ssa.OpRISCV64LoweredMove:
767825
mov, sz := largestMove(v.AuxInt)
@@ -955,3 +1013,13 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
9551013
p.Pos = p.Pos.WithNotStmt()
9561014
return p
9571015
}
1016+
1017+
// zeroOp emits one instruction mov whose source is the zero register
// (riscv.REG_ZERO) and whose destination is the memory operand off(reg),
// i.e. it stores zero at offset off from the address held in reg.
func zeroOp(s *ssagen.State, mov obj.As, reg int16, off int64) {
1018+
p := s.Prog(mov)
1019+
p.From.Type = obj.TYPE_REG
1020+
p.From.Reg = riscv.REG_ZERO
1021+
p.To.Type = obj.TYPE_MEM
1022+
p.To.Reg = reg
1023+
p.To.Offset = off
1024+
return
1025+
}

src/cmd/compile/internal/ssa/_gen/RISCV64.rules

Lines changed: 6 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -373,36 +373,14 @@
373373
(MOVHstore [4] ptr (MOVDconst [0])
374374
(MOVHstore [2] ptr (MOVDconst [0])
375375
(MOVHstore ptr (MOVDconst [0]) mem)))
376-
(Zero [12] {t} ptr mem) && t.Alignment()%4 == 0 =>
377-
(MOVWstore [8] ptr (MOVDconst [0])
378-
(MOVWstore [4] ptr (MOVDconst [0])
379-
(MOVWstore ptr (MOVDconst [0]) mem)))
380-
(Zero [16] {t} ptr mem) && t.Alignment()%8 == 0 =>
381-
(MOVDstore [8] ptr (MOVDconst [0])
382-
(MOVDstore ptr (MOVDconst [0]) mem))
383-
(Zero [24] {t} ptr mem) && t.Alignment()%8 == 0 =>
384-
(MOVDstore [16] ptr (MOVDconst [0])
385-
(MOVDstore [8] ptr (MOVDconst [0])
386-
(MOVDstore ptr (MOVDconst [0]) mem)))
387-
(Zero [32] {t} ptr mem) && t.Alignment()%8 == 0 =>
388-
(MOVDstore [24] ptr (MOVDconst [0])
389-
(MOVDstore [16] ptr (MOVDconst [0])
390-
(MOVDstore [8] ptr (MOVDconst [0])
391-
(MOVDstore ptr (MOVDconst [0]) mem))))
392-
393-
// Medium 8-aligned zeroing uses a Duff's device
394-
// 8 and 128 are magic constants, see runtime/mkduff.go
395-
(Zero [s] {t} ptr mem)
396-
&& s%8 == 0 && s <= 8*128
397-
&& t.Alignment()%8 == 0 =>
398-
(DUFFZERO [8 * (128 - s/8)] ptr mem)
376+
377+
// Unroll zeroing for medium sizes (at most 192 bytes, i.e. 3 cache lines)
378+
(Zero [s] {t} ptr mem) && s <= 24*moveSize(t.Alignment(), config) =>
379+
(LoweredZero [makeValAndOff(int32(s),int32(t.Alignment()))] ptr mem)
399380

400381
// Generic zeroing uses a loop
401-
(Zero [s] {t} ptr mem) =>
402-
(LoweredZero [t.Alignment()]
403-
ptr
404-
(ADD <ptr.Type> ptr (MOVDconst [s-moveSize(t.Alignment(), config)]))
405-
mem)
382+
(Zero [s] {t} ptr mem) && s > 24*moveSize(t.Alignment(), config) =>
383+
(LoweredZeroLoop [makeValAndOff(int32(s),int32(t.Alignment()))] ptr mem)
406384

407385
// Checks
408386
(IsNonNil ...) => (SNEZ ...)

src/cmd/compile/internal/ssa/_gen/RISCV64Ops.go

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -317,25 +317,40 @@ func init() {
317317

318318
// Generic moves and zeros
319319

320-
// general unaligned zeroing
321-
// arg0 = address of memory to zero (in X5, changed as side effect)
322-
// arg1 = address of the last element to zero (inclusive)
323-
// arg2 = mem
324-
// auxint = element size
320+
// general unrolled zeroing
321+
// arg0 = address of memory to zero
322+
// arg1 = mem
323+
// auxint = ValAndOff: Val is the number of bytes to zero, Off is the type alignment
325324
// returns mem
326-
// mov ZERO, (X5)
327-
// ADD $sz, X5
328-
// BGEU Rarg1, X5, -2(PC)
325+
// mov ZERO, (OFFSET)(Rarg0)
329326
{
330-
name: "LoweredZero",
331-
aux: "Int64",
332-
argLength: 3,
327+
name: "LoweredZero",
328+
aux: "SymValAndOff",
329+
typ: "Mem",
330+
argLength: 2,
331+
symEffect: "Write",
332+
faultOnNilArg0: true,
333333
reg: regInfo{
334-
inputs: []regMask{regNamed["X5"], gpMask},
335-
clobbers: regNamed["X5"],
334+
inputs: []regMask{gpMask},
336335
},
336+
},
337+
// general zeroing using a loop
338+
// arg0 = address of memory to zero (clobbered)
339+
// arg1 = mem
340+
// auxint = ValAndOff: Val is the number of bytes to zero, Off is the type alignment
341+
// returns mem
342+
{
343+
name: "LoweredZeroLoop",
344+
aux: "SymValAndOff",
337345
typ: "Mem",
346+
argLength: 2,
347+
symEffect: "Write",
348+
needIntTemp: true,
338349
faultOnNilArg0: true,
350+
reg: regInfo{
351+
inputs: []regMask{gpMask},
352+
clobbersArg0: true,
353+
},
339354
},
340355

341356
// general unaligned move

src/cmd/compile/internal/ssa/opGen.go

Lines changed: 19 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)