Skip to content

Commit 7bba745

Browse files
limeidan and abner-chenc
authored and committed
cmd/compile: use generated loops instead of DUFFZERO on loong64
Change-Id: Id43ee4353d4bac96627f8b0f54545cdd3d2a1d1b Reviewed-on: https://go-review.googlesource.com/c/go/+/699695 Reviewed-by: Cherry Mui <[email protected]> LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: Carlos Amedee <[email protected]> Reviewed-by: abner chenc <[email protected]>
1 parent 882335e commit 7bba745

File tree

5 files changed

+155
-91
lines changed

5 files changed

+155
-91
lines changed

src/cmd/compile/internal/loong64/ssa.go

Lines changed: 100 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -560,28 +560,97 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
560560
p.To.Sym = ir.Syms.Duffzero
561561
p.To.Offset = v.AuxInt
562562
case ssa.OpLOONG64LoweredZero:
563-
// MOVx R0, (Rarg0)
564-
// ADDV $sz, Rarg0
565-
// BGEU Rarg1, Rarg0, -2(PC)
566-
mov, sz := largestMove(v.AuxInt)
567-
p := s.Prog(mov)
568-
p.From.Type = obj.TYPE_REG
569-
p.From.Reg = loong64.REGZERO
570-
p.To.Type = obj.TYPE_MEM
571-
p.To.Reg = v.Args[0].Reg()
563+
ptrReg := v.Args[0].Reg()
564+
n := v.AuxInt
565+
if n < 16 {
566+
v.Fatalf("Zero too small %d", n)
567+
}
572568

573-
p2 := s.Prog(loong64.AADDVU)
574-
p2.From.Type = obj.TYPE_CONST
575-
p2.From.Offset = sz
576-
p2.To.Type = obj.TYPE_REG
577-
p2.To.Reg = v.Args[0].Reg()
569+
// Generate Zeroing instructions.
570+
var off int64
571+
for n >= 8 {
572+
// MOVV ZR, off(ptrReg)
573+
zero8(s, ptrReg, off)
574+
off += 8
575+
n -= 8
576+
}
577+
if n != 0 {
578+
// MOVV ZR, off+n-8(ptrReg)
579+
zero8(s, ptrReg, off+n-8)
580+
}
581+
case ssa.OpLOONG64LoweredZeroLoop:
582+
ptrReg := v.Args[0].Reg()
583+
countReg := v.RegTmp()
584+
var off int64
585+
n := v.AuxInt
586+
loopSize := int64(64)
587+
if n < 3*loopSize {
588+
// - a loop count of 0 won't work.
589+
// - a loop count of 1 is useless.
590+
// - a loop count of 2 is a code size ~tie
591+
// 4 instructions to implement the loop
592+
// 8 instructions in the loop body
593+
// vs
594+
// 16 instructions in the straightline code
595+
// Might as well use straightline code.
596+
v.Fatalf("ZeroLoop size tool small %d", n)
597+
}
578598

579-
p3 := s.Prog(loong64.ABGEU)
580-
p3.From.Type = obj.TYPE_REG
581-
p3.From.Reg = v.Args[1].Reg()
582-
p3.Reg = v.Args[0].Reg()
583-
p3.To.Type = obj.TYPE_BRANCH
584-
p3.To.SetTarget(p)
599+
// Put iteration count in a register.
600+
// MOVV $n/loopSize, countReg
601+
p := s.Prog(loong64.AMOVV)
602+
p.From.Type = obj.TYPE_CONST
603+
p.From.Offset = n / loopSize
604+
p.To.Type = obj.TYPE_REG
605+
p.To.Reg = countReg
606+
cntInit := p
607+
608+
// Zero loopSize bytes starting at ptrReg.
609+
for range loopSize / 8 {
610+
// MOVV ZR, off(ptrReg)
611+
zero8(s, ptrReg, off)
612+
off += 8
613+
}
614+
615+
// Increment ptrReg by loopSize.
616+
// ADDV $loopSize, ptrReg
617+
p = s.Prog(loong64.AADDV)
618+
p.From.Type = obj.TYPE_CONST
619+
p.From.Offset = loopSize
620+
p.To.Type = obj.TYPE_REG
621+
p.To.Reg = ptrReg
622+
623+
// Decrement loop count.
624+
// SUBV $1, countReg
625+
p = s.Prog(loong64.ASUBV)
626+
p.From.Type = obj.TYPE_CONST
627+
p.From.Offset = 1
628+
p.To.Type = obj.TYPE_REG
629+
p.To.Reg = countReg
630+
631+
// Jump to loop header if we're not done yet.
632+
// BNE countReg, loop header
633+
p = s.Prog(loong64.ABNE)
634+
p.From.Type = obj.TYPE_REG
635+
p.From.Reg = countReg
636+
p.To.Type = obj.TYPE_BRANCH
637+
p.To.SetTarget(cntInit.Link)
638+
639+
// Multiples of the loop size are now done.
640+
n %= loopSize
641+
642+
off = 0
643+
// Write any fractional portion.
644+
for n >= 8 {
645+
// MOVV ZR, off(ptrReg)
646+
zero8(s, ptrReg, off)
647+
off += 8
648+
n -= 8
649+
}
650+
651+
if n != 0 {
652+
zero8(s, ptrReg, off+n-8)
653+
}
585654

586655
case ssa.OpLOONG64DUFFCOPY:
587656
p := s.Prog(obj.ADUFFCOPY)
@@ -1155,3 +1224,14 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
11551224
p.Pos = p.Pos.WithNotStmt()
11561225
return p
11571226
}
1227+
1228+
// zero8 generates one MOVV instruction that stores REGZERO to the 8 bytes at reg+off.
1229+
func zero8(s *ssagen.State, reg int16, off int64) {
1230+
// MOVV ZR, off(reg)
1231+
p := s.Prog(loong64.AMOVV)
1232+
p.From.Type = obj.TYPE_REG
1233+
p.From.Reg = loong64.REGZERO
1234+
p.To.Type = obj.TYPE_MEM
1235+
p.To.Reg = reg
1236+
p.To.Offset = off
1237+
}

src/cmd/compile/internal/ssa/_gen/LOONG64.rules

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -373,24 +373,8 @@
373373
(MOVVstore [8] ptr (MOVVconst [0])
374374
(MOVVstore ptr (MOVVconst [0]) mem))
375375

376-
// strip off fractional word zeroing
377-
(Zero [s] ptr mem) && s%8 != 0 && s > 16 =>
378-
(Zero [s%8]
379-
(OffPtr <ptr.Type> ptr [s-s%8])
380-
(Zero [s-s%8] ptr mem))
381-
382-
// medium zeroing uses a duff device
383-
(Zero [s] ptr mem)
384-
&& s%8 == 0 && s > 16 && s <= 8*128 =>
385-
(DUFFZERO [8 * (128 - s/8)] ptr mem)
386-
387-
// large zeroing uses a loop
388-
(Zero [s] ptr mem)
389-
&& s%8 == 0 && s > 8*128 =>
390-
(LoweredZero
391-
ptr
392-
(ADDVconst <ptr.Type> ptr [s-8])
393-
mem)
376+
(Zero [s] ptr mem) && s > 16 && s < 192 => (LoweredZero [s] ptr mem)
377+
(Zero [s] ptr mem) && s >= 192 => (LoweredZeroLoop [s] ptr mem)
394378

395379
// moves
396380
(Move [0] _ _ mem) => mem

src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,21 @@ func init() {
376376
faultOnNilArg0: true,
377377
},
378378

379+
// medium zeroing
380+
// arg0 = address of memory to zero
381+
// arg1 = mem
382+
// auxint = number of bytes to zero
383+
// returns mem
384+
{
385+
name: "LoweredZero",
386+
aux: "Int64",
387+
argLength: 2,
388+
reg: regInfo{
389+
inputs: []regMask{gp},
390+
},
391+
faultOnNilArg0: true,
392+
},
393+
379394
// duffcopy
380395
// arg0 = address of dst memory (in R21, changed as side effect)
381396
// arg1 = address of src memory (in R20, changed as side effect)
@@ -395,25 +410,21 @@ func init() {
395410
faultOnNilArg1: true,
396411
},
397412

398-
// large or unaligned zeroing
399-
// arg0 = address of memory to zero (in R20, changed as side effect)
400-
// arg1 = address of the last element to zero
401-
// arg2 = mem
402-
// auxint = alignment
413+
// large zeroing
414+
// arg0 = address of memory to zero
415+
// arg1 = mem
416+
// auxint = number of bytes to zero
403417
// returns mem
404-
// MOVx R0, (R20)
405-
// ADDV $sz, R20
406-
// BGEU Rarg1, R20, -2(PC)
407418
{
408-
name: "LoweredZero",
419+
name: "LoweredZeroLoop",
409420
aux: "Int64",
410-
argLength: 3,
421+
argLength: 2,
411422
reg: regInfo{
412-
inputs: []regMask{buildReg("R20"), gp},
413-
clobbers: buildReg("R20"),
423+
inputs: []regMask{gp},
424+
clobbersArg0: true,
414425
},
415-
typ: "Mem",
416426
faultOnNilArg0: true,
427+
needIntTemp: true,
417428
},
418429

419430
// large or unaligned move

src/cmd/compile/internal/ssa/opGen.go

Lines changed: 18 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/cmd/compile/internal/ssa/rewriteLOONG64.go

Lines changed: 11 additions & 34 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)