Skip to content

Commit e071617

Browse files
sophie-zhao and gopherbot
authored and committed
cmd/compile: optimize multiplication rules on loong64
Improve multiplication strength reduction, refer to CL 626998, add additional 3 linear combination instructions for loong64. goos: linux goarch: loong64 pkg: cmd/compile/internal/test cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | MulconstI32/3 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstI32/5 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstI32/12 1.601n ± 0% 1.201n ± 0% -24.98% (p=0.000 n=10) MulconstI32/120 1.6010n ± 0% 0.8130n ± 0% -49.22% (p=0.000 n=10) MulconstI32/-120 1.6010n ± 0% 0.8109n ± 0% -49.35% (p=0.000 n=10) MulconstI32/65537 1.6275n ± 0% 0.8005n ± 0% -50.81% (p=0.000 n=10) MulconstI32/65538 1.6290n ± 0% 0.8004n ± 0% -50.87% (p=0.000 n=10) MulconstI64/3 1.6010n ± 0% 0.8004n ± 0% -50.01% (p=0.000 n=10) MulconstI64/5 1.6010n ± 0% 0.8004n ± 0% -50.01% (p=0.000 n=10) MulconstI64/12 1.601n ± 0% 1.201n ± 0% -24.98% (p=0.000 n=10) MulconstI64/120 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstI64/-120 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstI64/65537 1.6270n ± 0% 0.8005n ± 0% -50.80% (p=0.000 n=10) MulconstI64/65538 1.6290n ± 0% 0.8071n ± 1% -50.45% (p=0.000 n=10) MulconstU32/3 1.6010n ± 0% 0.8004n ± 0% -50.01% (p=0.000 n=10) MulconstU32/5 1.6010n ± 0% 0.8004n ± 0% -50.01% (p=0.000 n=10) MulconstU32/12 1.601n ± 0% 1.201n ± 0% -24.98% (p=0.000 n=10) MulconstU32/120 1.6010n ± 0% 0.8066n ± 0% -49.62% (p=0.000 n=10) MulconstU32/65537 1.6290n ± 0% 0.8005n ± 0% -50.86% (p=0.000 n=10) MulconstU32/65538 1.6280n ± 0% 0.8005n ± 0% -50.83% (p=0.000 n=10) MulconstU64/3 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstU64/5 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstU64/12 1.601n ± 0% 1.201n ± 0% -24.98% (p=0.000 n=10) MulconstU64/120 1.6010n ± 0% 0.8005n ± 0% -50.00% (p=0.000 n=10) MulconstU64/65537 1.6290n ± 0% 0.8005n ± 0% -50.86% (p=0.000 n=10) MulconstU64/65538 1.6300n ± 0% 0.8067n ± 0% -50.51% (p=0.000 n=10) geomean 1.609n 0.8537n -46.95% goos: linux 
goarch: loong64 pkg: cmd/compile/internal/test cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | MulconstI32/3 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstI32/5 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstI32/12 1.601n ± 0% 1.202n ± 0% -24.92% (p=0.000 n=10) MulconstI32/120 1.6020n ± 0% 0.8012n ± 0% -49.99% (p=0.000 n=10) MulconstI32/-120 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstI32/65537 1.6020n ± 0% 0.8007n ± 0% -50.02% (p=0.000 n=10) MulconstI32/65538 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstI64/3 1.6015n ± 0% 0.8007n ± 0% -50.00% (p=0.000 n=10) MulconstI64/5 1.6020n ± 0% 0.8007n ± 0% -50.02% (p=0.000 n=10) MulconstI64/12 1.602n ± 0% 1.202n ± 0% -25.00% (p=0.000 n=10) MulconstI64/120 1.6030n ± 0% 0.8011n ± 0% -50.02% (p=0.000 n=10) MulconstI64/-120 1.6020n ± 0% 0.8007n ± 0% -50.02% (p=0.000 n=10) MulconstI64/65537 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstI64/65538 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstU32/3 1.6010n ± 0% 0.8006n ± 0% -49.99% (p=0.000 n=10) MulconstU32/5 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstU32/12 1.601n ± 0% 1.202n ± 0% -24.92% (p=0.000 n=10) MulconstU32/120 1.6010n ± 0% 0.8006n ± 0% -49.99% (p=0.000 n=10) MulconstU32/65537 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstU32/65538 1.6020n ± 0% 0.8009n ± 0% -50.01% (p=0.000 n=10) MulconstU64/3 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstU64/5 1.6010n ± 0% 0.8007n ± 0% -49.98% (p=0.000 n=10) MulconstU64/12 1.601n ± 0% 1.201n ± 0% -24.98% (p=0.000 n=10) MulconstU64/120 1.6020n ± 0% 0.8007n ± 0% -50.02% (p=0.000 n=10) MulconstU64/65537 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) MulconstU64/65538 1.6010n ± 0% 0.8007n ± 0% -49.99% (p=0.000 n=10) geomean 1.601n 0.8523n -46.77% Change-Id: I9fb0e47ca57875da171a347bf4828adfab41b875 Reviewed-on: https://go-review.googlesource.com/c/go/+/675455 Reviewed-by: Mark Freeman <[email 
protected]> Reviewed-by: abner chenc <[email protected]> Reviewed-by: Keith Randall <[email protected]> Reviewed-by: Keith Randall <[email protected]> LUCI-TryBot-Result: Go LUCI <[email protected]> Auto-Submit: Keith Randall <[email protected]>
1 parent eb7f515 commit e071617

File tree

7 files changed

+190
-25
lines changed

7 files changed

+190
-25
lines changed

src/cmd/compile/internal/ssa/_gen/LOONG64.rules

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -750,10 +750,10 @@
750750
(SRLVconst [rc] (MOVBUreg x)) && rc >= 8 => (MOVVconst [0])
751751

752752
// mul by constant
753-
(MULV x (MOVVconst [-1])) => (NEGV x)
754753
(MULV _ (MOVVconst [0])) => (MOVVconst [0])
755754
(MULV x (MOVVconst [1])) => x
756-
(MULV x (MOVVconst [c])) && isPowerOfTwo(c) => (SLLVconst [log64(c)] x)
755+
756+
(MULV x (MOVVconst [c])) && canMulStrengthReduce(config, c) => {mulStrengthReduce(v, x, c)}
757757

758758
// div by constant
759759
(DIVVU x (MOVVconst [1])) => x
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// Prefer addition when shifting left by one.
6+
(SLLVconst [1] x) => (ADDV x x)

src/cmd/compile/internal/ssa/config.go

Lines changed: 83 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,8 @@ func NewConfig(arch string, types Types, ctxt *obj.Link, optimize, softfloat boo
283283
c.RegSize = 8
284284
c.lowerBlock = rewriteBlockLOONG64
285285
c.lowerValue = rewriteValueLOONG64
286+
c.lateLowerBlock = rewriteBlockLOONG64latelower
287+
c.lateLowerValue = rewriteValueLOONG64latelower
286288
c.registers = registersLOONG64[:]
287289
c.gpRegMask = gpRegMaskLOONG64
288290
c.fpRegMask = fpRegMaskLOONG64
@@ -562,6 +564,43 @@ func (c *Config) buildRecipes(arch string) {
562564
return m.Block.NewValue2I(m.Pos, OpARM64SUBshiftLL, m.Type, int64(i), x, y)
563565
})
564566
}
567+
case "loong64":
568+
// - multiply is 4 cycles.
569+
// - add/sub/shift are 1 cycle.
570+
// On loong64, a multiply also requires first loading the constant into a register.
571+
// TODO: figure out a happy medium.
572+
mulCost = 45
573+
574+
// add
575+
r(1, 1, 10,
576+
func(m, x, y *Value) *Value {
577+
return m.Block.NewValue2(m.Pos, OpLOONG64ADDV, m.Type, x, y)
578+
})
579+
// neg
580+
r(-1, 0, 10,
581+
func(m, x, y *Value) *Value {
582+
return m.Block.NewValue1(m.Pos, OpLOONG64NEGV, m.Type, x)
583+
})
584+
// sub
585+
r(1, -1, 10,
586+
func(m, x, y *Value) *Value {
587+
return m.Block.NewValue2(m.Pos, OpLOONG64SUBV, m.Type, x, y)
588+
})
589+
590+
// regular shifts
591+
for i := 1; i < 64; i++ {
592+
c := 10
593+
if i == 1 {
594+
// Prefer x<<1 over x+x.
595+
// Note that we eventually reverse this decision in LOONG64latelower.rules,
596+
// but this makes shift combining rules in LOONG64.rules simpler.
597+
c--
598+
}
599+
r(1<<i, 0, c,
600+
func(m, x, y *Value) *Value {
601+
return m.Block.NewValue1I(m.Pos, OpLOONG64SLLVconst, m.Type, int64(i), x)
602+
})
603+
}
565604
}
566605

567606
c.mulRecipes = map[int64]mulRecipe{}
@@ -628,17 +667,58 @@ func (c *Config) buildRecipes(arch string) {
628667
}
629668
}
630669

670+
// Currently we only generate recipes made of 3 linear combination instructions for loong64.
671+
if arch == "loong64" {
672+
// Three-instruction recipes.
673+
// D: The first and the second are both single-instruction recipes applied to the original value, and both are inputs to the third.
674+
// E: The first single-instruction recipe is an input to the second, and the second is an input to the third.
675+
676+
// D
677+
for _, first := range linearCombos {
678+
for _, second := range linearCombos {
679+
for _, third := range linearCombos {
680+
x := third.a*(first.a+first.b) + third.b*(second.a+second.b)
681+
cost := first.cost + second.cost + third.cost
682+
old := c.mulRecipes[x]
683+
if (old.build == nil || cost < old.cost) && cost < mulCost {
684+
c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
685+
v1 := first.build(m, v, v)
686+
v2 := second.build(m, v, v)
687+
return third.build(m, v1, v2)
688+
}}
689+
}
690+
}
691+
}
692+
}
693+
694+
// E
695+
for _, first := range linearCombos {
696+
for _, second := range linearCombos {
697+
for _, third := range linearCombos {
698+
x := third.a*(second.a*(first.a+first.b)+second.b) + third.b
699+
cost := first.cost + second.cost + third.cost
700+
old := c.mulRecipes[x]
701+
if (old.build == nil || cost < old.cost) && cost < mulCost {
702+
c.mulRecipes[x] = mulRecipe{cost: cost, build: func(m, v *Value) *Value {
703+
v1 := first.build(m, v, v)
704+
v2 := second.build(m, v1, v)
705+
return third.build(m, v2, v)
706+
}}
707+
}
708+
}
709+
}
710+
}
711+
}
712+
631713
// These cases should be handled specially by rewrite rules.
632714
// (Otherwise v * 1 == (neg (neg v)))
633715
delete(c.mulRecipes, 0)
634716
delete(c.mulRecipes, 1)
635717

636-
// Currently we assume that it doesn't help to do 3 linear
637-
// combination instructions.
638-
639718
// Currently:
640719
// len(c.mulRecipes) == 5984 on arm64
641720
// 680 on amd64
721+
// 5984 on loong64
642722
// This function takes ~2.5ms on arm64.
643723
//println(len(c.mulRecipes))
644724
}

src/cmd/compile/internal/ssa/rewriteLOONG64.go

Lines changed: 6 additions & 20 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/cmd/compile/internal/ssa/rewriteLOONG64latelower.go

Lines changed: 29 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

test/codegen/arithmetic.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,13 +228,15 @@ func Pow2Muls(n1, n2 int) (int, int) {
228228
// 386:"SHLL\t[$]5",-"IMULL"
229229
// arm:"SLL\t[$]5",-"MUL"
230230
// arm64:"LSL\t[$]5",-"MUL"
231+
// loong64:"SLLV\t[$]5",-"MULV"
231232
// ppc64x:"SLD\t[$]5",-"MUL"
232233
a := n1 * 32
233234

234235
// amd64:"SHLQ\t[$]6",-"IMULQ"
235236
// 386:"SHLL\t[$]6",-"IMULL"
236237
// arm:"SLL\t[$]6",-"MUL"
237238
// arm64:`NEG\sR[0-9]+<<6,\sR[0-9]+`,-`LSL`,-`MUL`
239+
// loong64:"SLLV\t[$]6",-"MULV"
238240
// ppc64x:"SLD\t[$]6","NEG\\sR[0-9]+,\\sR[0-9]+",-"MUL"
239241
b := -64 * n2
240242

@@ -255,11 +257,13 @@ func Mul_96(n int) int {
255257
// 386:`SHLL\t[$]5`,`LEAL\t\(.*\)\(.*\*2\),`,-`IMULL`
256258
// arm64:`LSL\t[$]5`,`ADD\sR[0-9]+<<1,\sR[0-9]+`,-`MUL`
257259
// arm:`SLL\t[$]5`,`ADD\sR[0-9]+<<1,\sR[0-9]+`,-`MUL`
260+
// loong64:"ADDVU","SLLV\t[$]5",-"MULV"
258261
// s390x:`SLD\t[$]5`,`SLD\t[$]6`,-`MULLD`
259262
return n * 96
260263
}
261264

262265
func Mul_n120(n int) int {
266+
// loong64:"SLLV\t[$]3","SLLV\t[$]7","SUBVU",-"MULV"
263267
// s390x:`SLD\t[$]3`,`SLD\t[$]7`,-`MULLD`
264268
return n * -120
265269
}

0 commit comments

Comments
 (0)