Skip to content

Commit b6cf1d9

Browse files
HeliC829, gopherbot
authored and committed
runtime: optimize memclr on mips64x
Memclr/5-4 49.94n ± 5% 50.51n ± 1% ~ (p=0.331 n=6)
Memclr/16-4 22.71n ± 0% 21.01n ± 2% -7.47% (p=0.002 n=6)
Memclr/64-4 49.70n ± 1% 26.09n ± 1% -47.51% (p=0.002 n=6)
Memclr/256-4 84.23n ± 3% 44.32n ± 2% -47.38% (p=0.002 n=6)
Memclr/4096-4 805.6n ± 1% 220.9n ± 2% -72.57% (p=0.002 n=6)
Memclr/65536-4 12.734µ ± 1% 3.287µ ± 1% -74.19% (p=0.002 n=6)
Memclr/1M-4 209.1µ ± 0% 105.9µ ± 5% -49.34% (p=0.002 n=6)
Memclr/4M-4 838.9µ ± 6% 418.2µ ± 0% -50.15% (p=0.002 n=6)
Memclr/8M-4 1.708m ± 4% 1.108m ± 4% -35.15% (p=0.002 n=6)
Memclr/16M-4 3.458m ± 1% 2.840m ± 3% -17.88% (p=0.002 n=6)
Memclr/64M-4 14.05m ± 0% 11.40m ± 2% -18.87% (p=0.002 n=6)
MemclrUnaligned/0_5-4 50.57n ± 2% 51.00n ± 0% ~ (p=0.063 n=6)
MemclrUnaligned/0_16-4 48.82n ± 8% 22.39n ± 1% -54.14% (p=0.002 n=6)
MemclrUnaligned/0_64-4 52.73n ± 3% 25.29n ± 0% -52.05% (p=0.002 n=6)
MemclrUnaligned/0_256-4 88.41n ± 1% 50.04n ± 7% -43.41% (p=0.002 n=6)
MemclrUnaligned/0_4096-4 802.2n ± 1% 220.4n ± 1% -72.53% (p=0.002 n=6)
MemclrUnaligned/0_65536-4 12.729µ ± 0% 3.341µ ± 6% -73.76% (p=0.002 n=6)
MemclrUnaligned/1_5-4 50.52n ± 0% 50.99n ± 6% +0.93% (p=0.002 n=6)
MemclrUnaligned/1_16-4 71.23n ± 1% 71.78n ± 1% +0.77% (p=0.041 n=6)
MemclrUnaligned/1_64-4 85.11n ± 0% 76.30n ± 1% -10.36% (p=0.002 n=6)
MemclrUnaligned/1_256-4 133.50n ± 2% 91.91n ± 1% -31.15% (p=0.002 n=6)
MemclrUnaligned/1_4096-4 849.7n ± 0% 291.3n ± 2% -65.72% (p=0.002 n=6)
MemclrUnaligned/1_65536-4 12.776µ ± 1% 3.399µ ± 1% -73.40% (p=0.002 n=6)
MemclrUnaligned/4_5-4 44.34n ± 0% 44.52n ± 7% +0.41% (p=0.022 n=6)
MemclrUnaligned/4_16-4 70.68n ± 0% 71.24n ± 4% ~ (p=0.132 n=6)
MemclrUnaligned/4_64-4 81.83n ± 4% 77.98n ± 2% -4.71% (p=0.002 n=6)
MemclrUnaligned/4_256-4 121.15n ± 3% 87.58n ± 0% -27.71% (p=0.002 n=6)
MemclrUnaligned/4_4096-4 837.0n ± 2% 278.8n ± 3% -66.69% (p=0.002 n=6)
MemclrUnaligned/4_65536-4 12.793µ ± 6% 3.373µ ± 3% -73.64% (p=0.002 n=6)
MemclrUnaligned/7_5-4 43.89n ± 2% 43.10n ± 0% -1.80% (p=0.002 n=6)
MemclrUnaligned/7_16-4 73.59n ± 2% 72.95n ± 1% -0.86% (p=0.006 n=6)
MemclrUnaligned/7_64-4 88.67n ± 0% 78.89n ± 1% -11.03% (p=0.002 n=6)
MemclrUnaligned/7_256-4 123.90n ± 1% 85.41n ± 2% -31.07% (p=0.002 n=6)
MemclrUnaligned/7_4096-4 842.8n ± 2% 268.0n ± 0% -68.20% (p=0.002 n=6)
MemclrUnaligned/7_65536-4 12.877µ ± 11% 3.348µ ± 0% -74.00% (p=0.002 n=6)
MemclrUnaligned/0_1M-4 208.4µ ± 5% 104.6µ ± 1% -49.80% (p=0.002 n=6)
MemclrUnaligned/0_4M-4 836.1µ ± 7% 419.3µ ± 2% -49.85% (p=0.002 n=6)
MemclrUnaligned/0_8M-4 1.701m ± 9% 1.136m ± 12% -33.21% (p=0.002 n=6)
MemclrUnaligned/0_16M-4 3.467m ± 16% 2.832m ± 4% -18.30% (p=0.002 n=6)
MemclrUnaligned/0_64M-4 14.05m ± 2% 11.33m ± 2% -19.38% (p=0.002 n=6)
MemclrUnaligned/1_1M-4 208.8µ ± 4% 104.7µ ± 1% -49.85% (p=0.002 n=6)
MemclrUnaligned/1_4M-4 838.0µ ± 0% 418.3µ ± 2% -50.09% (p=0.002 n=6)
MemclrUnaligned/1_8M-4 1.692m ± 1% 1.108m ± 3% -34.53% (p=0.002 n=6)
MemclrUnaligned/1_16M-4 3.463m ± 20% 2.833m ± 6% -18.21% (p=0.002 n=6)
MemclrUnaligned/1_64M-4 14.05m ± 4% 11.35m ± 2% -19.28% (p=0.002 n=6)
MemclrUnaligned/4_1M-4 209.2µ ± 1% 104.7µ ± 7% -49.94% (p=0.002 n=6)
MemclrUnaligned/4_4M-4 836.2µ ± 6% 418.8µ ± 15% -49.91% (p=0.002 n=6)
MemclrUnaligned/4_8M-4 1.702m ± 0% 1.123m ± 4% -34.01% (p=0.002 n=6)
MemclrUnaligned/4_16M-4 3.476m ± 8% 2.804m ± 2% -19.34% (p=0.002 n=6)
MemclrUnaligned/4_64M-4 14.13m ± 25% 11.40m ± 0% -19.33% (p=0.002 n=6)
MemclrUnaligned/7_1M-4 208.9µ ± 8% 104.9µ ± 6% -49.81% (p=0.002 n=6)
MemclrUnaligned/7_4M-4 845.6µ ± 12% 418.2µ ± 7% -50.54% (p=0.002 n=6)
MemclrUnaligned/7_8M-4 1.706m ± 10% 1.101m ± 3% -35.48% (p=0.002 n=6)
MemclrUnaligned/7_16M-4 3.466m ± 3% 2.812m ± 2% -18.86% (p=0.002 n=6)
MemclrUnaligned/7_64M-4 14.08m ± 5% 11.35m ± 18% -19.37% (p=0.002 n=6)
GoMemclr/5-4 49.79n ± 2% 50.34n ± 0% ~ (p=0.394 n=6)
GoMemclr/16-4 21.64n ± 0% 22.04n ± 7% +1.85% (p=0.002 n=6)
GoMemclr/64-4 47.93n ± 4% 23.77n ± 4% -50.41% (p=0.002 n=6)
GoMemclr/256-4 82.77n ± 2% 43.90n ± 0% -46.96% (p=0.002 n=6)

Change-Id: I272967d001809ac4948e4118df6cdd0e0661ab96
Reviewed-on: https://go-review.googlesource.com/c/go/+/682195
Reviewed-by: Keith Randall <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
Reviewed-by: Michael Knyszek <[email protected]>
Auto-Submit: Michael Knyszek <[email protected]>
1 parent a8edd99 commit b6cf1d9

File tree

1 file changed

+76
-12
lines changed

1 file changed

+76
-12
lines changed

src/runtime/memclr_mips64x.s

Lines changed: 76 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -71,29 +71,93 @@ msa_large_loop:
7171
no_msa:
7272
// if less than 8 bytes, do one byte at a time
7373
SGTU $8, R2, R3
74-
BNE R3, out
74+
BNE R3, check4
7575

76-
// do one byte at a time until 8-aligned
76+
// Check alignment
7777
AND $7, R1, R3
78-
BEQ R3, words
78+
BEQ R3, aligned
79+
80+
// Zero one byte at a time until we reach 8 byte alignment.
81+
MOVV $8, R5
82+
SUBV R3, R5, R3
83+
SUBV R3, R2, R2
84+
align:
85+
SUBV $1, R3
7986
MOVB R0, (R1)
8087
ADDV $1, R1
81-
JMP -4(PC)
88+
BNE R3, align
8289

83-
words:
84-
// do 8 bytes at a time if there is room
85-
ADDV $-7, R4, R2
90+
aligned:
91+
SGTU $8, R2, R3
92+
BNE R3, check4
93+
SGTU $16, R2, R3
94+
BNE R3, zero8
95+
SGTU $32, R2, R3
96+
BNE R3, zero16
97+
SGTU $64, R2, R3
98+
BNE R3, zero32
99+
loop64:
100+
MOVV R0, (R1)
101+
MOVV R0, 8(R1)
102+
MOVV R0, 16(R1)
103+
MOVV R0, 24(R1)
104+
MOVV R0, 32(R1)
105+
MOVV R0, 40(R1)
106+
MOVV R0, 48(R1)
107+
MOVV R0, 56(R1)
108+
ADDV $64, R1
109+
SUBV $64, R2
110+
SGTU $64, R2, R3
111+
BEQ R0, R3, loop64
112+
BEQ R2, done
113+
114+
check32:
115+
SGTU $32, R2, R3
116+
BNE R3, check16
117+
zero32:
118+
MOVV R0, (R1)
119+
MOVV R0, 8(R1)
120+
MOVV R0, 16(R1)
121+
MOVV R0, 24(R1)
122+
ADDV $32, R1
123+
SUBV $32, R2
124+
BEQ R2, done
125+
126+
check16:
127+
SGTU $16, R2, R3
128+
BNE R3, check8
129+
zero16:
130+
MOVV R0, (R1)
131+
MOVV R0, 8(R1)
132+
ADDV $16, R1
133+
SUBV $16, R2
134+
BEQ R2, done
86135

87-
SGTU R2, R1, R3
88-
BEQ R3, out
136+
check8:
137+
SGTU $8, R2, R3
138+
BNE R3, check4
139+
zero8:
89140
MOVV R0, (R1)
90141
ADDV $8, R1
91-
JMP -4(PC)
142+
SUBV $8, R2
143+
BEQ R2, done
92144

93-
out:
145+
check4:
146+
SGTU $4, R2, R3
147+
BNE R3, loop1
148+
zero4:
149+
MOVB R0, (R1)
150+
MOVB R0, 1(R1)
151+
MOVB R0, 2(R1)
152+
MOVB R0, 3(R1)
153+
ADDV $4, R1
154+
SUBV $4, R2
155+
156+
loop1:
94157
BEQ R1, R4, done
95158
MOVB R0, (R1)
96159
ADDV $1, R1
97-
JMP -3(PC)
160+
JMP loop1
98161
done:
99162
RET
163+

0 commit comments

Comments
 (0)