Skip to content

Commit dd3abf6

Browse files
limeidan authored and abner-chenc committed
internal/bytealg: optimize Index/IndexString on loong64
goos: linux goarch: loong64 pkg: bytes cpu: Loongson-3A6000 @ 2500.00MHz | 3a6000.old.txt | 3a6000.new.txt | | sec/op | sec/op vs base | IndexRune/10 23.56n ± 1% 20.42n ± 0% -13.33% (p=0.000 n=10) IndexRune/32 29.91n ± 1% 22.46n ± 0% -24.90% (p=0.000 n=10) IndexRune/4K 102.45n ± 2% 72.66n ± 0% -29.08% (p=0.000 n=10) IndexRune/4M 111.96µ ± 1% 52.50µ ± 1% -53.11% (p=0.000 n=10) IndexRune/64M 3.653m ± 30% 3.633m ± 0% ~ (p=0.143 n=10) IndexRuneASCII/10 8.736n ± 2% 7.206n ± 0% -17.51% (p=0.000 n=10) IndexRuneASCII/32 10.195n ± 2% 8.008n ± 0% -21.45% (p=0.000 n=10) IndexRuneASCII/4K 70.27n ± 2% 52.84n ± 0% -24.80% (p=0.000 n=10) IndexRuneASCII/4M 98.15µ ± 1% 87.87µ ± 1% -10.47% (p=0.000 n=10) IndexRuneASCII/64M 2.028m ± 0% 1.918m ± 2% -5.41% (p=0.000 n=10) IndexRuneUnicode/Latin/10 18.80n ± 1% 13.61n ± 0% -27.59% (p=0.000 n=10) IndexRuneUnicode/Latin/32 28.09n ± 2% 20.82n ± 0% -25.88% (p=0.000 n=10) IndexRuneUnicode/Latin/4K 373.8n ± 1% 357.1n ± 0% -4.47% (p=0.000 n=10) IndexRuneUnicode/Latin/4M 395.8µ ± 0% 381.0µ ± 0% -3.74% (p=0.000 n=10) IndexRuneUnicode/Latin/64M 8.056m ± 0% 7.614m ± 0% -5.49% (p=0.000 n=10) IndexRuneUnicode/Cyrillic/10 23.72n ± 1% 20.42n ± 0% -13.91% (p=0.000 n=10) IndexRuneUnicode/Cyrillic/32 30.20n ± 1% 22.42n ± 0% -25.77% (p=0.000 n=10) IndexRuneUnicode/Cyrillic/4K 1.134µ ± 1% 1.122µ ± 0% -1.06% (p=0.000 n=10) IndexRuneUnicode/Cyrillic/4M 1.160m ± 1% 1.152m ± 0% -0.72% (p=0.005 n=10) IndexRuneUnicode/Cyrillic/64M 20.26m ± 1% 19.61m ± 0% -3.24% (p=0.000 n=10) IndexRuneUnicode/Han/10 30.11n ± 2% 24.82n ± 0% -17.57% (p=0.000 n=10) IndexRuneUnicode/Han/32 36.16n ± 2% 27.20n ± 0% -24.78% (p=0.000 n=10) IndexRuneUnicode/Han/4K 548.1n ± 0% 524.8n ± 0% -4.25% (p=0.000 n=10) IndexRuneUnicode/Han/4M 706.7µ ± 1% 624.0µ ± 0% -11.70% (p=0.000 n=10) IndexRuneUnicode/Han/64M 12.50m ± 1% 10.84m ± 1% -13.24% (p=0.000 n=10) Index/10 42.03n ± 2% 10.01n ± 0% -76.18% (p=0.000 n=10) Index/32 133.15n ± 1% 40.03n ± 0% -69.94% (p=0.000 n=10) Index/4K 11.647µ ± 1% 2.493µ 
± 0% -78.60% (p=0.000 n=10) Index/4M 11.536m ± 0% 2.519m ± 0% -78.16% (p=0.000 n=10) Index/64M 184.60m ± 1% 40.42m ± 0% -78.10% (p=0.000 n=10) IndexEasy/10 17.290n ± 2% 9.608n ± 0% -44.43% (p=0.000 n=10) IndexEasy/32 23.71n ± 2% 16.61n ± 0% -29.95% (p=0.000 n=10) IndexEasy/4K 95.64n ± 2% 68.25n ± 0% -28.64% (p=0.000 n=10) IndexEasy/4M 105.04µ ± 1% 91.94µ ± 0% -12.47% (p=0.000 n=10) IndexEasy/64M 4.280m ± 0% 4.264m ± 0% -0.38% (p=0.002 n=10) Count/10 53.09n ± 1% 16.81n ± 0% -68.33% (p=0.000 n=10) Count/32 142.20n ± 2% 46.44n ± 0% -67.34% (p=0.000 n=10) Count/4K 11.428µ ± 1% 2.500µ ± 1% -78.12% (p=0.000 n=10) Count/4M 11.536m ± 1% 2.520m ± 0% -78.16% (p=0.000 n=10) Count/64M 183.80m ± 1% 40.42m ± 0% -78.01% (p=0.000 n=10) IndexHard1 2906.4µ ± 1% 420.4µ ± 0% -85.54% (p=0.000 n=10) IndexHard2 2918.0µ ± 1% 421.1µ ± 1% -85.57% (p=0.000 n=10) IndexHard3 2912.8µ ± 1% 440.2µ ± 0% -84.89% (p=0.000 n=10) IndexHard4 2909.6µ ± 1% 840.4µ ± 0% -71.12% (p=0.000 n=10) LastIndexHard1 2.939m ± 1% 2.621m ± 0% -10.83% (p=0.000 n=10) LastIndexHard2 2.924m ± 1% 2.624m ± 0% -10.26% (p=0.000 n=10) LastIndexHard3 2.936m ± 1% 2.580m ± 1% -12.12% (p=0.000 n=10) CountHard1 2900.4µ ± 1% 420.0µ ± 0% -85.52% (p=0.000 n=10) CountHard2 2915.6µ ± 1% 420.0µ ± 0% -85.59% (p=0.000 n=10) CountHard3 2905.0µ ± 0% 440.0µ ± 0% -84.85% (p=0.000 n=10) IndexPeriodic/IndexPeriodic2 181.95µ ± 1% 26.28µ ± 0% -85.56% (p=0.000 n=10) IndexPeriodic/IndexPeriodic4 182.59µ ± 1% 26.29µ ± 0% -85.60% (p=0.000 n=10) IndexPeriodic/IndexPeriodic8 183.9µ ± 1% 108.2µ ± 0% -41.14% (p=0.000 n=10) IndexPeriodic/IndexPeriodic16 58.24µ ± 0% 56.58µ ± 0% -2.86% (p=0.000 n=10) IndexPeriodic/IndexPeriodic32 30.82µ ± 0% 29.62µ ± 0% -3.92% (p=0.000 n=10) IndexPeriodic/IndexPeriodic64 16.59µ ± 0% 15.00µ ± 0% -9.62% (p=0.000 n=10) geomean 22.69µ 11.59µ -48.92% Change-Id: Iacc9e686027f99bb0413b566cfc8ee6cd873d2d9 Reviewed-on: https://go-review.googlesource.com/c/go/+/693878 Reviewed-by: abner chenc <[email protected]> Reviewed-by: Dmitri 
Shuralyov <[email protected]> Reviewed-by: Mark Freeman <[email protected]> LUCI-TryBot-Result: Go LUCI <[email protected]>
1 parent 73ff6d1 commit dd3abf6

File tree

4 files changed

+335
-2
lines changed

4 files changed

+335
-2
lines changed

src/internal/bytealg/index_generic.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64
5+
//go:build !amd64 && !arm64 && !loong64 && !s390x && !ppc64le && !ppc64
66

77
package bytealg
88

src/internal/bytealg/index_loong64.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package bytealg
6+
7+
import "internal/cpu"
8+
9+
// MaxBruteForce is a length threshold on s: empirical data shows that
10+
// using Index gives better performance when len(s) <= 16.
11+
const MaxBruteForce = 16
12+
13+
func init() {
14+
// If SIMD (LSX or LASX) is supported, substrings of up to 64 bytes are optimized;
15+
// otherwise only substrings of up to 32 bytes are optimized.
16+
if cpu.Loong64.HasLASX || cpu.Loong64.HasLSX {
17+
MaxLen = 64
18+
} else {
19+
MaxLen = 32
20+
}
21+
}
22+
23+
// Cutover reports the number of failures of IndexByte we should tolerate
24+
// before switching over to Index.
25+
// n is the number of bytes processed so far.
26+
// See the bytes.Index implementation for details.
27+
func Cutover(n int) int {
28+
// 1 error per 8 characters, plus a little slop to start (n/8 + 2 for non-negative n).
29+
return (n + 16) / 8
30+
}

src/internal/bytealg/index_loong64.s

Lines changed: 303 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,303 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
#include "go_asm.h"
6+
#include "textflag.h"
7+
8+
// Index moves the separator pointer and length into R6/R7, the registers
// indexbody takes them in, and then jumps to the shared search body.
TEXT ·Index<ABIInternal>(SB),NOSPLIT,$0-56
9+
MOVV R7, R6 // R6 = separator pointer
10+
MOVV R8, R7 // R7 = separator length
11+
JMP indexbody<>(SB) // jump, not call: indexbody's RET returns to our caller
12+
13+
// IndexString jumps straight to the shared search body without moving any registers.
// NOTE(review): presumably its arguments already arrive in R4-R7 in the layout indexbody expects - confirm against the ABI.
TEXT ·IndexString<ABIInternal>(SB),NOSPLIT,$0-40
14+
JMP indexbody<>(SB) // jump, not call: indexbody's RET returns to our caller
15+
16+
// indexbody searches the string for the first occurrence of the separator.
// input:
17+
// R4 = string pointer
18+
// R5 = string length
19+
// R6 = separator pointer
20+
// R7 = separator length (2 <= len <= 64)
// output:
// R4 = byte index of the first match, or -1 if there is none
21+
TEXT indexbody<>(SB),NOSPLIT,$0
22+
// The main idea is to load 'sep' into dedicated register(s) once,
23+
// so that it does not have to be reloaded
24+
// for each subsequent substring comparison.
// Every loop below advances R4 by one right after loading the head of the
// candidate, so later loads in the same iteration use offsets reduced by 1.
25+
SUBV R7, R5, R8 // R8 = string length - separator length
26+
ADDV R4, R8 // R8 = address of the last position where a match can start
27+
ADDV $1, R4, R9 // R9 = base + 1, so that found can compute R4 - R9 as the match index
28+
29+
MOVV $8, R5 // the string length is no longer needed; R5 becomes scratch for length dispatch
30+
BGE R7, R5, len_gt_or_eq_8
31+
len_2_7:
32+
AND $0x4, R7, R5 // bit 2 set => length is 4..7 (given the documented 2..7 range here)
33+
BNE R5, len_4_7
34+
35+
len_2_3:
36+
AND $0x1, R7, R5 // bit 0 set => length is 3
37+
BNE R5, len_3
38+
39+
len_2:
40+
MOVHU (R6), R10 // R10 = the 2 separator bytes
41+
loop_2:
42+
BLT R8, R4, not_found // R4 has moved past the last possible start
43+
MOVHU (R4), R11
44+
ADDV $1, R4
45+
BNE R10, R11, loop_2
46+
JMP found
47+
48+
len_3:
49+
MOVHU (R6), R10 // separator bytes 0-1
50+
MOVBU 2(R6), R11 // separator byte 2
51+
loop_3:
52+
BLT R8, R4, not_found
53+
MOVHU (R4), R12
54+
ADDV $1, R4
55+
BNE R10, R12, loop_3
56+
MOVBU 1(R4), R13 // candidate byte 2 (R4 was already advanced by 1)
57+
BNE R11, R13, loop_3
58+
JMP found
59+
60+
len_4_7:
61+
AND $0x2, R7, R5 // bit 1 set => length is 6 or 7
62+
BNE R5, len_6_7
63+
AND $0x1, R7, R5 // bit 0 set => length is 5
64+
BNE R5, len_5
65+
len_4:
66+
MOVWU (R6), R10 // R10 = the 4 separator bytes
67+
loop_4:
68+
BLT R8, R4, not_found
69+
MOVWU (R4), R11
70+
ADDV $1, R4
71+
BNE R10, R11, loop_4
72+
JMP found
73+
74+
len_5:
75+
MOVWU (R6), R10 // separator bytes 0-3
76+
MOVBU 4(R6), R11 // separator byte 4
77+
loop_5:
78+
BLT R8, R4, not_found
79+
MOVWU (R4), R12
80+
ADDV $1, R4
81+
BNE R10, R12, loop_5
82+
MOVBU 3(R4), R13 // candidate byte 4
83+
BNE R11, R13, loop_5
84+
JMP found
85+
86+
len_6_7:
87+
AND $0x1, R7, R5 // bit 0 set => length is 7
88+
BNE R5, len_7
89+
len_6:
90+
MOVWU (R6), R10 // separator bytes 0-3
91+
MOVHU 4(R6), R11 // separator bytes 4-5
92+
loop_6:
93+
BLT R8, R4, not_found
94+
MOVWU (R4), R12
95+
ADDV $1, R4
96+
BNE R10, R12, loop_6
97+
MOVHU 3(R4), R13 // candidate bytes 4-5
98+
BNE R11, R13, loop_6
99+
JMP found
100+
101+
len_7:
102+
MOVWU (R6), R10 // separator bytes 0-3
103+
MOVWU 3(R6), R11 // separator bytes 3-6; byte 3 overlaps the first word
104+
loop_7:
105+
BLT R8, R4, not_found
106+
MOVWU (R4), R12
107+
ADDV $1, R4
108+
BNE R10, R12, loop_7
109+
MOVWU 2(R4), R13 // candidate bytes 3-6
110+
BNE R11, R13, loop_7
111+
JMP found
112+
113+
len_gt_or_eq_8:
114+
BEQ R5, R7, len_8 // R5 still holds 8 here
115+
MOVV $17, R5
116+
BGE R7, R5, len_gt_or_eq_17
117+
JMP len_9_16
118+
len_8:
119+
MOVV (R6), R10 // R10 = the 8 separator bytes
120+
loop_8:
121+
BLT R8, R4, not_found
122+
MOVV (R4), R11
123+
ADDV $1, R4
124+
BNE R10, R11, loop_8
125+
JMP found
126+
127+
len_9_16:
128+
MOVV (R6), R10 // first 8 separator bytes
129+
SUBV $8, R7 // R7 = len - 8, offset of the last 8 bytes
130+
MOVV (R6)(R7), R11 // last 8 separator bytes; overlaps the first 8 when len < 16
131+
SUBV $1, R7 // R7 = len - 9, compensating for R4 being advanced before the tail load
132+
loop_9_16:
133+
BLT R8, R4, not_found
134+
MOVV (R4), R12
135+
ADDV $1, R4
136+
BNE R10, R12, loop_9_16
137+
MOVV (R4)(R7), R13 // last 8 candidate bytes
138+
BNE R11, R13, loop_9_16
139+
JMP found
140+
141+
len_gt_or_eq_17:
142+
MOVV $25, R5
143+
BGE R7, R5, len_gt_or_eq_25
144+
len_17_24:
145+
MOVV 0(R6), R10 // separator bytes 0-7
146+
MOVV 8(R6), R11 // separator bytes 8-15
147+
SUBV $8, R7 // R7 = len - 8
148+
MOVV (R6)(R7), R12 // last 8 separator bytes; overlaps the previous word when len < 24
149+
SUBV $1, R7 // R7 = len - 9, for use with the already-advanced R4
150+
loop_17_24:
151+
BLT R8, R4, not_found
152+
MOVV (R4), R13
153+
ADDV $1, R4
154+
BNE R10, R13, loop_17_24
155+
MOVV 7(R4), R14 // candidate bytes 8-15
156+
BNE R11, R14, loop_17_24
157+
MOVV (R4)(R7), R15 // last 8 candidate bytes
158+
BNE R12, R15, loop_17_24
159+
JMP found
160+
161+
len_gt_or_eq_25:
162+
MOVV $33, R5
163+
BGE R7, R5, len_gt_or_eq_33
164+
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R10 // load the HasLSX feature flag
165+
BNE R10, lsx_len_25_32 // take the vector path when the flag is non-zero
166+
len_25_32:
167+
MOVV 0(R6), R10 // separator bytes 0-7
168+
MOVV 8(R6), R11 // separator bytes 8-15
169+
MOVV 16(R6), R12 // separator bytes 16-23
170+
SUBV $8, R7 // R7 = len - 8
171+
MOVV (R6)(R7), R13 // last 8 separator bytes; overlaps the previous word when len < 32
172+
SUBV $1, R7 // R7 = len - 9, for use with the already-advanced R4
173+
loop_25_32:
174+
BLT R8, R4, not_found
175+
MOVV (R4), R14
176+
ADDV $1, R4
177+
BNE R10, R14, loop_25_32
178+
MOVV 7(R4), R15 // candidate bytes 8-15
179+
BNE R11, R15, loop_25_32
180+
MOVV 15(R4), R16 // candidate bytes 16-23
181+
BNE R12, R16, loop_25_32
182+
MOVV (R4)(R7), R17 // last 8 candidate bytes
183+
BNE R13, R17, loop_25_32
184+
JMP found
185+
186+
// On loong64, LSX is included if LASX is supported.
// Vector paths: each compare is followed by a flag test, and the loop restarts
// when the flag is set, so a set flag is treated as "this chunk did not match".
// NOTE(review): VSEQV/VSETANYEQV (and the XV forms) are taken to be a 64-bit-lane
// equality compare followed by an "any lane is zero" test - confirm against the
// loong64 assembler reference.
// lasx_len_25_32 is not the target of any branch in the code shown here.
187+
lasx_len_25_32:
188+
lsx_len_25_32:
189+
VMOVQ 0(R6), V0 // first 16 separator bytes
190+
SUBV $16, R7 // R7 = len - 16
191+
VMOVQ (R6)(R7), V1 // last 16 separator bytes; overlaps the first 16 when len < 32
192+
SUBV $1, R7 // R7 = len - 17, for use with the already-advanced R4
193+
lsx_loop_25_32:
194+
BLT R8, R4, not_found
195+
VMOVQ (R4), V2
196+
ADDV $1, R4
197+
VSEQV V0, V2, V2
198+
VSETANYEQV V2, FCC0
199+
BFPT FCC0, lsx_loop_25_32 // head mismatch: try the next position
200+
201+
VMOVQ (R4)(R7), V3 // last 16 candidate bytes
202+
VSEQV V1, V3, V3
203+
VSETANYEQV V3, FCC1
204+
BFPT FCC1, lsx_loop_25_32 // tail mismatch: try the next position
205+
JMP found
206+
207+
// Lengths 33..64 are dispatched to LASX or LSX code only; there is no scalar path.
// NOTE(review): presumably callers never pass len > 32 without SIMD support - verify against callers.
len_gt_or_eq_33:
208+
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R10 // load the HasLASX feature flag
209+
MOVV $49, R5
210+
BGE R7, R5, len_gt_or_eq_49
211+
len_33_48:
212+
BNE R10, lasx_len_33_48 // LASX available
213+
JMP lsx_len_33_48
214+
215+
len_gt_or_eq_49:
216+
len_49_64:
217+
BNE R10, lasx_len_49_64 // LASX available
218+
JMP lsx_len_49_64
219+
220+
lsx_len_33_48:
221+
VMOVQ 0(R6), V0 // separator bytes 0-15
222+
VMOVQ 16(R6), V1 // separator bytes 16-31
223+
SUBV $16, R7 // R7 = len - 16
224+
VMOVQ (R6)(R7), V2 // last 16 separator bytes; overlaps the previous chunk when len < 48
225+
SUBV $1, R7 // R7 = len - 17, for use with the already-advanced R4
226+
lsx_loop_33_48:
227+
BLT R8, R4, not_found
228+
VMOVQ 0(R4), V3
229+
ADDV $1, R4
230+
VSEQV V0, V3, V3
231+
VSETANYEQV V3, FCC0
232+
BFPT FCC0, lsx_loop_33_48
233+
234+
VMOVQ 15(R4), V4 // candidate bytes 16-31
235+
VSEQV V1, V4, V4
236+
VSETANYEQV V4, FCC1
237+
BFPT FCC1, lsx_loop_33_48
238+
239+
VMOVQ (R4)(R7), V5 // last 16 candidate bytes
240+
VSEQV V2, V5, V5
241+
VSETANYEQV V5, FCC2
242+
BFPT FCC2, lsx_loop_33_48
243+
JMP found
244+
245+
lsx_len_49_64:
246+
VMOVQ 0(R6), V0 // separator bytes 0-15
247+
VMOVQ 16(R6), V1 // separator bytes 16-31
248+
VMOVQ 32(R6), V2 // separator bytes 32-47
249+
SUBV $16, R7 // R7 = len - 16
250+
VMOVQ (R6)(R7), V3 // last 16 separator bytes; overlaps the previous chunk when len < 64
251+
SUBV $1, R7 // R7 = len - 17, for use with the already-advanced R4
252+
lsx_loop_49_64:
253+
BLT R8, R4, not_found
254+
VMOVQ 0(R4), V4
255+
ADDV $1, R4
256+
VSEQV V0, V4, V4
257+
VSETANYEQV V4, FCC0
258+
BFPT FCC0, lsx_loop_49_64
259+
260+
VMOVQ 15(R4), V5 // candidate bytes 16-31
261+
VSEQV V1, V5, V5
262+
VSETANYEQV V5, FCC1
263+
BFPT FCC1, lsx_loop_49_64
264+
265+
VMOVQ 31(R4), V6 // candidate bytes 32-47
266+
VSEQV V2, V6, V6
267+
VSETANYEQV V6, FCC2
268+
BFPT FCC2, lsx_loop_49_64
269+
270+
VMOVQ (R4)(R7), V7 // last 16 candidate bytes
271+
VSEQV V3, V7, V7
272+
VSETANYEQV V7, FCC3
273+
BFPT FCC3, lsx_loop_49_64
274+
JMP found
275+
276+
// With 32-byte loads, the first 32 and the last 32 bytes together cover every
// length from 33 to 64, so both LASX length classes share one loop.
lasx_len_33_48:
277+
lasx_len_49_64:
278+
lasx_len_33_64:
279+
XVMOVQ (R6), X0 // first 32 separator bytes
280+
SUBV $32, R7 // R7 = len - 32
281+
XVMOVQ (R6)(R7), X1 // last 32 separator bytes; overlaps the first 32 when len < 64
282+
SUBV $1, R7 // R7 = len - 33, for use with the already-advanced R4
283+
lasx_loop_33_64:
284+
BLT R8, R4, not_found
285+
XVMOVQ (R4), X2
286+
ADDV $1, R4
287+
XVSEQV X0, X2, X3
288+
XVSETANYEQV X3, FCC0
289+
BFPT FCC0, lasx_loop_33_64
290+
291+
XVMOVQ (R4)(R7), X4 // last 32 candidate bytes
292+
XVSEQV X1, X4, X5
293+
XVSETANYEQV X5, FCC1
294+
BFPT FCC1, lasx_loop_33_64
295+
JMP found
296+
297+
found:
298+
SUBV R9, R4 // R4 = R4 - (base + 1) = index of the match
299+
RET
300+
301+
not_found:
302+
MOVV $-1, R4 // no match: return -1
303+
RET

src/internal/bytealg/index_native.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build amd64 || arm64 || s390x || ppc64le || ppc64
5+
//go:build amd64 || arm64 || loong64 || s390x || ppc64le || ppc64
66

77
package bytealg
88

0 commit comments

Comments
 (0)