Skip to content

Commit 3406a61

Browse files
committed
internal/bytealg: vector implementation of indexbyte for riscv64
Provide a vector implementation of indexbyte for riscv64, which is used
when compiled with the rva23u64 profile, or when vector is detected to be
available. Inputs that are smaller than 24 bytes will continue to use the
non-vector path.

On a Banana Pi F3, with GORISCV64=rva23u64:

                  │ indexbyte.1  │             indexbyte.2             │
                  │    sec/op    │    sec/op     vs base               │
IndexByte/10-8      52.68n ±  0%   47.26n ±  0%  -10.30% (p=0.000 n=10)
IndexByte/32-8      68.62n ±  0%   47.02n ±  0%  -31.49% (p=0.000 n=10)
IndexByte/4K-8     2217.0n ±  0%   420.4n ±  0%  -81.04% (p=0.000 n=10)
IndexByte/4M-8     2624.4µ ±  0%   767.5µ ±  0%  -70.75% (p=0.000 n=10)
IndexByte/64M-8     68.08m ± 10%   47.84m ± 45%  -29.73% (p=0.004 n=10)
geomean             17.03µ         8.073µ        -52.59%

                  │ indexbyte.1  │              indexbyte.2              │
                  │     B/s      │      B/s       vs base                │
IndexByte/10-8      181.0Mi ± 0%    201.8Mi ±  0%   +11.48% (p=0.000 n=10)
IndexByte/32-8      444.7Mi ± 0%    649.1Mi ±  0%   +45.97% (p=0.000 n=10)
IndexByte/4K-8      1.721Gi ± 0%    9.076Gi ±  0%  +427.51% (p=0.000 n=10)
IndexByte/4M-8      1.488Gi ± 0%    5.089Gi ±  0%  +241.93% (p=0.000 n=10)
IndexByte/64M-8     940.3Mi ± 9%   1337.8Mi ± 31%   +42.27% (p=0.004 n=10)
geomean             727.1Mi         1.498Gi        +110.94%

Change-Id: If7b0dbef38d76fa7a2021e4ecaed668a1d4b9783
Reviewed-on: https://go-review.googlesource.com/c/go/+/648856
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Meng Zhuo <[email protected]>
Reviewed-by: Mark Freeman <[email protected]>
Reviewed-by: Mark Ryan <[email protected]>
Reviewed-by: Dmitri Shuralyov <[email protected]>
1 parent 75ea2d0 commit 3406a61

File tree

1 file changed

+41
-19
lines changed

1 file changed

+41
-19
lines changed

src/internal/bytealg/indexbyte_riscv64.s

Lines changed: 41 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5+
#include "asm_riscv64.h"
56
#include "go_asm.h"
67
#include "textflag.h"
78

@@ -11,12 +12,14 @@ TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
1112
// X12 = b_cap (unused)
1213
// X13 = byte to find
1314
AND $0xff, X13, X12 // x12 byte to look for
14-
MOV X10, X13 // store base for later
1515

1616
SLTI $24, X11, X14
17-
ADD X10, X11 // end
18-
BEQZ X14, bigBody
17+
BNEZ X14, small
18+
JMP indexByteBig<>(SB)
1919

20+
small:
21+
MOV X10, X13 // store base for later
22+
ADD X10, X11 // end
2023
SUB $1, X10
2124
loop:
2225
ADD $1, X10
@@ -31,21 +34,19 @@ notfound:
3134
MOV $-1, X10
3235
RET
3336

34-
bigBody:
35-
JMP indexByteBig<>(SB)
36-
3737
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
3838
// X10 = b_base
3939
// X11 = b_len
4040
// X12 = byte to find
41-
4241
AND $0xff, X12 // x12 byte to look for
43-
MOV X10, X13 // store base for later
4442

4543
SLTI $24, X11, X14
46-
ADD X10, X11 // end
47-
BEQZ X14, bigBody
44+
BNEZ X14, small
45+
JMP indexByteBig<>(SB)
4846

47+
small:
48+
MOV X10, X13 // store base for later
49+
ADD X10, X11 // end
4950
SUB $1, X10
5051
loop:
5152
ADD $1, X10
@@ -60,20 +61,41 @@ notfound:
6061
MOV $-1, X10
6162
RET
6263

63-
bigBody:
64-
JMP indexByteBig<>(SB)
65-
6664
TEXT indexByteBig<>(SB),NOSPLIT|NOFRAME,$0
67-
// On entry
65+
// On entry:
6866
// X10 = b_base
69-
// X11 = end
67+
// X11 = b_len (at least 16 bytes)
7068
// X12 = byte to find
71-
// X13 = b_base
72-
// X11 is at least 16 bytes > X10
73-
74-
// On exit
69+
// On exit:
7570
// X10 = index of first instance of sought byte, if found, or -1 otherwise
7671

72+
MOV X10, X13 // store base for later
73+
74+
#ifndef hasV
75+
MOVB internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X5
76+
BEQZ X5, indexbyte_scalar
77+
#endif
78+
79+
PCALIGN $16
80+
vector_loop:
81+
VSETVLI X11, E8, M8, TA, MA, X5
82+
VLE8V (X10), V8
83+
VMSEQVX X12, V8, V0
84+
VFIRSTM V0, X6
85+
BGEZ X6, vector_found
86+
ADD X5, X10
87+
SUB X5, X11
88+
BNEZ X11, vector_loop
89+
JMP notfound
90+
91+
vector_found:
92+
SUB X13, X10
93+
ADD X6, X10
94+
RET
95+
96+
indexbyte_scalar:
97+
ADD X10, X11 // end
98+
7799
// Process the first few bytes until we get to an 8 byte boundary
78100
// No need to check for end here as we have at least 16 bytes in
79101
// the buffer.

0 commit comments

Comments
 (0)