Skip to content

Commit 75ea2d0

Browse files
committed
internal/bytealg: vector implementation of equal for riscv64
Provide a vector implementation of equal for riscv64, which is used when compiled with the rva23u64 profile, or when vector is detected to be available. Inputs that are 8 byte aligned will still be handled via the non-vector code if the length is less than or equal to 64 bytes. On a Banana Pi F3, with GORISCV64=rva23u64: │ equal.1 │ equal.2 │ │ sec/op │ sec/op vs base │ Equal/0-8 1.254n ± 0% 1.254n ± 0% ~ (p=1.000 n=10) Equal/same/1-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.466 n=10) Equal/same/6-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.689 n=10) Equal/same/9-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.861 n=10) Equal/same/15-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.657 n=10) Equal/same/16-8 21.32n ± 0% 21.33n ± 0% ~ (p=0.075 n=10) Equal/same/20-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.249 n=10) Equal/same/32-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.303 n=10) Equal/same/4K-8 21.32n ± 0% 21.32n ± 0% ~ (p=1.000 n=10) Equal/same/4M-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.582 n=10) Equal/same/64M-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.930 n=10) Equal/1-8 39.16n ± 1% 38.71n ± 0% -1.15% (p=0.000 n=10) Equal/6-8 51.49n ± 1% 50.40n ± 1% -2.12% (p=0.000 n=10) Equal/9-8 54.46n ± 1% 53.89n ± 0% -1.04% (p=0.000 n=10) Equal/15-8 71.81n ± 1% 70.59n ± 0% -1.71% (p=0.000 n=10) Equal/16-8 69.14n ± 0% 68.21n ± 0% -1.34% (p=0.000 n=10) Equal/20-8 78.59n ± 0% 77.59n ± 0% -1.26% (p=0.000 n=10) Equal/32-8 41.55n ± 0% 41.16n ± 0% -0.96% (p=0.000 n=10) Equal/4K-8 925.5n ± 0% 561.4n ± 1% -39.34% (p=0.000 n=10) Equal/4M-8 3.110m ± 32% 2.463m ± 16% -20.80% (p=0.000 n=10) Equal/64M-8 47.34m ± 30% 39.89m ± 16% -15.75% (p=0.004 n=10) EqualBothUnaligned/64_0-8 32.17n ± 1% 32.11n ± 1% ~ (p=0.184 n=10) EqualBothUnaligned/64_1-8 79.48n ± 0% 48.24n ± 1% -39.31% (p=0.000 n=10) EqualBothUnaligned/64_4-8 72.71n ± 0% 48.37n ± 1% -33.48% (p=0.000 n=10) EqualBothUnaligned/64_7-8 77.12n ± 0% 48.16n ± 1% -37.56% (p=0.000 n=10) EqualBothUnaligned/4096_0-8 908.4n ± 0% 562.4n ± 2% -38.09% (p=0.000 n=10) EqualBothUnaligned/4096_1-8 956.6n ± 0% 571.4n ± 3% -40.26% (p=0.000 
n=10) EqualBothUnaligned/4096_4-8 949.6n ± 0% 571.6n ± 3% -39.81% (p=0.000 n=10) EqualBothUnaligned/4096_7-8 954.2n ± 0% 571.7n ± 3% -40.09% (p=0.000 n=10) EqualBothUnaligned/4194304_0-8 2.935m ± 29% 2.664m ± 19% ~ (p=0.089 n=10) EqualBothUnaligned/4194304_1-8 3.341m ± 13% 2.896m ± 34% ~ (p=0.075 n=10) EqualBothUnaligned/4194304_4-8 3.204m ± 39% 3.352m ± 33% ~ (p=0.796 n=10) EqualBothUnaligned/4194304_7-8 3.226m ± 30% 2.737m ± 34% -15.16% (p=0.043 n=10) EqualBothUnaligned/67108864_0-8 49.04m ± 17% 39.94m ± 12% -18.57% (p=0.005 n=10) EqualBothUnaligned/67108864_1-8 51.96m ± 15% 42.48m ± 15% -18.23% (p=0.015 n=10) EqualBothUnaligned/67108864_4-8 47.67m ± 17% 37.85m ± 41% -20.61% (p=0.035 n=10) EqualBothUnaligned/67108864_7-8 53.00m ± 22% 38.76m ± 21% -26.87% (p=0.000 n=10) CompareBytesEqual-8 51.71n ± 1% 52.00n ± 0% +0.57% (p=0.002 n=10) geomean 1.469µ 1.265µ -13.93% │ equal.1 │ equal.2 │ │ B/s │ B/s vs base │ Equal/same/1-8 44.73Mi ± 0% 44.72Mi ± 0% ~ (p=0.426 n=10) Equal/same/6-8 268.3Mi ± 0% 268.4Mi ± 0% ~ (p=0.753 n=10) Equal/same/9-8 402.6Mi ± 0% 402.5Mi ± 0% ~ (p=0.209 n=10) Equal/same/15-8 670.9Mi ± 0% 670.9Mi ± 0% ~ (p=0.724 n=10) Equal/same/16-8 715.6Mi ± 0% 715.4Mi ± 0% -0.04% (p=0.022 n=10) Equal/same/20-8 894.6Mi ± 0% 894.5Mi ± 0% ~ (p=0.060 n=10) Equal/same/32-8 1.398Gi ± 0% 1.398Gi ± 0% ~ (p=0.986 n=10) Equal/same/4K-8 178.9Gi ± 0% 178.9Gi ± 0% ~ (p=0.853 n=10) Equal/same/4M-8 178.9Ti ± 0% 178.9Ti ± 0% ~ (p=0.971 n=10) Equal/same/64M-8 2862.8Ti ± 0% 2862.6Ti ± 0% ~ (p=0.971 n=10) Equal/1-8 24.35Mi ± 1% 24.63Mi ± 0% +1.16% (p=0.000 n=10) Equal/6-8 111.1Mi ± 1% 113.5Mi ± 1% +2.17% (p=0.000 n=10) Equal/9-8 157.6Mi ± 1% 159.3Mi ± 0% +1.05% (p=0.000 n=10) Equal/15-8 199.2Mi ± 1% 202.7Mi ± 0% +1.74% (p=0.000 n=10) Equal/16-8 220.7Mi ± 0% 223.7Mi ± 0% +1.36% (p=0.000 n=10) Equal/20-8 242.7Mi ± 0% 245.8Mi ± 0% +1.27% (p=0.000 n=10) Equal/32-8 734.3Mi ± 0% 741.6Mi ± 0% +0.98% (p=0.000 n=10) Equal/4K-8 4.122Gi ± 0% 6.795Gi ± 1% +64.84% (p=0.000 n=10) Equal/4M-8 
1.258Gi ± 24% 1.586Gi ± 14% +26.12% (p=0.000 n=10) Equal/64M-8 1.320Gi ± 23% 1.567Gi ± 14% +18.69% (p=0.004 n=10) EqualBothUnaligned/64_0-8 1.853Gi ± 1% 1.856Gi ± 1% ~ (p=0.190 n=10) EqualBothUnaligned/64_1-8 767.9Mi ± 0% 1265.2Mi ± 1% +64.76% (p=0.000 n=10) EqualBothUnaligned/64_4-8 839.4Mi ± 0% 1261.9Mi ± 1% +50.33% (p=0.000 n=10) EqualBothUnaligned/64_7-8 791.4Mi ± 0% 1267.5Mi ± 1% +60.16% (p=0.000 n=10) EqualBothUnaligned/4096_0-8 4.199Gi ± 0% 6.784Gi ± 2% +61.54% (p=0.000 n=10) EqualBothUnaligned/4096_1-8 3.988Gi ± 0% 6.676Gi ± 3% +67.40% (p=0.000 n=10) EqualBothUnaligned/4096_4-8 4.017Gi ± 0% 6.674Gi ± 3% +66.14% (p=0.000 n=10) EqualBothUnaligned/4096_7-8 3.998Gi ± 0% 6.673Gi ± 3% +66.92% (p=0.000 n=10) EqualBothUnaligned/4194304_0-8 1.332Gi ± 22% 1.468Gi ± 16% ~ (p=0.089 n=10) EqualBothUnaligned/4194304_1-8 1.169Gi ± 12% 1.350Gi ± 25% ~ (p=0.075 n=10) EqualBothUnaligned/4194304_4-8 1.222Gi ± 28% 1.165Gi ± 48% ~ (p=0.796 n=10) EqualBothUnaligned/4194304_7-8 1.211Gi ± 23% 1.427Gi ± 26% +17.88% (p=0.043 n=10) EqualBothUnaligned/67108864_0-8 1.274Gi ± 14% 1.567Gi ± 14% +22.97% (p=0.005 n=10) EqualBothUnaligned/67108864_1-8 1.204Gi ± 14% 1.471Gi ± 13% +22.18% (p=0.015 n=10) EqualBothUnaligned/67108864_4-8 1.311Gi ± 14% 1.651Gi ± 29% +25.92% (p=0.035 n=10) EqualBothUnaligned/67108864_7-8 1.179Gi ± 18% 1.612Gi ± 17% +36.73% (p=0.000 n=10) geomean 1.870Gi 2.190Gi +17.16% Change-Id: I9c5270bcc6997d020a96d1e97c7e7cfc7ca7fd34 Reviewed-on: https://go-review.googlesource.com/c/go/+/646736 Reviewed-by: Mark Ryan <[email protected]> Reviewed-by: Meng Zhuo <[email protected]> LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: Dmitri Shuralyov <[email protected]> Reviewed-by: Mark Freeman <[email protected]>
1 parent 17a8be7 commit 75ea2d0

File tree

2 files changed

+36
-4
lines changed

2 files changed

+36
-4
lines changed

src/internal/bytealg/bytealg.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,18 @@ import (
1111

1212
// Offsets into internal/cpu records for use in assembly.
1313
const (
14-
offsetX86HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42)
15-
offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
16-
offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
14+
offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9)
15+
16+
offsetRISCV64HasV = unsafe.Offsetof(cpu.RISCV64.HasV)
1717

1818
offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX)
1919
offsetLOONG64HasLASX = unsafe.Offsetof(cpu.Loong64.HasLASX)
2020

2121
offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX)
2222

23-
offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9)
23+
offsetX86HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42)
24+
offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
25+
offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
2426
)
2527

2628
// MaxLen is the maximum length of the string to be searched for (argument b) in Index.

src/internal/bytealg/equal_riscv64.s

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5+
#include "asm_riscv64.h"
56
#include "go_asm.h"
67
#include "textflag.h"
78

@@ -28,6 +29,35 @@ length_check:
2829
MOV $32, X23
2930
BLT X12, X23, loop4_check
3031

32+
#ifndef hasV
33+
MOVB internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X5
34+
BEQZ X5, equal_scalar
35+
#endif
36+
37+
// Use vector if either input is not 8 byte aligned.
38+
OR X10, X11, X5
39+
AND $7, X5
40+
BNEZ X5, vector_loop
41+
42+
// Use scalar if both inputs are 8 byte aligned and the length is <= 64 bytes.
43+
SUB $64, X12, X6
44+
BLEZ X6, loop32_check
45+
46+
PCALIGN $16
47+
vector_loop:
48+
VSETVLI X12, E8, M8, TA, MA, X5
49+
VLE8V (X10), V8
50+
VLE8V (X11), V16
51+
VMSNEVV V8, V16, V0
52+
VFIRSTM V0, X6
53+
BGEZ X6, done
54+
ADD X5, X10
55+
ADD X5, X11
56+
SUB X5, X12
57+
BNEZ X12, vector_loop
58+
JMP done
59+
60+
equal_scalar:
3161
// Check alignment - if alignment differs we have to do one byte at a time.
3262
AND $7, X10, X9
3363
AND $7, X11, X19

0 commit comments

Comments
 (0)