Skip to content

Commit 990af9e

Browse files
committed
common/bitutil: optimize ANDBytes
1 parent 5d51208 commit 990af9e

File tree

5 files changed

+188
-26
lines changed

5 files changed

+188
-26
lines changed

common/bitutil/and_amd64.s

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
// Copyright 2018 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
// inspired by: https://github.com/golang/go/blob/4a3cef2036097d323b6cc0bbe90fc4d8c7588660/src/crypto/internal/fips140/subtle/xor_amd64.s
5+
6+
//go:build !purego
7+
8+
#include "textflag.h"
9+
10+
// func andBytesASM(dst, a, b *byte, n int)
//
// andBytesASM stores a[i] & b[i] into dst[i] for the n bytes starting at each
// pointer. Register use: BX = dst, SI = a, CX = b, DX = n (remaining length).
//
// When n is a multiple of 16 the whole range is handled by loop16b. Otherwise
// the tail is consumed backwards (single bytes, then at most one 8-byte word)
// until the remaining length is a multiple of 16, and loop16b then finishes
// the front of the buffers.
//
// n must be non-zero: for n == 0 control falls into loop16b, which processes
// 16 bytes before its exit comparison and never sees AX == DX.
11+
TEXT ·andBytesASM(SB), NOSPLIT, $0
12+
MOVQ dst+0(FP), BX
13+
MOVQ a+8(FP), SI
14+
MOVQ b+16(FP), CX
15+
MOVQ n+24(FP), DX
16+
TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned.
17+
JNZ not_aligned
18+
19+
// Every path that reaches this label has DX as a non-zero multiple of 16
// (given n != 0), so the forward loop below ends exactly at AX == DX.
aligned:
20+
MOVQ $0, AX // position in slices
21+
22+
PCALIGN $16
23+
loop16b:
24+
MOVOU (SI)(AX*1), X0 // AND 16byte forwards.
25+
MOVOU (CX)(AX*1), X1
26+
PAND X1, X0
27+
MOVOU X0, (BX)(AX*1)
28+
ADDQ $16, AX
29+
CMPQ DX, AX
30+
JNE loop16b
31+
RET
32+
33+
PCALIGN $16
34+
// Byte-at-a-time tail: DX is decremented first and then used as the index,
// so the last unprocessed byte is handled on each iteration.
loop_1b:
35+
SUBQ $1, DX // AND 1byte backwards.
36+
MOVB (SI)(DX*1), DI
37+
MOVB (CX)(DX*1), AX
38+
ANDB AX, DI
39+
MOVB DI, (BX)(DX*1)
40+
TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b.
41+
JNZ loop_1b
42+
CMPQ DX, $0 // if len is 0, ret.
43+
JE ret
44+
TESTQ $15, DX // AND 15 & len, if zero jump to aligned.
45+
JZ aligned
46+
47+
// Reached either from the entry check (n not a multiple of 16) or by falling
// through from loop_1b with DX a multiple of 8 but not of 16.
not_aligned:
48+
TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b.
49+
JNE loop_1b
50+
// DX is 8 mod 16 here: handle the last 8 bytes as one word, which leaves
// DX a multiple of 16.
SUBQ $8, DX // AND 8bytes backwards.
51+
MOVQ (SI)(DX*1), DI
52+
MOVQ (CX)(DX*1), AX
53+
ANDQ AX, DI
54+
MOVQ DI, (BX)(DX*1)
55+
CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned.
56+
JGE aligned
57+
58+
// Falling through from the comparison above means DX is 0: nothing is left.
ret:
59+
RET

common/bitutil/and_arm64.s

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
// Copyright 2020 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
// inspired by: https://github.com/golang/go/blob/4a3cef2036097d323b6cc0bbe90fc4d8c7588660/src/crypto/internal/fips140/subtle/xor_arm64.s
5+
6+
//go:build !purego
7+
8+
#include "textflag.h"
9+
10+
// func andBytesASM(dst, a, b *byte, n int)
//
// andBytesASM stores a[i] & b[i] into dst[i] for the n bytes starting at each
// pointer. Register use: R0 = dst, R1 = a, R2 = b, R3 = n (remaining length).
//
// Blocks of 64 bytes are handled in loop_64 while at least 64 bytes remain.
// The remainder (< 64) is then handled by testing bits 5..0 of R3 and
// processing 32, 16, 8, 4, 2 and 1 bytes for each bit that is set. The .P
// (post-indexed) loads and stores advance R0/R1/R2 past the bytes they touch,
// so each step continues where the previous one stopped.
// n == 0 is handled: the CBZ at tail branches straight to end.
11+
TEXT ·andBytesASM(SB), NOSPLIT|NOFRAME, $0
12+
MOVD dst+0(FP), R0
13+
MOVD a+8(FP), R1
14+
MOVD b+16(FP), R2
15+
MOVD n+24(FP), R3
16+
CMP $64, R3
17+
BLT tail
18+
// 64 bytes per iteration: four 16-byte vectors from each input.
loop_64:
19+
VLD1.P 64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]
20+
VLD1.P 64(R2), [V4.B16, V5.B16, V6.B16, V7.B16]
21+
VAND V0.B16, V4.B16, V4.B16
22+
VAND V1.B16, V5.B16, V5.B16
23+
VAND V2.B16, V6.B16, V6.B16
24+
VAND V3.B16, V7.B16, V7.B16
25+
VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(R0)
26+
SUBS $64, R3
27+
CMP $64, R3
28+
BGE loop_64
29+
// Fewer than 64 bytes remain from here on.
tail:
30+
// quick end
31+
CBZ R3, end
32+
// Bit 5 of the remaining length set: 32 bytes via two vectors per input.
TBZ $5, R3, less_than32
33+
VLD1.P 32(R1), [V0.B16, V1.B16]
34+
VLD1.P 32(R2), [V2.B16, V3.B16]
35+
VAND V0.B16, V2.B16, V2.B16
36+
VAND V1.B16, V3.B16, V3.B16
37+
VST1.P [V2.B16, V3.B16], 32(R0)
38+
less_than32:
39+
// Bit 4 set: 16 bytes via a pair of 64-bit registers per input.
TBZ $4, R3, less_than16
40+
LDP.P 16(R1), (R11, R12)
41+
LDP.P 16(R2), (R13, R14)
42+
AND R11, R13, R13
43+
AND R12, R14, R14
44+
STP.P (R13, R14), 16(R0)
45+
less_than16:
46+
// Bit 3 set: one 8-byte word.
TBZ $3, R3, less_than8
47+
MOVD.P 8(R1), R11
48+
MOVD.P 8(R2), R12
49+
AND R11, R12, R12
50+
MOVD.P R12, 8(R0)
51+
less_than8:
52+
// Bit 2 set: one 4-byte word.
TBZ $2, R3, less_than4
53+
MOVWU.P 4(R1), R13
54+
MOVWU.P 4(R2), R14
55+
ANDW R13, R14, R14
56+
MOVWU.P R14, 4(R0)
57+
less_than4:
58+
// Bit 1 set: one 2-byte halfword.
TBZ $1, R3, less_than2
59+
MOVHU.P 2(R1), R15
60+
MOVHU.P 2(R2), R16
61+
ANDW R15, R16, R16
62+
MOVHU.P R16, 2(R0)
63+
less_than2:
64+
// Bit 0 set: the final single byte (no post-increment needed).
TBZ $0, R3, end
65+
MOVBU (R1), R17
66+
MOVBU (R2), R19
67+
ANDW R17, R19, R19
68+
MOVBU R19, (R0)
69+
end:
70+
RET

common/bitutil/and_asm.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
// Copyright 2018 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
// inspired by: https://github.com/golang/go/blob/4a3cef2036097d323b6cc0bbe90fc4d8c7588660/src/crypto/internal/fips140/subtle/xor_asm.go
5+
6+
//go:build (amd64 || arm64) && !purego
7+
8+
package bitutil
9+
10+
func andBytes(dst, a, b []byte) int {
11+
n := min(len(a), len(b))
12+
if n == 0 {
13+
return 0
14+
}
15+
andBytesASM(&dst[0], &a[0], &b[0], n)
16+
return n
17+
}
18+
19+
// andBytesASM is the assembly kernel behind andBytes (see and_amd64.s and
// and_arm64.s). It stores a[i] & b[i] into dst[i] for 0 <= i < n and does no
// bounds checking, so all three pointers must reference at least n bytes.
// andBytes only calls it with n > 0.
//
//go:noescape
20+
func andBytesASM(dst, a, b *byte, n int)

common/bitutil/and_generic.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// Copyright 2013 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build (!amd64 && !arm64) || purego
6+
7+
package bitutil
8+
9+
import "unsafe"
10+
11+
func andBytes(dst, a, b []byte) int {
12+
if supportsUnaligned {
13+
return fastANDBytes(dst, a, b)
14+
}
15+
return safeANDBytes(dst, a, b)
16+
}
17+
18+
// fastANDBytes ands in bulk. It only works on architectures that support
19+
// unaligned read/writes.
20+
func fastANDBytes(dst, a, b []byte) int {
21+
n := len(a)
22+
if len(b) < n {
23+
n = len(b)
24+
}
25+
w := n / wordSize
26+
if w > 0 {
27+
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
28+
aw := *(*[]uintptr)(unsafe.Pointer(&a))
29+
bw := *(*[]uintptr)(unsafe.Pointer(&b))
30+
for i := 0; i < w; i++ {
31+
dw[i] = aw[i] & bw[i]
32+
}
33+
}
34+
for i := n - n%wordSize; i < n; i++ {
35+
dst[i] = a[i] & b[i]
36+
}
37+
return n
38+
}

common/bitutil/bitutil.go

Lines changed: 1 addition & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -62,32 +62,7 @@ func safeXORBytes(dst, a, b []byte) int {
6262
// ANDBytes ands the bytes in a and b. The destination is assumed to have enough
6363
// space. Returns the number of bytes and'd.
6464
func ANDBytes(dst, a, b []byte) int {
65-
if supportsUnaligned {
66-
return fastANDBytes(dst, a, b)
67-
}
68-
return safeANDBytes(dst, a, b)
69-
}
70-
71-
// fastANDBytes ands in bulk. It only works on architectures that support
72-
// unaligned read/writes.
73-
func fastANDBytes(dst, a, b []byte) int {
74-
n := len(a)
75-
if len(b) < n {
76-
n = len(b)
77-
}
78-
w := n / wordSize
79-
if w > 0 {
80-
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
81-
aw := *(*[]uintptr)(unsafe.Pointer(&a))
82-
bw := *(*[]uintptr)(unsafe.Pointer(&b))
83-
for i := 0; i < w; i++ {
84-
dw[i] = aw[i] & bw[i]
85-
}
86-
}
87-
for i := n - n%wordSize; i < n; i++ {
88-
dst[i] = a[i] & b[i]
89-
}
90-
return n
65+
return andBytes(dst, a, b)
9166
}
9267

9368
// safeANDBytes ands one by one. It works on all architectures, independent if

0 commit comments

Comments
 (0)