diff --git a/common/bitutil/and_amd64.s b/common/bitutil/and_amd64.s
new file mode 100644
index 000000000000..367c1bc3308e
--- /dev/null
+++ b/common/bitutil/and_amd64.s
@@ -0,0 +1,59 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// inspired by: https://github.com/golang/go/blob/4a3cef2036097d323b6cc0bbe90fc4d8c7588660/src/crypto/internal/fips140/subtle/xor_amd64.s
+
+//go:build !purego
+
+#include "textflag.h"
+
+// func andBytesASM(dst, a, b *byte, n int)
+TEXT ·andBytesASM(SB), NOSPLIT, $0
+	MOVQ dst+0(FP), BX
+	MOVQ a+8(FP), SI
+	MOVQ b+16(FP), CX
+	MOVQ n+24(FP), DX
+	TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned.
+	JNZ not_aligned
+
+aligned:
+	MOVQ $0, AX // position in slices
+
+	PCALIGN $16
+loop16b:
+	MOVOU (SI)(AX*1), X0 // AND 16byte forwards.
+	MOVOU (CX)(AX*1), X1
+	PAND X1, X0
+	MOVOU X0, (BX)(AX*1)
+	ADDQ $16, AX
+	CMPQ DX, AX
+	JNE loop16b
+	RET
+
+	PCALIGN $16
+loop_1b:
+	SUBQ $1, DX // AND 1byte backwards.
+	MOVB (SI)(DX*1), DI
+	MOVB (CX)(DX*1), AX
+	ANDB AX, DI
+	MOVB DI, (BX)(DX*1)
+	TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b.
+	JNZ loop_1b
+	CMPQ DX, $0 // if len is 0, ret.
+	JE ret
+	TESTQ $15, DX // AND 15 & len, if zero jump to aligned.
+	JZ aligned
+
+not_aligned:
+	TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b.
+	JNE loop_1b
+	SUBQ $8, DX // AND 8bytes backwards.
+	MOVQ (SI)(DX*1), DI
+	MOVQ (CX)(DX*1), AX
+	ANDQ AX, DI
+	MOVQ DI, (BX)(DX*1)
+	CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned.
+	JGE aligned
+
+ret:
+	RET
diff --git a/common/bitutil/and_arm64.s b/common/bitutil/and_arm64.s
new file mode 100644
index 000000000000..fd15036ab08f
--- /dev/null
+++ b/common/bitutil/and_arm64.s
@@ -0,0 +1,70 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// inspired by: https://github.com/golang/go/blob/4a3cef2036097d323b6cc0bbe90fc4d8c7588660/src/crypto/internal/fips140/subtle/xor_arm64.s
+
+//go:build !purego
+
+#include "textflag.h"
+
+// func andBytesASM(dst, a, b *byte, n int)
+TEXT ·andBytesASM(SB), NOSPLIT|NOFRAME, $0
+	MOVD dst+0(FP), R0
+	MOVD a+8(FP), R1
+	MOVD b+16(FP), R2
+	MOVD n+24(FP), R3
+	CMP $64, R3
+	BLT tail
+loop_64:
+	VLD1.P 64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VLD1.P 64(R2), [V4.B16, V5.B16, V6.B16, V7.B16]
+	VAND V0.B16, V4.B16, V4.B16
+	VAND V1.B16, V5.B16, V5.B16
+	VAND V2.B16, V6.B16, V6.B16
+	VAND V3.B16, V7.B16, V7.B16
+	VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(R0)
+	SUBS $64, R3
+	CMP $64, R3
+	BGE loop_64
+tail:
+	// quick end
+	CBZ R3, end
+	TBZ $5, R3, less_than32
+	VLD1.P 32(R1), [V0.B16, V1.B16]
+	VLD1.P 32(R2), [V2.B16, V3.B16]
+	VAND V0.B16, V2.B16, V2.B16
+	VAND V1.B16, V3.B16, V3.B16
+	VST1.P [V2.B16, V3.B16], 32(R0)
+less_than32:
+	TBZ $4, R3, less_than16
+	LDP.P 16(R1), (R11, R12)
+	LDP.P 16(R2), (R13, R14)
+	AND R11, R13, R13
+	AND R12, R14, R14
+	STP.P (R13, R14), 16(R0)
+less_than16:
+	TBZ $3, R3, less_than8
+	MOVD.P 8(R1), R11
+	MOVD.P 8(R2), R12
+	AND R11, R12, R12
+	MOVD.P R12, 8(R0)
+less_than8:
+	TBZ $2, R3, less_than4
+	MOVWU.P 4(R1), R13
+	MOVWU.P 4(R2), R14
+	ANDW R13, R14, R14
+	MOVWU.P R14, 4(R0)
+less_than4:
+	TBZ $1, R3, less_than2
+	MOVHU.P 2(R1), R15
+	MOVHU.P 2(R2), R16
+	ANDW R15, R16, R16
+	MOVHU.P R16, 2(R0)
+less_than2:
+	TBZ $0, R3, end
+	MOVBU (R1), R17
+	MOVBU (R2), R19
+	ANDW R17, R19, R19
+	MOVBU R19, (R0)
+end:
+	RET
diff --git a/common/bitutil/and_asm.go b/common/bitutil/and_asm.go
new file mode 100644
index 000000000000..44a72a0b58ae
--- /dev/null
+++ b/common/bitutil/and_asm.go
@@ -0,0 +1,20 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// inspired by: https://github.com/golang/go/blob/4a3cef2036097d323b6cc0bbe90fc4d8c7588660/src/crypto/internal/fips140/subtle/xor_asm.go
+
+//go:build (amd64 || arm64) && !purego
+
+package bitutil
+
+func andBytes(dst, a, b []byte) int {
+	n := min(len(a), len(b))
+	if n == 0 {
+		return 0
+	}
+	andBytesASM(&dst[0], &a[0], &b[0], n)
+	return n
+}
+
+//go:noescape
+func andBytesASM(dst, a, b *byte, n int)
diff --git a/common/bitutil/and_generic.go b/common/bitutil/and_generic.go
new file mode 100644
index 000000000000..cde901e20baa
--- /dev/null
+++ b/common/bitutil/and_generic.go
@@ -0,0 +1,38 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (!amd64 && !arm64) || purego
+
+package bitutil
+
+import "unsafe"
+
+func andBytes(dst, a, b []byte) int {
+	if supportsUnaligned {
+		return fastANDBytes(dst, a, b)
+	}
+	return safeANDBytes(dst, a, b)
+}
+
+// fastANDBytes ands in bulk. It only works on architectures that support
+// unaligned read/writes.
+func fastANDBytes(dst, a, b []byte) int {
+	n := len(a)
+	if len(b) < n {
+		n = len(b)
+	}
+	w := n / wordSize
+	if w > 0 {
+		dw := *(*[]uintptr)(unsafe.Pointer(&dst))
+		aw := *(*[]uintptr)(unsafe.Pointer(&a))
+		bw := *(*[]uintptr)(unsafe.Pointer(&b))
+		for i := 0; i < w; i++ {
+			dw[i] = aw[i] & bw[i]
+		}
+	}
+	for i := n - n%wordSize; i < n; i++ {
+		dst[i] = a[i] & b[i]
+	}
+	return n
+}
diff --git a/common/bitutil/bitutil.go b/common/bitutil/bitutil.go
index a18a6d18eed8..c27aefabe61b 100644
--- a/common/bitutil/bitutil.go
+++ b/common/bitutil/bitutil.go
@@ -62,32 +62,7 @@ func safeXORBytes(dst, a, b []byte) int {
 // ANDBytes ands the bytes in a and b. The destination is assumed to have enough
 // space. Returns the number of bytes and'd.
 func ANDBytes(dst, a, b []byte) int {
-	if supportsUnaligned {
-		return fastANDBytes(dst, a, b)
-	}
-	return safeANDBytes(dst, a, b)
-}
-
-// fastANDBytes ands in bulk. It only works on architectures that support
-// unaligned read/writes.
-func fastANDBytes(dst, a, b []byte) int {
-	n := len(a)
-	if len(b) < n {
-		n = len(b)
-	}
-	w := n / wordSize
-	if w > 0 {
-		dw := *(*[]uintptr)(unsafe.Pointer(&dst))
-		aw := *(*[]uintptr)(unsafe.Pointer(&a))
-		bw := *(*[]uintptr)(unsafe.Pointer(&b))
-		for i := 0; i < w; i++ {
-			dw[i] = aw[i] & bw[i]
-		}
-	}
-	for i := n - n%wordSize; i < n; i++ {
-		dst[i] = a[i] & b[i]
-	}
-	return n
+	return andBytes(dst, a, b)
 }
 
 // safeANDBytes ands one by one. It works on all architectures, independent if
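
Not part of the diff above: a minimal cross-check sketch of how the new ANDBytes dispatch (assembly-backed andBytes on amd64/arm64, generic fallback elsewhere) could be validated against a plain byte-by-byte reference, over lengths that hit the 64/32/16/8/4/2/1-byte tail cases of the assembly paths. The test name and the deterministic fill pattern are illustrative assumptions, not code from this change.

package bitutil

import (
	"bytes"
	"testing"
)

// TestANDBytesAgainstReference is an illustrative sketch (not part of the
// change above): it compares ANDBytes with a naive byte-by-byte AND over
// lengths chosen to exercise every tail case of the amd64/arm64 assembly.
func TestANDBytesAgainstReference(t *testing.T) {
	for _, n := range []int{0, 1, 2, 3, 7, 8, 15, 16, 17, 31, 32, 63, 64, 65, 127, 128, 1023} {
		a := make([]byte, n)
		b := make([]byte, n)
		for i := 0; i < n; i++ { // deterministic, arbitrary fill
			a[i] = byte(3*i + 1)
			b[i] = byte(7*i + 5)
		}
		want := make([]byte, n)
		for i := 0; i < n; i++ {
			want[i] = a[i] & b[i]
		}
		got := make([]byte, n)
		if done := ANDBytes(got, a, b); done != n {
			t.Fatalf("len %d: ANDBytes returned %d", n, done)
		}
		if !bytes.Equal(got, want) {
			t.Fatalf("len %d: result mismatch", n)
		}
	}
}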