diff --git a/common/bitutil/and_amd64.s b/common/bitutil/and_amd64.s
new file mode 100644
index 000000000000..367c1bc3308e
--- /dev/null
+++ b/common/bitutil/and_amd64.s
@@ -0,0 +1,59 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// inspired by: https://github.com/golang/go/blob/4a3cef2036097d323b6cc0bbe90fc4d8c7588660/src/crypto/internal/fips140/subtle/xor_amd64.s
+
+//go:build !purego
+
+#include "textflag.h"
+
+// func andBytesASM(dst, a, b *byte, n int)
+TEXT ·andBytesASM(SB), NOSPLIT, $0
+	MOVQ  dst+0(FP), BX
+	MOVQ  a+8(FP), SI
+	MOVQ  b+16(FP), CX
+	MOVQ  n+24(FP), DX
+	TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned.
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, AX // position in slices
+
+	PCALIGN $16
+loop16b:
+	MOVOU (SI)(AX*1), X0 // AND 16byte forwards.
+	MOVOU (CX)(AX*1), X1
+	PAND  X1, X0
+	MOVOU X0, (BX)(AX*1)
+	ADDQ  $16, AX
+	CMPQ  DX, AX
+	JNE   loop16b
+	RET
+
+	PCALIGN $16
+loop_1b:
+	SUBQ  $1, DX // AND 1byte backwards.
+	MOVB  (SI)(DX*1), DI
+	MOVB  (CX)(DX*1), AX
+	ANDB  AX, DI
+	MOVB  DI, (BX)(DX*1)
+	TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b.
+	JNZ   loop_1b
+	CMPQ  DX, $0 // if len is 0, ret.
+	JE    ret
+	TESTQ $15, DX // AND 15 & len, if zero jump to aligned.
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b.
+	JNE   loop_1b
+	SUBQ  $8, DX // AND 8bytes backwards.
+	MOVQ  (SI)(DX*1), DI
+	MOVQ  (CX)(DX*1), AX
+	ANDQ  AX, DI
+	MOVQ  DI, (BX)(DX*1)
+	CMPQ  DX, $16 // if len is greater or equal 16 here, it must be aligned.
+	JGE   aligned
+
+ret:
+	RET
diff --git a/common/bitutil/bitutil.go b/common/bitutil/bitutil.go
index a18a6d18eed8..aead508e715c
--- a/common/bitutil/bitutil.go
+++ b/common/bitutil/bitutil.go
@@ -149,31 +149,7 @@ func safeORBytes(dst, a, b []byte) int {
 
 // TestBytes tests whether any bit is set in the input byte slice.
 func TestBytes(p []byte) bool {
-	if supportsUnaligned {
-		return fastTestBytes(p)
-	}
-	return safeTestBytes(p)
-}
-
-// fastTestBytes tests for set bits in bulk. It only works on architectures that
-// support unaligned read/writes.
-func fastTestBytes(p []byte) bool {
-	n := len(p)
-	w := n / wordSize
-	if w > 0 {
-		pw := *(*[]uintptr)(unsafe.Pointer(&p))
-		for i := 0; i < w; i++ {
-			if pw[i] != 0 {
-				return true
-			}
-		}
-	}
-	for i := n - n%wordSize; i < n; i++ {
-		if p[i] != 0 {
-			return true
-		}
-	}
-	return false
+	return testBytes(p)
 }
 
 // safeTestBytes tests for set bits one byte at a time. It works on all
diff --git a/common/bitutil/test_amd64.s b/common/bitutil/test_amd64.s
new file mode 100644
index 000000000000..d7c6efcd6de1
--- /dev/null
+++ b/common/bitutil/test_amd64.s
@@ -0,0 +1,63 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// inspired by: https://github.com/golang/go/blob/4a3cef2036097d323b6cc0bbe90fc4d8c7588660/src/crypto/internal/fips140/subtle/xor_amd64.s
+
+//go:build !purego
+
+#include "textflag.h"
+
+// func testBytesASM(p *byte, n int) bool
+TEXT ·testBytesASM(SB), NOSPLIT, $0
+	MOVQ  p+0(FP), SI
+	MOVQ  n+8(FP), DX
+	TESTQ DX, DX // if len is 0, return false.
+	JZ    not_found
+	TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned.
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, AX // position in slice
+
+	PCALIGN $16
+loop16b:
+	MOVOU (SI)(AX*1), X0 // Load 16 bytes.
+	PTEST X0, X0         // Test if all bits are zero (ZF=1 if all zero).
+	JNZ   found          // If any bit is set (ZF=0), jump to found.
+	ADDQ  $16, AX
+	CMPQ  DX, AX
+	JNE   loop16b
+	JMP   not_found
+
+	PCALIGN $16
+loop_1b:
+	SUBQ  $1, DX // Test 1 byte backwards.
+	MOVB  (SI)(DX*1), DI
+	TESTB DI, DI // Test if byte is non-zero.
+	JNZ   found
+	TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b.
+	JNZ   loop_1b
+	CMPQ  DX, $0 // if len is 0, ret.
+	JE    not_found
+	TESTQ $15, DX // AND 15 & len, if zero jump to aligned.
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b.
+	JNE   loop_1b
+	SUBQ  $8, DX // Test 8 bytes backwards.
+	MOVQ  (SI)(DX*1), DI
+	TESTQ DI, DI // Test if 8 bytes are non-zero.
+	JNZ   found
+	CMPQ  DX, $16 // if len is greater or equal 16 here, it must be aligned.
+	JGE   aligned
+	JMP   not_found
+
+not_found:
+	MOVB $0, ret+16(FP)
+	RET
+
+found:
+	MOVB $1, ret+16(FP)
+	RET
+
diff --git a/common/bitutil/test_arm64.s b/common/bitutil/test_arm64.s
new file mode 100644
index 000000000000..049ca4548457
--- /dev/null
+++ b/common/bitutil/test_arm64.s
@@ -0,0 +1,68 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// inspired by: https://github.com/golang/go/blob/4a3cef2036097d323b6cc0bbe90fc4d8c7588660/src/crypto/internal/fips140/subtle/xor_arm64.s
+
+//go:build !purego
+
+#include "textflag.h"
+
+// func testBytesASM(p *byte, n int) bool
+TEXT ·testBytesASM(SB), NOSPLIT|NOFRAME, $0
+	MOVD p+0(FP), R0
+	MOVD n+8(FP), R1
+	CMP  $64, R1
+	BLT  tail
loop_64:
+	VLD1.P 64(R0), [V0.B16, V1.B16, V2.B16, V3.B16]
+	// OR all vectors together to check if any byte is non-zero.
+	VORR V0.B16, V1.B16, V4.B16
+	VORR V2.B16, V3.B16, V5.B16
+	VORR V4.B16, V5.B16, V6.B16
+	// Check if any byte in V6 is non-zero by checking both 64-bit halves.
+	VMOV V6.D[0], R2
+	VMOV V6.D[1], R3
+	ORR  R2, R3, R2
+	CBNZ R2, found
+	SUBS $64, R1
+	CMP  $64, R1
+	BGE  loop_64
+tail:
+	// quick end
+	CBZ R1, not_found
+	TBZ $5, R1, less_than32
+	VLD1.P 32(R0), [V0.B16, V1.B16]
+	VORR V0.B16, V1.B16, V2.B16
+	VMOV V2.D[0], R2
+	VMOV V2.D[1], R3
+	ORR  R2, R3, R2
+	CBNZ R2, found
+less_than32:
+	TBZ $4, R1, less_than16
+	LDP.P 16(R0), (R11, R12)
+	ORR  R11, R12, R2
+	CBNZ R2, found
+less_than16:
+	TBZ $3, R1, less_than8
+	MOVD.P 8(R0), R11
+	CBNZ R11, found
+less_than8:
+	TBZ $2, R1, less_than4
+	MOVWU.P 4(R0), R11
+	CBNZ R11, found
+less_than4:
+	TBZ $1, R1, less_than2
+	MOVHU.P 2(R0), R11
+	CBNZ R11, found
+less_than2:
+	TBZ $0, R1, not_found
+	MOVBU (R0), R11
+	CBNZ R11, found
+not_found:
+	MOVD $0, R0
+	MOVB R0, ret+16(FP)
+	RET
+found:
+	MOVD $1, R0
+	MOVB R0, ret+16(FP)
+	RET
diff --git a/common/bitutil/test_asm.go b/common/bitutil/test_asm.go
new file mode 100644
index 000000000000..2f28d0ed0e14
--- /dev/null
+++ b/common/bitutil/test_asm.go
@@ -0,0 +1,18 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// inspired by: https://github.com/golang/go/blob/4a3cef2036097d323b6cc0bbe90fc4d8c7588660/src/crypto/internal/fips140/subtle/xor_asm.go
+
+//go:build (amd64 || arm64) && !purego
+
+package bitutil
+
+func testBytes(p []byte) bool {
+	if len(p) == 0 {
+		return false // taking &p[0] of an empty slice would panic
+	}
+	return testBytesASM(&p[0], len(p))
+}
+
+//go:noescape
+func testBytesASM(p *byte, n int) bool
diff --git a/common/bitutil/test_generic.go b/common/bitutil/test_generic.go
new file mode 100644
index 000000000000..124d82ee266e
--- /dev/null
+++ b/common/bitutil/test_generic.go
@@ -0,0 +1,37 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (!amd64 && !arm64) || purego
+
+package bitutil
+
+import "unsafe"
+
+func testBytes(p []byte) bool {
+	if supportsUnaligned {
+		return fastTestBytes(p)
+	}
+	return safeTestBytes(p)
+}
+
+// fastTestBytes tests for set bits in bulk. It only works on architectures that
+// support unaligned read/writes.
+func fastTestBytes(p []byte) bool {
+	n := len(p)
+	w := n / wordSize
+	if w > 0 {
+		pw := *(*[]uintptr)(unsafe.Pointer(&p))
+		for i := 0; i < w; i++ {
+			if pw[i] != 0 {
+				return true
+			}
+		}
+	}
+	for i := n - n%wordSize; i < n; i++ {
+		if p[i] != 0 {
+			return true
+		}
+	}
+	return false
+}
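
Since the patch swaps the generic word-at-a-time scan for hand-written SIMD on amd64 and arm64, the natural safety net is a differential test against the portable fallback that stays in the package. The sketch below is hypothetical (the test name and the set of lengths are mine, not part of the diff); it uses only identifiers the diff itself provides: `testBytes`, the per-platform entry point, and `safeTestBytes`, the byte-at-a-time reference that remains in bitutil.go. The lengths are chosen to exercise every tail path in the assembly: the 1-byte backwards loop, the 8-byte step, and the aligned 16-byte (amd64) / 64-byte (arm64) vector loops.

```go
package bitutil

import "testing"

// TestTestBytesReference is a hypothetical differential test: it cross-checks
// the platform testBytes (assembly-backed on amd64/arm64) against the portable
// safeTestBytes, at lengths around every boundary the assembly special-cases.
func TestTestBytesReference(t *testing.T) {
	for _, n := range []int{0, 1, 7, 8, 9, 15, 16, 17, 24, 31, 32, 63, 64, 65, 127, 128, 1024} {
		p := make([]byte, n)

		// All-zero input must report false on both paths.
		if got, want := testBytes(p), safeTestBytes(p); got != want {
			t.Fatalf("len=%d all-zero: testBytes=%v safeTestBytes=%v", n, got, want)
		}

		// Set one bit at each byte position in turn and re-check, so a lane
		// dropped by the vector code cannot go unnoticed.
		for i := 0; i < n; i++ {
			p[i] = 0x80
			if got, want := testBytes(p), safeTestBytes(p); got != want {
				t.Fatalf("len=%d bit at %d: testBytes=%v safeTestBytes=%v", n, i, got, want)
			}
			p[i] = 0
		}
	}
}
```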
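Since the point of the change is throughput, a benchmark is worth carrying alongside it. Again a sketch under assumptions (the name and the 16 KiB size are mine); all-zero input is the worst case, because `TestBytes` can never exit early and must scan the whole slice.

```go
package bitutil

import "testing"

// BenchmarkTestBytes16K measures the dispatching TestBytes on a 16 KiB
// all-zero slice. Hypothetical benchmark, not part of the diff above.
func BenchmarkTestBytes16K(b *testing.B) {
	p := make([]byte, 16*1024)
	b.SetBytes(int64(len(p)))
	for i := 0; i < b.N; i++ {
		TestBytes(p)
	}
}
```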