Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# GitHub's Linguist doesn't properly classify many languages
# https://github.com/github-linguist/linguist/blob/main/docs/overrides.md
*.h linguist-language=C
*.c linguist-language=C
*.hpp linguist-language=C++
*.cpp linguist-language=C++
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
build/
build_debug/
build_release/
build_go/
build_golang/
build_artifacts*

# Yes, everyone loves keeping this file in the history.
Expand Down
2 changes: 2 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@
"Needleman",
"newfunc",
"NOARGS",
"nocallback",
"noescape",
"noexcept",
"NOMINMAX",
"NOTIMPLEMENTED",
Expand Down
37 changes: 37 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,43 @@ cargo package --list --allow-dirty

If you want to run benchmarks against third-party implementations, check out the [`ashvardanian/memchr_vs_stringzilla`](https://github.com/ashvardanian/memchr_vs_stringzilla/) repository.

## Contributing in GoLang

First, precompile the C library:

```bash
cmake -D STRINGZILLA_BUILD_SHARED=1 -D STRINGZILLA_BUILD_TEST=0 -D STRINGZILLA_BUILD_BENCHMARK=0 -B build_golang
cmake --build build_golang
```

Then, navigate to the GoLang module root directory and run the tests from there:

```bash
cd golang
CGO_CFLAGS="-I$(pwd)/../include" \
CGO_LDFLAGS="-L$(pwd)/../build_golang -lstringzilla_shared" \
LD_LIBRARY_PATH="$(pwd)/../build_golang:$LD_LIBRARY_PATH" \
go test
```

To benchmark:

```bash
cd golang
CGO_CFLAGS="-I$(pwd)/../include" \
CGO_LDFLAGS="-L$(pwd)/../build_golang -lstringzilla_shared" \
LD_LIBRARY_PATH="$(pwd)/../build_golang:$LD_LIBRARY_PATH" \
go run ../scripts/bench.go --input ../leipzig1M.txt
```

Alternatively:

```bash
export GO111MODULE="off"
go run scripts/test.go
go run scripts/bench.go
```

## General Recommendations

### Operations Not Worth Optimizing
Expand Down
3 changes: 3 additions & 0 deletions golang/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module github.com/ashvardanian/stringzilla/golang

go 1.24
173 changes: 173 additions & 0 deletions golang/lib.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
// StringZilla is a SIMD-accelerated string library for modern CPUs, written in C 99,
// and using AVX2, AVX512, Arm NEON, and SVE intrinsics to accelerate processing.
//
// The GoLang binding is intended to provide a simple interface to a precompiled
// shared library, available on GitHub: https://github.com/ashvardanian/stringzilla
//
// It requires Go 1.24 or newer to leverage the cgo `noescape` and `nocallback`
// directives. Without those, the latency of calling C functions from Go is too high
// to be useful for string processing.
//
// Unlike the native Go `strings` package, StringZilla primarily targets byte-level
// binary data processing, with less emphasis on UTF-8 and locale-specific tasks.
package sz

// #cgo CFLAGS: -O3
// #cgo LDFLAGS: -L. -L/usr/local/lib -lstringzilla_shared
// #cgo noescape sz_find
// #cgo nocallback sz_find
// #cgo noescape sz_find_byte
// #cgo nocallback sz_find_byte
// #cgo noescape sz_rfind
// #cgo nocallback sz_rfind
// #cgo noescape sz_rfind_byte
// #cgo nocallback sz_rfind_byte
// #cgo noescape sz_find_char_from
// #cgo nocallback sz_find_char_from
// #cgo noescape sz_rfind_char_from
// #cgo nocallback sz_rfind_char_from
// #define SZ_DYNAMIC_DISPATCH 1
// #include <stringzilla/stringzilla.h>
import "C"
import "unsafe"

// Contains reports whether `substr` is within `str`.
//
// An empty `substr` is always reported as contained, matching the behavior of
// `strings.Contains`; that case is answered here without calling into C.
// https://pkg.go.dev/strings#Contains
func Contains(str string, substr string) bool {
	substrLen := len(substr)
	if substrLen == 0 {
		return true
	}
	strPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(str)))
	strLen := len(str)
	substrPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(substr)))
	// A non-nil result is the address of the first match inside `str`.
	matchPtr := unsafe.Pointer(C.sz_find(strPtr, C.ulong(strLen), substrPtr, C.ulong(substrLen)))
	return matchPtr != nil
}

// Index returns the byte offset of the first occurrence of `substr` in `str`,
// or -1 when `substr` does not occur. An empty `substr` matches at offset 0.
// https://pkg.go.dev/strings#Index
func Index(str string, substr string) int64 {
	if len(substr) == 0 {
		return 0
	}
	haystack := (*C.char)(unsafe.Pointer(unsafe.StringData(str)))
	needle := (*C.char)(unsafe.Pointer(unsafe.StringData(substr)))
	found := unsafe.Pointer(C.sz_find(haystack, C.ulong(len(str)), needle, C.ulong(len(substr))))
	if found != nil {
		// The offset is the distance between the match and the start of `str`.
		return int64(uintptr(found) - uintptr(unsafe.Pointer(haystack)))
	}
	return -1
}

// LastIndex returns the byte offset of the last occurrence of `substr` in `str`,
// or -1 when `substr` does not occur. An empty `substr` yields `len(str)`.
// https://pkg.go.dev/strings#LastIndex
func LastIndex(str string, substr string) int64 {
	haystackLen := int64(len(str))
	if len(substr) == 0 {
		return haystackLen
	}
	haystack := (*C.char)(unsafe.Pointer(unsafe.StringData(str)))
	needle := (*C.char)(unsafe.Pointer(unsafe.StringData(substr)))
	found := unsafe.Pointer(C.sz_rfind(haystack, C.ulong(haystackLen), needle, C.ulong(len(substr))))
	if found != nil {
		// The offset is the distance between the match and the start of `str`.
		return int64(uintptr(found) - uintptr(unsafe.Pointer(haystack)))
	}
	return -1
}

// IndexByte returns the byte offset of the first occurrence of `c` in `str`,
// or -1 when `c` does not occur.
// https://pkg.go.dev/strings#IndexByte
func IndexByte(str string, c byte) int64 {
	haystack := (*C.char)(unsafe.Pointer(unsafe.StringData(str)))
	// The needle is handed to C by address, so point at the local copy of `c`.
	needle := (*C.char)(unsafe.Pointer(&c))
	found := unsafe.Pointer(C.sz_find_byte(haystack, C.ulong(len(str)), needle))
	if found != nil {
		return int64(uintptr(found) - uintptr(unsafe.Pointer(haystack)))
	}
	return -1
}

// LastIndexByte returns the byte offset of the last occurrence of `c` in `str`,
// or -1 when `c` does not occur.
// https://pkg.go.dev/strings#LastIndexByte
func LastIndexByte(str string, c byte) int64 {
	haystack := (*C.char)(unsafe.Pointer(unsafe.StringData(str)))
	// The needle is handed to C by address, so point at the local copy of `c`.
	needle := (*C.char)(unsafe.Pointer(&c))
	found := unsafe.Pointer(C.sz_rfind_byte(haystack, C.ulong(len(str)), needle))
	if found != nil {
		return int64(uintptr(found) - uintptr(unsafe.Pointer(haystack)))
	}
	return -1
}

// IndexAny returns the byte offset of the first byte in `str` that also appears
// in `substr`, or -1 when no byte of `substr` is present in `str`.
// https://pkg.go.dev/strings#IndexAny
func IndexAny(str string, substr string) int64 {
	haystack := (*C.char)(unsafe.Pointer(unsafe.StringData(str)))
	charset := (*C.char)(unsafe.Pointer(unsafe.StringData(substr)))
	found := unsafe.Pointer(C.sz_find_char_from(haystack, C.ulong(len(str)), charset, C.ulong(len(substr))))
	if found != nil {
		// The offset is the distance between the match and the start of `str`.
		return int64(uintptr(found) - uintptr(unsafe.Pointer(haystack)))
	}
	return -1
}

// LastIndexAny returns the byte offset of the last byte in `str` that also appears
// in `substr`, or -1 when no byte of `substr` is present in `str`.
// https://pkg.go.dev/strings#LastIndexAny
func LastIndexAny(str string, substr string) int64 {
	haystack := (*C.char)(unsafe.Pointer(unsafe.StringData(str)))
	charset := (*C.char)(unsafe.Pointer(unsafe.StringData(substr)))
	found := unsafe.Pointer(C.sz_rfind_char_from(haystack, C.ulong(len(str)), charset, C.ulong(len(substr))))
	if found != nil {
		// The offset is the distance between the match and the start of `str`.
		return int64(uintptr(found) - uintptr(unsafe.Pointer(haystack)))
	}
	return -1
}

// Count returns the number of instances of `substr` in `str`.
//
// When `overlap` is true, the search resumes one byte after the start of each
// match, so overlapping instances are counted. When it is false, the search
// resumes right after the end of each match, so only non-overlapping instances
// are counted.
//
// If `substr` is an empty string, returns 1 + the length of the `str`,
// including for an empty `str`, where the result is 1.
// https://pkg.go.dev/strings#Count
func Count(str string, substr string, overlap bool) int64 {
	strLen := int64(len(str))
	substrLen := int64(len(substr))

	// The empty-needle rule must be applied before any length-based early exit,
	// otherwise an empty `str` would wrongly report 0 instead of 1.
	if substrLen == 0 {
		return 1 + strLen
	}
	if strLen < substrLen {
		return 0
	}

	basePtr := unsafe.Pointer(unsafe.StringData(str))
	substrPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(substr)))

	// Distance from the start of a match to where the next search begins.
	step := substrLen
	if overlap {
		step = 1
	}

	// Progress is tracked as an integer offset, so a pointer is only ever formed
	// for positions inside `str`; Go's unsafe rules do not permit a pointer one
	// past the end of an allocation.
	count := int64(0)
	offset := int64(0)
	for strLen-offset >= substrLen {
		curPtr := unsafe.Add(basePtr, offset)
		matchPtr := unsafe.Pointer(C.sz_find((*C.char)(curPtr), C.ulong(strLen-offset), substrPtr, C.ulong(substrLen)))
		if matchPtr == nil {
			break
		}
		count++
		offset += int64(uintptr(matchPtr)-uintptr(curPtr)) + step
	}

	return count
}
Loading