Skip to content

Commit 762cc41

Browse files
authored
Update benchmarks and add WithCopyStrings option (#36)
Fixes #35
1 parent 66da154 commit 762cc41

File tree

9 files changed

+270
-113
lines changed

9 files changed

+270
-113
lines changed

README.md

Lines changed: 187 additions & 64 deletions
Large diffs are not rendered by default.

benchmarks_test.go

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,20 +29,35 @@ func benchmarkFromFile(b *testing.B, filename string) {
2929
}
3030
msg := loadCompressed(b, filename)
3131

32-
b.SetBytes(int64(len(msg)))
33-
b.ReportAllocs()
34-
b.ResetTimer()
35-
36-
pj := &ParsedJson{}
37-
38-
for i := 0; i < b.N; i++ {
39-
// Reset tape
40-
var err error
41-
pj, err = Parse(msg, pj)
42-
if err != nil {
43-
b.Fatal(err)
32+
b.Run("copy", func(b *testing.B) {
33+
pj := &ParsedJson{}
34+
b.SetBytes(int64(len(msg)))
35+
b.ReportAllocs()
36+
b.ResetTimer()
37+
for i := 0; i < b.N; i++ {
38+
// Reset tape
39+
var err error
40+
pj, err = Parse(msg, pj, WithCopyStrings(true))
41+
if err != nil {
42+
b.Fatal(err)
43+
}
4444
}
45-
}
45+
})
46+
b.Run("nocopy", func(b *testing.B) {
47+
pj := &ParsedJson{}
48+
b.SetBytes(int64(len(msg)))
49+
b.ReportAllocs()
50+
b.ResetTimer()
51+
for i := 0; i < b.N; i++ {
52+
// Reset tape
53+
var err error
54+
pj, err = Parse(msg, pj, WithCopyStrings(false))
55+
if err != nil {
56+
b.Fatal(err)
57+
}
58+
}
59+
})
60+
4661
}
4762

4863
func BenchmarkApache_builds(b *testing.B) { benchmarkFromFile(b, "apache_builds") }

ndjson_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ func verifyDemoNdjson(pj internalParsedJson, t *testing.T, object int) {
241241
//}
242242
//fmt.Printf("{%s, 0x%x},\n", c, tp&0xffffffffffffff)
243243
expected := tc.expected[ii].val | (uint64(tc.expected[ii].c) << 56)
244-
if !alwaysCopyStrings && tp != expected {
244+
if !pj.copyStrings && tp != expected {
245245
t.Errorf("verifyDemoNdjson(%d): got: %016x want: %016x", ii, tp, expected)
246246
}
247247
}

options.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
package simdjson
2+
3+
// ParserOption is a parser option.
4+
type ParserOption func(pj *internalParsedJson) error
5+
6+
// WithCopyStrings will copy strings so they no longer reference the input.
7+
// For enhanced performance, simdjson-go can point back into the original JSON buffer for strings,
8+
// however this can lead to issues in streaming use cases, or in scenarios in which
9+
// the underlying JSON buffer is reused. So the default behaviour is to create copies of all
10+
// strings (not just those transformed anyway for unicode escape characters) into the separate
11+
// Strings buffer (at the expense of higher memory use and lower performance).
12+
// Default: true - strings are copied.
13+
func WithCopyStrings(b bool) ParserOption {
14+
return func(pj *internalParsedJson) error {
15+
pj.copyStrings = b
16+
return nil
17+
}
18+
}

parse_string_amd64.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,14 @@ func _parse_string(src, dst, pcurrent_string_buf_loc unsafe.Pointer) (res uint64
3333

3434
// Disable new -d=checkptr behaviour for Go 1.14
3535
//go:nocheckptr
36-
func parseStringSimdValidateOnly(buf []byte, maxStringSize, dst_length *uint64, need_copy *bool) bool {
36+
func parseStringSimdValidateOnly(buf []byte, maxStringSize, dstLength *uint64, needCopy *bool) bool {
3737

3838
src := uintptr(unsafe.Pointer(&buf[1])) // Use buf[1] in order to skip opening quote
3939
src_length := uint64(0)
4040

41-
success := _parse_string_validate_only(unsafe.Pointer(src), unsafe.Pointer(&maxStringSize), unsafe.Pointer(&src_length), unsafe.Pointer(dst_length))
41+
success := _parse_string_validate_only(unsafe.Pointer(src), unsafe.Pointer(&maxStringSize), unsafe.Pointer(&src_length), unsafe.Pointer(dstLength))
4242

43-
*need_copy = alwaysCopyStrings || src_length != *dst_length
43+
*needCopy = *needCopy || src_length != *dstLength
4444
return success != 0
4545
}
4646

parsed_json.go

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,6 @@ import (
2626
"strconv"
2727
)
2828

29-
//
30-
// For enhanced performance, simdjson-go can point back into the original JSON buffer for strings,
31-
// however this can lead to issues in streaming use cases scenarios, or scenarios in which
32-
// the underlying JSON buffer is reused. So the default behaviour is to create copies of all
33-
// strings (not just those transformed anyway for unicode escape characters) into the separate
34-
// Strings buffer (at the expense of using more memory and less performance).
35-
//
36-
const alwaysCopyStrings = true
37-
3829
const JSONVALUEMASK = 0xffffffffffffff
3930
const JSONTAGMASK = 0xff << 56
4031
const STRINGBUFBIT = 0x80000000000000
@@ -96,6 +87,7 @@ type internalParsedJson struct {
9687
buffers [indexSlots][indexSize]uint32
9788
buffersOffset uint64
9889
ndjson uint64
90+
copyStrings bool
9991
}
10092

10193
// Iter returns a new Iter.

simdjson_amd64.go

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,7 @@ func SupportedCPU() bool {
3737
return cpuid.CPU.Supports(cpuid.AVX2, cpuid.CLMUL)
3838
}
3939

40-
// Parse a block of data and return the parsed JSON.
41-
// An optional block of previously parsed json can be supplied to reduce allocations.
42-
func Parse(b []byte, reuse *ParsedJson) (*ParsedJson, error) {
40+
func newInternalParsedJson(reuse *ParsedJson, opts []ParserOption) (*internalParsedJson, error) {
4341
if !SupportedCPU() {
4442
return nil, errors.New("Host CPU does not meet target specs")
4543
}
@@ -53,7 +51,23 @@ func Parse(b []byte, reuse *ParsedJson) (*ParsedJson, error) {
5351
if pj == nil {
5452
pj = &internalParsedJson{}
5553
}
56-
err := pj.parseMessage(b)
54+
pj.copyStrings = true
55+
for _, opt := range opts {
56+
if err := opt(pj); err != nil {
57+
return nil, err
58+
}
59+
}
60+
return pj, nil
61+
}
62+
63+
// Parse a block of data and return the parsed JSON.
64+
// An optional block of previously parsed json can be supplied to reduce allocations.
65+
func Parse(b []byte, reuse *ParsedJson, opts ...ParserOption) (*ParsedJson, error) {
66+
pj, err := newInternalParsedJson(reuse, opts)
67+
if err != nil {
68+
return nil, err
69+
}
70+
err = pj.parseMessage(b)
5771
if err != nil {
5872
return nil, err
5973
}
@@ -64,17 +78,12 @@ func Parse(b []byte, reuse *ParsedJson) (*ParsedJson, error) {
6478

6579
// ParseND will parse newline delimited JSON.
6680
// An optional block of previously parsed json can be supplied to reduce allocations.
67-
func ParseND(b []byte, reuse *ParsedJson) (*ParsedJson, error) {
68-
if !SupportedCPU() {
69-
return nil, errors.New("Host CPU does not meet target specs")
70-
}
71-
var pj internalParsedJson
72-
if reuse != nil {
73-
pj.ParsedJson = *reuse
81+
func ParseND(b []byte, reuse *ParsedJson, opts ...ParserOption) (*ParsedJson, error) {
82+
pj, err := newInternalParsedJson(reuse, opts)
83+
if err != nil {
84+
return nil, err
7485
}
75-
b = bytes.TrimSpace(b)
76-
77-
err := pj.parseMessageNdjson(b)
86+
err = pj.parseMessageNdjson(bytes.TrimSpace(b))
7887
if err != nil {
7988
return nil, err
8089
}
@@ -166,6 +175,7 @@ func ParseNDStream(r io.Reader, res chan<- Stream, reuse <-chan *ParsedJson) {
166175
queue <- result
167176
go func() {
168177
var pj internalParsedJson
178+
pj.copyStrings = true
169179
select {
170180
case v := <-reuse:
171181
if cap(v.Message) >= tmpSize+1024 {

stage2_build_tape_amd64.go

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,9 +71,8 @@ func peekSize(pj *internalParsedJson) uint64 {
7171
return uint64(pj.indexesChan.indexes[pj.indexesChan.index])
7272
}
7373

74-
func parseString(pj *ParsedJson, idx uint64, maxStringSize uint64) bool {
74+
func parseString(pj *ParsedJson, idx uint64, maxStringSize uint64, needCopy bool) bool {
7575
size := uint64(0)
76-
need_copy := false
7776
buf := pj.Message[idx:]
7877
// Make sure that we have at least one full YMM word available after maxStringSize into the buffer
7978
if len(buf)-int(maxStringSize) < 64 {
@@ -87,10 +86,10 @@ func parseString(pj *ParsedJson, idx uint64, maxStringSize uint64) bool {
8786
buf = paddedBuf[:]
8887
}
8988
}
90-
if !parseStringSimdValidateOnly(buf, &maxStringSize, &size, &need_copy) {
89+
if !parseStringSimdValidateOnly(buf, &maxStringSize, &size, &needCopy) {
9190
return false
9291
}
93-
if !need_copy {
92+
if !needCopy {
9493
pj.write_tape(idx+1, '"')
9594
} else {
9695
// Make sure we account for at least 32 bytes additional space due to
@@ -237,7 +236,7 @@ object_begin:
237236
}
238237
switch buf[idx] {
239238
case '"':
240-
if !parseString(&pj.ParsedJson, idx, peekSize(pj)) {
239+
if !parseString(&pj.ParsedJson, idx, peekSize(pj), pj.copyStrings) {
241240
goto fail
242241
}
243242
goto object_key_state
@@ -259,7 +258,7 @@ object_key_state:
259258
}
260259
switch buf[idx] {
261260
case '"':
262-
if !parseString(&pj.ParsedJson, idx, peekSize(pj)) {
261+
if !parseString(&pj.ParsedJson, idx, peekSize(pj), pj.copyStrings) {
263262
goto fail
264263
}
265264

@@ -319,7 +318,7 @@ objectContinue:
319318
if buf[idx] != '"' {
320319
goto fail
321320
}
322-
if !parseString(&pj.ParsedJson, idx, peekSize(pj)) {
321+
if !parseString(&pj.ParsedJson, idx, peekSize(pj), pj.copyStrings) {
323322
goto fail
324323
}
325324
goto object_key_state
@@ -365,7 +364,7 @@ mainArraySwitch:
365364
// on paths that can accept a close square brace (post-, and at start)
366365
switch buf[idx] {
367366
case '"':
368-
if !parseString(&pj.ParsedJson, idx, peekSize(pj)) {
367+
if !parseString(&pj.ParsedJson, idx, peekSize(pj), pj.copyStrings) {
369368
goto fail
370369
}
371370
case 't':

stage2_build_tape_amd64_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ func TestStage2BuildTape(t *testing.T) {
184184
//}
185185
//fmt.Printf("{%s, 0x%x},\n", c, tp&0xffffffffffffff)
186186
expected := tc.expected[ii].val | (uint64(tc.expected[ii].c) << 56)
187-
if !alwaysCopyStrings && tp != expected {
187+
if !pj.copyStrings && tp != expected {
188188
t.Errorf("TestStage2BuildTape(%d): got: %d want: %d", ii, tp, expected)
189189
}
190190
}

0 commit comments

Comments
 (0)