Skip to content

Commit c66cb85

Browse files
authored
Indicate float conversion due to overflows (#31)
Add a flag to the tape that indicates that an integer was converted to float due to int64/uint64 limits. Fixes #25
1 parent 3d975b7 commit c66cb85

10 files changed

+204
-79
lines changed

README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,28 @@ method to get an iterator.
133133
There are methods that allow you to retrieve all elements as a single type,
134134
[]int64, []uint64, float64 and strings.
135135

136+
## Number parsing
137+
138+
Numbers in JSON are untyped and are returned by the following rules in order:
139+
140+
* If there is any floating point notation, such as an exponent or a decimal point, it is always returned as float.
141+
* If the number is a pure integer and fits within an int64, it is returned as such.
142+
* If the number is a pure positive integer and fits within a uint64, it is returned as such.
143+
* Otherwise, if the number is a valid number, it is returned as float64.
144+
145+
If the number was converted from integer notation to a float because it did not fit inside int64/uint64,
146+
the `FloatOverflowedInteger` flag is set, which can be retrieved using the `(Iter).FloatFlags()` method.
147+
148+
JSON numbers follow JavaScript’s double-precision floating-point format.
149+
150+
* Represented in base 10 with no superfluous leading zeros (e.g. 67, 1, 100).
151+
* Include digits between 0 and 9.
152+
* Can be a negative number (e.g. -10).
153+
* Can be a fraction (e.g. 0.5).
154+
* Can also have an exponent of 10, prefixed by e or E, optionally followed by a plus or minus sign to indicate positive or negative exponentiation.
155+
* Octal and hexadecimal formats are not supported.
156+
* Cannot have a value of NaN (Not A Number) or Infinity.
157+
136158
## Parsing NDJSON stream
137159

138160
Newline delimited json is sent as packets with each line being a root element.

parse_json_amd64.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,10 @@ func (pj *internalParsedJson) initialize(size int) {
4242
pj.Strings = make([]byte, 0, stringsSize)
4343
}
4444
pj.Strings = pj.Strings[:0]
45-
if cap(pj.containing_scope_offset) < maxdepth {
46-
pj.containing_scope_offset = make([]uint64, 0, maxdepth)
45+
if cap(pj.containingScopeOffset) < maxdepth {
46+
pj.containingScopeOffset = make([]uint64, 0, maxdepth)
4747
}
48-
pj.containing_scope_offset = pj.containing_scope_offset[:0]
48+
pj.containingScopeOffset = pj.containingScopeOffset[:0]
4949
}
5050

5151
func (pj *internalParsedJson) parseMessage(msg []byte) error {
@@ -75,8 +75,8 @@ func (pj *internalParsedJson) parseMessageInternal(msg []byte, ndjson bool) (err
7575
// Make the capacity of the channel smaller than the number of slots.
7676
// This way the sender will automatically block until the consumer
7777
// has finished the slot it is working on.
78-
pj.index_chan = make(chan indexChan, indexSlots-2)
79-
pj.buffers_offset = ^uint64(0)
78+
pj.indexChans = make(chan indexChan, indexSlots-2)
79+
pj.buffersOffset = ^uint64(0)
8080

8181
var errStage1 error
8282
go func() {
@@ -89,7 +89,7 @@ func (pj *internalParsedJson) parseMessageInternal(msg []byte, ndjson bool) (err
8989
if !unifiedMachine(pj.Message, pj) {
9090
err = errors.New("Bad parsing while executing stage 2")
9191
// drain the channel until empty
92-
for range pj.index_chan {
92+
for range pj.indexChans {
9393
}
9494
}
9595
wg.Done()

parse_json_amd64_test.go

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ func BenchmarkNdjsonStage1(b *testing.B) {
9696

9797
for i := 0; i < b.N; i++ {
9898
// Create new channel (large enough so we won't block)
99-
pj.index_chan = make(chan indexChan, 128*10240)
99+
pj.indexChans = make(chan indexChan, 128*10240)
100100
findStructuralIndices([]byte(ndjson), &pj)
101101
}
102102
}
@@ -210,24 +210,30 @@ func TestParseNumber(t *testing.T) {
210210
expectedD float64
211211
expectedI int64
212212
expectedU uint64
213+
flags FloatFlags
213214
}{
214-
{"1", TagInteger, 0.0, 1, 0},
215-
{"-1", TagInteger, 0.0, -1, 0},
216-
{"10000000000000000000", TagUint, 0.0, 0, 10000000000000000000},
217-
{"10000000000000000001", TagUint, 0.0, 0, 10000000000000000001},
218-
{"-10000000000000000000", TagFloat, -10000000000000000000, 0, 0},
219-
{"1.0", TagFloat, 1.0, 0, 0},
220-
{"1234567890", TagInteger, 0.0, 1234567890, 0},
221-
{"9876.543210", TagFloat, 9876.543210, 0, 0},
222-
{"0.123456789e-12", TagFloat, 1.23456789e-13, 0, 0},
223-
{"1.234567890E+34", TagFloat, 1.234567890e+34, 0, 0},
224-
{"23456789012E66", TagFloat, 23456789012e66, 0, 0},
225-
{"-9876.543210", TagFloat, -9876.543210, 0, 0},
226-
{"-65.619720000000029", TagFloat, -65.61972000000003, 0, 0},
215+
{input: "1", wantTag: TagInteger, expectedI: 1},
216+
{input: "-1", wantTag: TagInteger, expectedI: -1},
217+
{input: "10000000000000000000", wantTag: TagUint, expectedU: 10000000000000000000},
218+
{input: "10000000000000000001", wantTag: TagUint, expectedU: 10000000000000000001},
219+
// math.MinInt64 - 1
220+
{input: "-9223372036854775809", wantTag: TagFloat, expectedD: -9.223372036854776e+18, flags: FloatOverflowedInteger.Flags()},
221+
{input: "-10000000000000000000", wantTag: TagFloat, expectedD: -10000000000000000000, flags: FloatOverflowedInteger.Flags()},
222+
{input: "100000000000000000000", wantTag: TagFloat, expectedD: 100000000000000000000, flags: FloatOverflowedInteger.Flags()},
223+
// math.MaxUint64 +1
224+
{input: "18446744073709551616", wantTag: TagFloat, expectedD: 1.8446744073709552e+19, flags: FloatOverflowedInteger.Flags()},
225+
{input: "1.0", wantTag: TagFloat, expectedD: 1.0},
226+
{input: "1234567890", wantTag: TagInteger, expectedI: 1234567890},
227+
{input: "9876.543210", wantTag: TagFloat, expectedD: 9876.543210},
228+
{input: "0.123456789e-12", wantTag: TagFloat, expectedD: 1.23456789e-13},
229+
{input: "1.234567890E+34", wantTag: TagFloat, expectedD: 1.234567890e+34},
230+
{input: "23456789012E66", wantTag: TagFloat, expectedD: 23456789012e66},
231+
{input: "-9876.543210", wantTag: TagFloat, expectedD: -9876.543210},
232+
{input: "-65.619720000000029", wantTag: TagFloat, expectedD: -65.61972000000003},
227233
}
228234

229235
for _, tc := range testCases {
230-
tag, val := parseNumber([]byte(fmt.Sprintf(`%s:`, tc.input)))
236+
tag, val, flags := parseNumber([]byte(fmt.Sprintf(`%s:`, tc.input)))
231237
if tag != tc.wantTag {
232238
t.Errorf("TestParseNumber: got: %v want: %v", tag, tc.wantTag)
233239
}
@@ -246,6 +252,9 @@ func TestParseNumber(t *testing.T) {
246252
t.Errorf("TestParseNumber: got: %d want: %d", val, tc.expectedU)
247253
}
248254
}
255+
if flags != uint64(tc.flags) {
256+
t.Errorf("TestParseNumber flags; got: %d want: %d", flags, tc.flags)
257+
}
249258
}
250259
}
251260

@@ -295,7 +304,7 @@ func TestParseInt64(t *testing.T) {
295304
test := &parseInt64Tests[i]
296305
t.Run(test.in, func(t *testing.T) {
297306

298-
tag, val := parseNumber([]byte(fmt.Sprintf(`%s:`, test.in)))
307+
tag, val, _ := parseNumber([]byte(fmt.Sprintf(`%s:`, test.in)))
299308
if tag != test.tag {
300309
// Ignore intentionally bad syntactical errors
301310
t.Errorf("TestParseInt64: got: %v want: %v", tag, test.tag)
@@ -478,7 +487,7 @@ func TestParseFloat64(t *testing.T) {
478487
for i := 0; i < len(atoftests); i++ {
479488
test := &atoftests[i]
480489
t.Run(test.in, func(t *testing.T) {
481-
tag, val := parseNumber([]byte(fmt.Sprintf(`%s:`, test.in)))
490+
tag, val, _ := parseNumber([]byte(fmt.Sprintf(`%s:`, test.in)))
482491
switch tag {
483492
case TagEnd:
484493
if test.err == nil {

parse_number_amd64.go

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
package simdjson
2222

2323
import (
24+
"errors"
2425
"math"
2526
"strconv"
2627
)
@@ -63,29 +64,29 @@ var isNumberRune = [256]uint8{
6364
// parseNumber will parse the number starting in the buffer.
6465
// Any non-number characters at the end will be ignored.
6566
// Returns TagEnd if no valid value could be found.
66-
func parseNumber(buf []byte) (tag Tag, val uint64) {
67+
func parseNumber(buf []byte) (tag Tag, val, flags uint64) {
6768
pos := 0
6869
found := uint8(0)
6970
for i, v := range buf {
7071
t := isNumberRune[v]
7172
if t == 0 {
7273
//fmt.Println("aborting on", string(v), "in", string(buf[:i]))
73-
return TagEnd, 0
74+
return TagEnd, 0, 0
7475
}
7576
if t == isEOVFlag {
7677
break
7778
}
7879
if t&isMustHaveDigitNext > 0 {
7980
// A period and minus must be followed by a digit
8081
if len(buf) < i+2 || isNumberRune[buf[i+1]]&isDigitFlag == 0 {
81-
return TagEnd, 0
82+
return TagEnd, 0, 0
8283
}
8384
}
8485
found |= t
8586
pos = i + 1
8687
}
8788
if pos == 0 {
88-
return TagEnd, 0
89+
return TagEnd, 0, 0
8990
}
9091
const maxIntLen = 20
9192

@@ -94,33 +95,42 @@ func parseNumber(buf []byte) (tag Tag, val uint64) {
9495
if found&isMinusFlag == 0 {
9596
if pos > 1 && buf[0] == '0' {
9697
// Integers cannot have a leading zero.
97-
return TagEnd, 0
98+
return TagEnd, 0, 0
9899
}
99100
} else {
100101
if pos > 2 && buf[1] == '0' {
101102
// Integers cannot have a leading zero after minus.
102-
return TagEnd, 0
103+
return TagEnd, 0, 0
103104
}
104105
}
105106
i64, err := strconv.ParseInt(string(buf[:pos]), 10, 64)
106107
if err == nil {
107-
return TagInteger, uint64(i64)
108+
return TagInteger, uint64(i64), 0
108109
}
110+
if errors.Is(err, strconv.ErrRange) {
111+
flags |= uint64(FloatOverflowedInteger)
112+
}
113+
109114
if found&isMinusFlag == 0 {
110115
u64, err := strconv.ParseUint(string(buf[:pos]), 10, 64)
111116
if err == nil {
112-
return TagUint, u64
117+
return TagUint, u64, 0
118+
}
119+
if errors.Is(err, strconv.ErrRange) {
120+
flags |= uint64(FloatOverflowedInteger)
113121
}
114122
}
123+
} else if found&isFloatOnlyFlag == 0 {
124+
flags |= uint64(FloatOverflowedInteger)
115125
}
116126

117127
if pos > 1 && buf[0] == '0' && isNumberRune[buf[1]]&isFloatOnlyFlag == 0 {
118128
// Float can only have a leading 0 when followed by a period.
119-
return TagEnd, 0
129+
return TagEnd, 0, 0
120130
}
121131
f64, err := strconv.ParseFloat(string(buf[:pos]), 64)
122132
if err == nil {
123-
return TagFloat, math.Float64bits(f64)
133+
return TagFloat, math.Float64bits(f64), flags
124134
}
125-
return TagEnd, 0
135+
return TagEnd, 0, 0
126136
}

parse_number_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ func TestNumberIsValid(t *testing.T) {
3131
// From: https://stackoverflow.com/a/13340826
3232
var jsonNumberRegexp = regexp.MustCompile(`^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$`)
3333
isValidNumber := func(s string) bool {
34-
tag, _ := parseNumber([]byte(s))
34+
tag, _, _ := parseNumber([]byte(s))
3535
return tag != TagEnd
3636
}
3737
validTests := []string{

parsed_json.go

Lines changed: 65 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,32 @@ const STRINGBUFMASK = 0x7fffffffffffff
4242

4343
const maxdepth = 128
4444

45+
// FloatFlags is a set of FloatFlag values recorded when converting floats.
type FloatFlags uint64

// FloatFlag is a single flag recorded when parsing floats.
type FloatFlag uint64

const (
	// FloatOverflowedInteger is set when number in JSON was in integer notation,
	// but under/overflowed both int64 and uint64 and therefore was parsed as float.
	FloatOverflowedInteger FloatFlag = 1 << iota
)

// Contains reports whether every bit of flag is also set in f.
func (f FloatFlags) Contains(flag FloatFlag) bool {
	want := FloatFlags(flag)
	return f&want == want
}

// Flags converts f to a FloatFlags set, OR-ing in any additional flags
// passed in more.
func (f FloatFlag) Flags(more ...FloatFlag) FloatFlags {
	merged := FloatFlags(f)
	for idx := range more {
		merged |= FloatFlags(more[idx])
	}
	return merged
}
70+
4571
type ParsedJson struct {
4672
Message []byte
4773
Tape []uint64
@@ -63,13 +89,13 @@ type indexChan struct {
6389

6490
type internalParsedJson struct {
6591
ParsedJson
66-
containing_scope_offset []uint64
67-
isvalid bool
68-
index_chan chan indexChan
69-
indexesChan indexChan
70-
buffers [indexSlots][indexSize]uint32
71-
buffers_offset uint64
72-
ndjson uint64
92+
containingScopeOffset []uint64
93+
isvalid bool
94+
indexChans chan indexChan
95+
indexesChan indexChan
96+
buffers [indexSlots][indexSize]uint32
97+
buffersOffset uint64
98+
ndjson uint64
7399
}
74100

75101
// Iter returns a new Iter.
@@ -479,6 +505,34 @@ func (i *Iter) Float() (float64, error) {
479505
}
480506
}
481507

508+
// FloatFlags returns the float value of the next element.
509+
// This will include flags from parsing.
510+
// Integers are automatically converted to float.
511+
func (i *Iter) FloatFlags() (float64, FloatFlags, error) {
512+
switch i.t {
513+
case TagFloat:
514+
if i.off >= len(i.tape.Tape) {
515+
return 0, 0, errors.New("corrupt input: expected float, but no more values on tape")
516+
}
517+
v := math.Float64frombits(i.tape.Tape[i.off])
518+
return v, 0, nil
519+
case TagInteger:
520+
if i.off >= len(i.tape.Tape) {
521+
return 0, 0, errors.New("corrupt input: expected integer, but no more values on tape")
522+
}
523+
v := int64(i.tape.Tape[i.off])
524+
return float64(v), 0, nil
525+
case TagUint:
526+
if i.off >= len(i.tape.Tape) {
527+
return 0, 0, errors.New("corrupt input: expected integer, but no more values on tape")
528+
}
529+
v := i.tape.Tape[i.off]
530+
return float64(v), FloatFlags(i.cur), nil
531+
default:
532+
return 0, 0, fmt.Errorf("unable to convert type %v to float", i.t)
533+
}
534+
}
535+
482536
// Int returns the integer value of the next element.
483537
// Integers and floats within range are automatically converted.
484538
func (i *Iter) Int() (int64, error) {
@@ -771,6 +825,10 @@ func (pj *ParsedJson) writeTapeTagVal(tag Tag, val uint64) {
771825
pj.Tape = append(pj.Tape, uint64(tag)<<56, val)
772826
}
773827

828+
func (pj *ParsedJson) writeTapeTagValFlags(tag Tag, val, flags uint64) {
829+
pj.Tape = append(pj.Tape, uint64(tag)<<56|flags, val)
830+
}
831+
774832
func (pj *ParsedJson) write_tape_s64(val int64) {
775833
pj.writeTapeTagVal(TagInteger, uint64(val))
776834
}

0 commit comments

Comments
 (0)