diff --git a/src/json/JsonDecoder.ts b/src/json/JsonDecoder.ts index 1b990b4a..a75f77bc 100644 --- a/src/json/JsonDecoder.ts +++ b/src/json/JsonDecoder.ts @@ -107,7 +107,7 @@ const isUndefined = (u8: Uint8Array, x: number) => const fromCharCode = String.fromCharCode; -const readShortUtf8StrAndUnescape = (reader: Reader): string => { +export const readKey = (reader: Reader): string => { const buf = reader.uint8; const len = buf.length; const points: number[] = []; @@ -202,10 +202,8 @@ export class JsonDecoder implements BinaryJsonDecoder { const uint8 = reader.uint8; const char = uint8[x]; switch (char) { - case 34: { - // " - if (uint8[x + 1] === 0x64) { - // d + case 34 /* " */: { + if (uint8[x + 1] === 0x64 /* d */) { const bin = this.tryReadBin(); if (bin) return bin; if (isUndefined(uint8, x + 2)) { @@ -215,18 +213,18 @@ export class JsonDecoder implements BinaryJsonDecoder { } return this.readStr(); } - case 91: // [ + case 91 /* [ */: return this.readArr(); - case 102: // f + case 102 /* f */: return this.readFalse(); - case 110: // n + case 110 /* n */: return this.readNull(); - case 116: // t + case 116 /* t */: return this.readTrue(); - case 123: // { + case 123 /* { */: return this.readObj(); default: - if ((char >= 48 && char <= 57) || char === 45) return this.readNum(); + if ((char >= 48 /* 0 */ && char <= 57) /* 9 */ || char === 45 /* - */) return this.readNum(); throw new Error('Invalid JSON'); } } @@ -239,10 +237,10 @@ export class JsonDecoder implements BinaryJsonDecoder { while (true) { char = uint8[x]; switch (char) { - case 32: // space - case 9: // tab - case 10: // line feed - case 13: // carriage return + case 32 /* */: + case 9 /* */: + case 10 /* */: + case 13 /* */: x++; continue; default: @@ -253,27 +251,27 @@ export class JsonDecoder implements BinaryJsonDecoder { } public readNull(): null { - if (this.reader.u32() !== 0x6e756c6c) throw new Error('Invalid JSON'); + if (this.reader.u32() !== 0x6e756c6c /* null */) throw new Error('Invalid JSON'); return null; } public readTrue(): true { - if (this.reader.u32() !== 0x74727565) throw new Error('Invalid JSON'); + if (this.reader.u32() !== 0x74727565 /* true */) throw new Error('Invalid JSON'); return true; } public readFalse(): false { const reader = this.reader; - if (reader.u8() !== 0x66 || reader.u32() !== 0x616c7365) throw new Error('Invalid JSON'); + if (reader.u8() !== 0x66 /* f */ || reader.u32() !== 0x616c7365 /* alse */) throw new Error('Invalid JSON'); return false; } public readBool(): unknown { const reader = this.reader; switch (reader.uint8[reader.x]) { - case 102: // f + case 102 /* f */: return this.readFalse(); - case 116: // t + case 116 /* t */: return this.readTrue(); default: throw new Error('Invalid JSON'); @@ -642,42 +640,44 @@ export class JsonDecoder implements BinaryJsonDecoder { public readArr(): unknown[] { const reader = this.reader; - if (reader.u8() !== 0x5b) throw new Error('Invalid JSON'); + if (reader.u8() !== 0x5b /* [ */) throw new Error('Invalid JSON'); const arr: unknown[] = []; const uint8 = reader.uint8; + let first = true; while (true) { this.skipWhitespace(); const char = uint8[reader.x]; - if (char === 0x5d) return reader.x++, arr; // ] - if (char === 0x2c) { - reader.x++; - continue; - } // , + if (char === 0x5d /* ] */) return reader.x++, arr; + if (char === 0x2c /* , */) reader.x++; + else if (!first) throw new Error('Invalid JSON'); + this.skipWhitespace(); arr.push(this.readAny()); + first = false; } } public readObj(): PackValue | Record | unknown { const reader = this.reader; - if (reader.u8() !== 0x7b) throw new Error('Invalid JSON'); + if (reader.u8() !== 0x7b /* { */) throw new Error('Invalid JSON'); const obj: Record = {}; const uint8 = reader.uint8; + let first = true; while (true) { this.skipWhitespace(); let char = uint8[reader.x]; - if (char === 0x7d) return reader.x++, obj; // } - if (char === 0x2c) { - reader.x++; - continue; - } // , + if (char === 0x7d /* } */) return reader.x++, obj; + if (char === 0x2c /* , */) reader.x++; + else if (!first) throw new Error('Invalid JSON'); + this.skipWhitespace(); char = uint8[reader.x++]; - if (char !== 0x22) throw new Error('Invalid JSON'); - const key = readShortUtf8StrAndUnescape(reader); + if (char !== 0x22 /* " */) throw new Error('Invalid JSON'); + const key = readKey(reader); if (key === '__proto__') throw new Error('Invalid JSON'); this.skipWhitespace(); - if (reader.u8() !== 0x3a) throw new Error('Invalid JSON'); + if (reader.u8() !== 0x3a /* : */) throw new Error('Invalid JSON'); this.skipWhitespace(); obj[key] = this.readAny(); + first = false; } } } diff --git a/src/json/JsonDecoderPartial.ts b/src/json/JsonDecoderPartial.ts new file mode 100644 index 00000000..f736bf68 --- /dev/null +++ b/src/json/JsonDecoderPartial.ts @@ -0,0 +1,103 @@ +import {JsonDecoder, readKey} from './JsonDecoder'; +import type {PackValue} from '../types'; + +export class DecodeFinishError extends Error { + constructor(public readonly value: unknown) { + super('DECODE_FINISH'); + } +} + +/** + * This class parses JSON which is mostly correct but not necessarily complete + * or with missing parts. It can be used to parse JSON that is being streamed + * in chunks or JSON output of an LLM model. + * + * If the end of a nested JSON value (array, object) is missing, this parser + * will return the initial correct part for that value, which it was able to + * parse, until the point where the JSON is no longer valid. + * + * Examples: + * + * ```js + * // Missing closing brace + * decoder.readAny('[1, 2, 3'); // [1, 2, 3] + * + * // Trailing comma and missing closing brace + * decoder.readAny('[1, 2, '); // [1, 2] + * + * // Corrupt second element and missing closing brace + * decoder.readAny('{"foo": 1, "bar":'); // {"foo": 1} + * ``` + */ +export class JsonDecoderPartial extends JsonDecoder { + public readAny(): unknown { + try { + return super.readAny(); + } catch (error) { + if (error instanceof DecodeFinishError) return error.value; + throw error; + } + } + + public readArr(): unknown[] { + const reader = this.reader; + if (reader.u8() !== 0x5b /* [ */) throw new Error('Invalid JSON'); + const arr: unknown[] = []; + const uint8 = reader.uint8; + let first = true; + while (true) { + this.skipWhitespace(); + const char = uint8[reader.x]; + if (char === 0x5d /* ] */) return reader.x++, arr; + if (char === 0x2c /* , */) reader.x++; + else if (!first) return arr; + this.skipWhitespace(); + try { + arr.push(this.readAny()); + } catch (error) { + if (error instanceof DecodeFinishError) return arr.push(error.value), arr; + if (error instanceof Error && error.message === 'Invalid JSON') throw new DecodeFinishError(arr); + throw error; + } + first = false; + } + } + + public readObj(): PackValue | Record | unknown { + const reader = this.reader; + if (reader.u8() !== 0x7b /* { */) throw new Error('Invalid JSON'); + const obj: Record = {}; + const uint8 = reader.uint8; + while (true) { + this.skipWhitespace(); + let char = uint8[reader.x]; + if (char === 0x7d /* } */) return reader.x++, obj; + if (char === 0x2c /* , */) { + reader.x++; + continue; + } + try { + char = uint8[reader.x++]; + if (char !== 0x22 /* " */) throw new Error('Invalid JSON'); + const key = readKey(reader); + if (key === '__proto__') throw new Error('Invalid JSON'); + this.skipWhitespace(); + if (reader.u8() !== 0x3a /* : */) throw new Error('Invalid JSON'); + this.skipWhitespace(); + try { + obj[key] = this.readAny(); + } catch (error) { + if (error instanceof DecodeFinishError) { + obj[key] = error.value; + return obj; + } + throw error; + } + } catch (error) { + if (error instanceof DecodeFinishError) return obj; + if (error instanceof Error && error.message === 'Invalid JSON') throw new DecodeFinishError(obj); + throw error; + } + } + } +} diff --git a/src/json/__tests__/JsonDecoder.spec.ts b/src/json/__tests__/JsonDecoder.spec.ts index 29307eb2..d53a429d 100644 --- a/src/json/__tests__/JsonDecoder.spec.ts +++ b/src/json/__tests__/JsonDecoder.spec.ts @@ -322,6 +322,19 @@ describe('array', () => { expect(value).toEqual([1, 2.2, -3.3]); }); + test('simple array', () => { + const data = Buffer.from('[1, 2, 3]', 'utf-8'); + decoder.reader.reset(data); + const value = decoder.readAny(); + expect(value).toEqual([1, 2, 3]); + }); + + test('missing comma', () => { + const data = Buffer.from('[1, 2 3]', 'utf-8'); + decoder.reader.reset(data); + expect(() => decoder.readAny()).toThrow(new Error('Invalid JSON')); + }); + test('nested arrays', () => { const data = Buffer.from(' \n \r \t [[],\n[ 4,\t5] , [null]] \n \r \t ', 'utf-8'); decoder.reader.reset(data); @@ -366,6 +379,19 @@ describe('object', () => { expect(value).toEqual({foo: 'bar'}); }); + test('simple object', () => { + const data = Buffer.from('{"foo": 1, "bar": 2}', 'utf-8'); + decoder.reader.reset(data); + const value = decoder.readAny(); + expect(value).toEqual({foo: 1, bar: 2}); + }); + + test('missing comma', () => { + const data = Buffer.from('{"foo": 1 "bar": 2}', 'utf-8'); + decoder.reader.reset(data); + expect(() => decoder.readAny()).toThrow(new Error('Invalid JSON')); + }); + test('nested object', () => { const data = Buffer.from('{"":{}}', 'utf-8'); decoder.reader.reset(data); diff --git a/src/json/__tests__/JsonDecoderPartial.automated.spec.ts b/src/json/__tests__/JsonDecoderPartial.automated.spec.ts new file mode 100644 index 00000000..17479561 --- /dev/null +++ b/src/json/__tests__/JsonDecoderPartial.automated.spec.ts @@ -0,0 +1,39 @@ +import {Writer} from '@jsonjoy.com/util/lib/buffers/Writer'; +import {JsonValue} from '../../types'; +import {JsonEncoder} from '../JsonEncoder'; +import {JsonEncoderStable} from '../JsonEncoderStable'; +import {JsonDecoderPartial} from '../JsonDecoderPartial'; +import {documents} from '../../__tests__/json-documents'; +import {binaryDocuments} from '../../__tests__/binary-documents'; + +const writer = new Writer(8); +const encoder = new JsonEncoder(writer); +const encoderStable = new JsonEncoderStable(writer); +const decoder = new JsonDecoderPartial(); + +const assertEncoder = (value: JsonValue) => { + const encoded = encoder.encode(value); + const encoded2 = encoderStable.encode(value); + // const json = Buffer.from(encoded).toString('utf-8'); + // console.log('json', json); + const decoded = decoder.decode(encoded); + const decoded2 = decoder.decode(encoded2); + expect(decoded).toEqual(value); + expect(decoded2).toEqual(value); +}; + +describe('Sample JSON documents', () => { + for (const t of documents) { + (t.only ? test.only : test)(t.name, () => { + assertEncoder(t.json as any); + }); + } +}); + +describe('Sample binary documents', () => { + for (const t of binaryDocuments) { + (t.only ? test.only : test)(t.name, () => { + assertEncoder(t.json as any); + }); + } +}); diff --git a/src/json/__tests__/JsonDecoderPartial.spec.ts b/src/json/__tests__/JsonDecoderPartial.spec.ts new file mode 100644 index 00000000..82d74845 --- /dev/null +++ b/src/json/__tests__/JsonDecoderPartial.spec.ts @@ -0,0 +1,145 @@ +import {JsonDecoderPartial} from '../JsonDecoderPartial'; + +const decoder = new JsonDecoderPartial(); +const parse = (text: string) => { + const data = Buffer.from(text, 'utf-8'); + decoder.reader.reset(data); + const value = decoder.readAny(); + return value; +}; + +describe('array', () => { + test('can parse valid array', () => { + const value = parse('[1, 2, 3]'); + expect(value).toEqual([1, 2, 3]); + }); + + test('can parse array with missing closing brace', () => { + const value = parse('[1, 2, 3 '); + expect(value).toEqual([1, 2, 3]); + }); + + test('can parse array with missing closing brace - 2', () => { + const value = parse('[1, 2, 3'); + expect(value).toEqual([1, 2, 3]); + }); + + test('can parse array with trailing comma', () => { + const value = parse('[1, 2, '); + expect(value).toEqual([1, 2]); + }); + + test('can parse array with trailing comma - 2', () => { + const value = parse('[1, 2,'); + expect(value).toEqual([1, 2]); + }); + + test('can parse array with two trailing commas', () => { + const value = parse('[true, "asdf",,'); + expect(value).toEqual([true, 'asdf']); + }); + + test.skip('can parse array with double commas', () => { + const value = parse('[true, "asdf",, 4]'); + expect(value).toEqual([true, 'asdf', 4]); + }); + + test.skip('can parse array with triple commas', () => { + const value = parse('[true, "asdf",, , 4]'); + expect(value).toEqual([true, 'asdf', 4]); + }); + + test('can parse nested arrays', () => { + const value = parse('[[true, false, null]]'); + expect(value).toEqual([[true, false, null]]); + }); + + test('can parse nested arrays with missing brace', () => { + const value = parse('[[true, false, null]'); + expect(value).toEqual([[true, false, null]]); + }); + + test('can parse nested arrays with two missing braces', () => { + const value = parse('[[true, false, null'); + expect(value).toEqual([[true, false, null]]); + }); + + test('can parse nested arrays with two missing element', () => { + const value = parse('[[true, false,'); + expect(value).toEqual([[true, false]]); + }); +}); + +describe('object', () => { + test('can parse valid object', () => { + const value = parse('{"foo": 1, "bar": 2}'); + expect(value).toEqual({foo: 1, bar: 2}); + }); + + test('can parse object with missing brace (trailing space)', () => { + const value = parse('{"foo": 1, "bar": 2 '); + expect(value).toEqual({foo: 1, bar: 2}); + }); + + test('can parse object with missing brace', () => { + const value = parse('{"foo": 1, "bar": 2'); + expect(value).toEqual({foo: 1, bar: 2}); + }); + + test('can parse object with missing field value', () => { + const value1 = parse('{"foo": 1, "bar": '); + const value2 = parse('{"foo": 1, "bar":'); + const value3 = parse('{"foo": 1, "bar"'); + const value4 = parse('{"foo": 1, "bar'); + const value5 = parse('{"foo": 1, "b'); + const value6 = parse('{"foo": 1, "'); + const value7 = parse('{"foo": 1, '); + const value8 = parse('{"foo": 1,'); + const value9 = parse('{"foo": 1'); + expect(value1).toEqual({foo: 1}); + expect(value2).toEqual({foo: 1}); + expect(value3).toEqual({foo: 1}); + expect(value4).toEqual({foo: 1}); + expect(value5).toEqual({foo: 1}); + expect(value6).toEqual({foo: 1}); + expect(value7).toEqual({foo: 1}); + expect(value8).toEqual({foo: 1}); + expect(value9).toEqual({foo: 1}); + }); + + test('can parse nested object', () => { + const value1 = parse('{"a": {"foo": 1, "bar": 2}}'); + const value2 = parse('{"a": {"foo": 1, "bar": 2} }'); + const value3 = parse('{"a": {"foo": 1, "bar": 2} '); + const value4 = parse('{"a": {"foo": 1, "bar": 2}'); + const value5 = parse('{"a": {"foo": 1, "bar": 2 '); + const value6 = parse('{"a": {"foo": 1, "bar": 2'); + expect(value1).toEqual({a: {foo: 1, bar: 2}}); + expect(value2).toEqual({a: {foo: 1, bar: 2}}); + expect(value3).toEqual({a: {foo: 1, bar: 2}}); + expect(value4).toEqual({a: {foo: 1, bar: 2}}); + expect(value5).toEqual({a: {foo: 1, bar: 2}}); + expect(value6).toEqual({a: {foo: 1, bar: 2}}); + }); +}); + +test('simple nested object', () => { + const value = parse('{ "name": { "first": "ind", "last": "go'); + expect(value).toEqual({name: {first: 'ind'}}); +}); + +test('example output from LLM', () => { + const value = parse(` +{ + "name": "Alice", + "age": 25, + "hobbies": ["eat", "drink" + "is_student": false +Some extra text after the JSON with missing closing brace.`); + expect(value).toEqual({ + name: 'Alice', + age: 25, + hobbies: ['eat', 'drink'], + is_student: false, + }); +});