Skip to content

Commit 0ac4c02

Browse files
committed
fix JSONEntry Eq and add documentation
1 parent b494199 commit 0ac4c02

File tree

7 files changed

+93
-41
lines changed

7 files changed

+93
-41
lines changed

README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,27 @@ e.g. to take the existing 1kb JSON paramters, but also support 124-byte keys, us
122122

123123
If you are deriving a key to look up in-circuit and you do not know the maximum length of the key, all query methods have a version with a `_var` suffix (e.g. `JSON::get_string_var`), which accepts the key as a `BoundedVec`
124124

125+
# Architecture
126+
### Overview
127+
The JSON parser uses 5 steps to efficiently parse and index JSON data:
128+
129+
1. **build_transcript** - Convert raw bytes to a transcript of tokens using the state machine defined by JSON_CAPTURE_TABLE. Categorize each character as string, number, ...
130+
2. **capture_missing_tokens & keyswap** - Fix missing tokens and correctly identify keys. Complete a second scan of the tokens, check for missing tokens (e.g. commas after literals), and relabel strings that are keys to an object as keys.
131+
3. **compute_json_packed** - Pack bytes into Field elements for efficient substring extraction
132+
4. **create_json_entries** - Create structured JSON entries with parent-child relationships
133+
5. **compute_keyhash_and_sort_json_entries** - Sort entries by key hash for efficient lookups
134+
135+
### Key Design Patterns
136+
- **Using table lookups**: Uses many lookup tables to avoid branching logic to reduce circuit size
137+
- **Packing data to Field elements**: Combines multiple fields that encode different features into a single Field element for comparison
138+
139+
### Table Generation
140+
The parser uses several lookup tables generated from `src/_table_generation/`:
141+
- `TOKEN_FLAGS_TABLE`: State transitions for token processing
142+
- `JSON_CAPTURE_TABLE`: Character-by-character parsing rules
143+
- `TOKEN_VALIDATION_TABLE`: JSON grammar validation
144+
145+
125146
# Acknowledgements
126147

127148
Many thanks to the authors of the OG noir json library https://github.com/RontoSOFT/noir-json-parser

src/_table_generation/make_tables.nr

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -384,7 +384,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] {
384384
let mut flags: [TokenFlags; NUM_TOKENS * 2] = [TokenFlags::default(); NUM_TOKENS * 2];
385385

386386
let mut no_token_flags: TokenFlags = TokenFlags {
387-
create_json_entry: 0,
387+
create_json_entry: false,
388388
is_end_of_object_or_array: 0,
389389
is_start_of_object_or_array: 0,
390390
new_context: OBJECT_LAYER as Field,
@@ -393,7 +393,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] {
393393
preserve_num_entries: 1,
394394
};
395395
let mut key_token_flags: TokenFlags = TokenFlags {
396-
create_json_entry: 0,
396+
create_json_entry: false,
397397
is_end_of_object_or_array: 0,
398398
is_start_of_object_or_array: 0,
399399
new_context: OBJECT_LAYER as Field,
@@ -402,7 +402,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] {
402402
preserve_num_entries: 1,
403403
};
404404
let begin_object_flags = TokenFlags {
405-
create_json_entry: 0,
405+
create_json_entry: false,
406406
is_end_of_object_or_array: 0,
407407
is_start_of_object_or_array: 1,
408408
new_context: OBJECT_LAYER as Field,
@@ -412,7 +412,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] {
412412
};
413413

414414
let begin_array_flags = TokenFlags {
415-
create_json_entry: 0,
415+
create_json_entry: false,
416416
is_end_of_object_or_array: 0,
417417
is_start_of_object_or_array: 1,
418418
new_context: ARRAY_LAYER as Field,
@@ -422,7 +422,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] {
422422
};
423423

424424
let mut end_object_flags = TokenFlags {
425-
create_json_entry: 1,
425+
create_json_entry: true,
426426
is_end_of_object_or_array: 1,
427427
is_start_of_object_or_array: 0,
428428
new_context: 0,
@@ -432,7 +432,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] {
432432
};
433433

434434
let mut end_array_flags = TokenFlags {
435-
create_json_entry: 1,
435+
create_json_entry: true,
436436
is_end_of_object_or_array: 1,
437437
is_start_of_object_or_array: 0,
438438
new_context: 0,
@@ -442,7 +442,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] {
442442
};
443443

444444
let mut string_flags = TokenFlags {
445-
create_json_entry: 1,
445+
create_json_entry: true,
446446
is_end_of_object_or_array: 0,
447447
is_start_of_object_or_array: 0,
448448
new_context: OBJECT_LAYER as Field,
@@ -452,7 +452,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] {
452452
};
453453

454454
let mut numeric_flags = TokenFlags {
455-
create_json_entry: 1,
455+
create_json_entry: true,
456456
is_end_of_object_or_array: 0,
457457
is_start_of_object_or_array: 0,
458458
new_context: OBJECT_LAYER as Field,
@@ -462,7 +462,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] {
462462
};
463463

464464
let mut literal_flags = TokenFlags {
465-
create_json_entry: 1,
465+
create_json_entry: true,
466466
is_end_of_object_or_array: 0,
467467
is_start_of_object_or_array: 0,
468468
new_context: OBJECT_LAYER as Field,
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Table Generation Documentation
2+
3+
## Overview
4+
The JSON parser uses lookup tables to avoid branching logic and reduce gate count. These tables are generated from `src/_table_generation/make_tables.nr`.
5+
6+
## Generation Process
7+
Tables are generated by simulating all possible input combinations from basic hardcoded tables and recording the expected outputs.
8+
9+
## TOKEN_FLAGS_TABLE
10+
Maps (token, context) pairs to parsing flags:
11+
- `create_json_entry`: Whether to create a JSON entry for this token, set to true if token is literal/number/string(not key)/end of array/object
12+
- `is_end_of_object_or_array`: Whether token ends an object/array
13+
- `is_start_of_object_or_array`: Whether token starts an object/array
14+
- `new_context`: What context to switch to, object is 0, array is 1
15+
- `is_key_token`: Whether token is a key
16+
- `is_value_token`: Whether token is a value, set to True for string_token, numeric_token, and literal_token
17+
- `preserve_num_entries`: boolean flag that controls whether the current token should preserve the existing count of entries at the current depth or reset/increment it. 1 for tokens like NO_TOKEN, KEY_TOKEN, STRING_TOKEN, NUMERIC_TOKEN, LITERAL_TOKEN
18+
0 for tokens like OBJECT_START_TOKEN, ARRAY_START_TOKEN, OBJECT_END_TOKEN, ARRAY_END_TOKEN
19+
20+
## JSON_CAPTURE_TABLE
21+
Maps (escape_flag, scan_mode, ascii) to scanning actions:
22+
- `scan_token`: Next capture mode, can be grammar_capture([,{,comma,},],:)/string_capture/literal_capture/numeric_capture
23+
- `push_transcript`: Whether to add the token to the transcript. In grammar mode: true for all structural elements [,{,comma,},],:. In string_capture, true for ", which signals string end. In numeric/literal_capture, true for space, \t, \n, \r, ", and comma. Note the first scan will not pick up numerics or literals because we don't know when they end, so we need to rely on the capture_missing_tokens function.
24+
- `increase_length`: Whether to extend current token, always false for grammar_capture, true for 0-9 in numeric capture, all characters except for " in string_capture, all letters in true, false, null in literal_capture
25+
- `is_potential_escape_sequence`: Whether the current character is a backslash (`\`), which may start an escape sequence

src/json.nr

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,7 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
7777
}
7878
}
7979

80-
unconstrained fn __check_entry_ptr_bounds(entry_ptr: Field, max: u32) {
81-
entry_ptr.assert_max_bit_size::<32>();
80+
unconstrained fn __check_entry_ptr_bounds(entry_ptr: u32, max: u32) {
8281
// n.b. even though this assert is in an unconstrained function, an out of bounds error will be triggered when writing into self.key_data[entry_ptr]
8382
assert(entry_ptr as u32 < max - 1, "create_json_entries: MaxNumValues limit exceeded!");
8483
}
@@ -244,15 +243,18 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
244243
* 7. Is the token one that we should skip over? `,` or `:`
245244
**/
246245
fn create_json_entries(&mut self) {
247-
let mut entry_ptr = 0;
246+
let mut entry_ptr: u32 = 0;
248247
let mut depth: Field = 1;
249248
let mut num_entries_at_current_depth: Field = 0;
250249
let mut next_identity_value: Field = 1;
251250
let mut current_identity_value: Field = 0;
251+
// context: 0 for object, 1 for array
252252
let mut context = OBJECT_LAYER;
253253

254+
//lower 2 bytes is the index, upper 2 bytes is the length
254255
let mut current_key_index_and_length: Field = 0;
255256

257+
//stack won't pop elements, but will push new elements by overwriting the top element
256258
let mut parent_context_stack: [Field; 32] = [0; 32];
257259
let mut tokens: [Field; MaxNumTokens] = [0; MaxNumTokens];
258260
// maybe 71.75 gates per iteration
@@ -270,16 +272,17 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
270272
is_end_of_object_or_array,
271273
is_start_of_object_or_array,
272274
new_context,
273-
is_key_token: update_key,
275+
is_key_token,
274276
is_value_token,
275277
preserve_num_entries,} = TokenFlags::from_field(
276278
TOKEN_FLAGS_TABLE[cast_num_to_u32(token) + context * NUM_TOKENS],
277279
);
278280

279281
// 2 gates
282+
// only update current_key_index_and_length if the token is a key token
280283
let diff = (index + length * 0x10000) - current_key_index_and_length;
281284
std::as_witness(diff);
282-
current_key_index_and_length = diff * update_key + current_key_index_and_length;
285+
current_key_index_and_length = diff * is_key_token + current_key_index_and_length;
283286
std::as_witness(current_key_index_and_length);
284287

285288
// 2 gates
@@ -294,12 +297,13 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
294297
);
295298
// subtotal 22.25
296299
// 1 gate
300+
// set new_entry to object_or_array_entry if the token is end of an object or array, or set new_entry to value_entry
297301
let depth_index: Field = (depth - 1);
298302
// 3.5 gates
299303
let previous_stack_entry_packed = parent_context_stack[cast_num_to_u32(depth_index)];
300304

301305
// 9.5 gates
302-
let previous_stack_entry =
306+
let previous_stack_entry: JSONContextStackEntry =
303307
JSONContextStackEntry::from_field(previous_stack_entry_packed);
304308

305309
let object_or_array_entry: JSONEntry = JSONEntry {
@@ -337,6 +341,10 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
337341

338342
// 3 gates
339343
// subtotal 24 + 22.25 = 46.25
344+
// if preserve_num_entries (i.e. not start or end of object or array) is 1, then current_identity_value does not change.
345+
// if is_start_of_object_or_array is 1, then current_identity_value is set to next_identity_value.
346+
// if is_end_of_object_or_array is 1, then current_identity_value is set to previous_stack_entry.current_identity.
347+
340348
let old = current_identity_value;
341349
current_identity_value = (next_identity_value * is_start_of_object_or_array);
342350
std::as_witness(current_identity_value);
@@ -347,6 +355,8 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
347355
std::as_witness(current_identity_value);
348356

349357
// 2 gates
358+
// If we see a value token (string/number/literal), we add 1 to count. If we see , or :, no change.
359+
// If preserve_num_entries is 0 (i.e. start or end of object or array) then we reset variable to 0.
350360
num_entries_at_current_depth =
351361
num_entries_at_current_depth * preserve_num_entries + is_value_token;
352362
std::as_witness(num_entries_at_current_depth);
@@ -356,11 +366,15 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
356366

357367
// 1 gate
358368
// if `is_end_of_object_or_array == 1`, `new_context = 0` so we can do something cheaper than a conditional select:
369+
// If is_end_of_object_or_array is 1, then new_context is 0, so set context = previous_stack_entry.context
370+
// If is_end_of_object_or_array is 0, then set context = new_context
359371
context = cast_num_to_u32(
360372
previous_stack_entry.context * is_end_of_object_or_array + new_context,
361373
);
362374
std::as_witness(context as Field);
363375
// 3 gates
376+
// If context is 0 (object context), then don't take the num_entries_at_current_depth term into account
377+
// because searching for a key only depends on the key name, not position, as opposed to array context where we need to look up by position/index.
364378
let common_term = current_identity_value
365379
+ context as Field * (num_entries_at_current_depth - 1) * 0x1000000000000;
366380
std::as_witness(common_term);
@@ -374,14 +388,14 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
374388
std::as_witness(new_key_data);
375389

376390
// 3.5 gates
377-
self.key_data[cast_num_to_u32(entry_ptr)] = new_key_data * create_json_entry;
391+
self.key_data[entry_ptr] = new_key_data * create_json_entry as Field;
378392

379393
// 3.5 gates
380394
parent_context_stack[cast_num_to_u32(depth)] = new_context_stack_entry;
381395

382396
// 4.5 gates
383-
self.json_entries_packed[cast_num_to_u32(entry_ptr)] =
384-
JSONEntryPacked { value: new_entry * create_json_entry };
397+
self.json_entries_packed[entry_ptr] =
398+
JSONEntryPacked { value: new_entry * create_json_entry as Field };
385399

386400
// 1 gate
387401
next_identity_value = next_identity_value + is_start_of_object_or_array;
@@ -393,8 +407,7 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
393407
// 1 gate
394408
// 2105 + 46.25
395409
// subtotal 66.75?
396-
entry_ptr += create_json_entry;
397-
std::as_witness(entry_ptr);
410+
entry_ptr += create_json_entry as u32;
398411
}
399412
self.validate_tokens(tokens);
400413
}

src/json_entry.nr

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ impl std::cmp::Eq for JSONEntry {
235235
let num_children = (self.num_children == other.num_children);
236236
let json_pointer = (self.json_pointer == other.json_pointer);
237237
let json_length = (self.json_length == other.json_length);
238-
array_ptr | entry | child | num_children | json_pointer | json_length
238+
array_ptr & entry & child & num_children & json_pointer & json_length
239239
}
240240
}
241241

src/keymap.nr

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
6262
let KeyIndexData { json_index, json_length, parent_id, array_index } =
6363
KeyIndexData::from_field(self.key_data[i]);
6464
let hash = hasher.get_keyhash(self.json_packed, json_index, json_length);
65+
//ensures hash:0-199 bits, array_index:200-215 bits, parent_id: 216-239 bits
6566
hashlist[i] = hash + array_index * two_pow_200 + parent_id * two_pow_216;
6667
}
6768

@@ -72,49 +73,41 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
7273

7374
let mut sorted_entries: [JSONEntryPacked; MaxNumValues] =
7475
[JSONEntryPacked::default(); MaxNumValues];
76+
7577
for i in 0..MaxNumValues {
7678
sorted_entries[sort_result.sort_indices[i]] = self.json_entries_packed[i];
7779
}
7880

79-
let mut ids: [Field; MaxNumValues] = [0; MaxNumValues];
8081
let mut parent_indices: [Field; MaxNumValues] = [0; MaxNumValues];
81-
let mut entry_types: [Field; MaxNumValues] = [0; MaxNumValues];
8282

83+
let mut identity_to_json_map: [Field; MaxNumValues] = [0; MaxNumValues];
8384
for i in 0..MaxNumValues {
8485
// 11.75 + 3.5 = 15.25 gates per iteration
8586
let (id, parent_index, entry_type) = JSONEntry::extract_entry_type_id_and_parent_index_from_field(
8687
sorted_entries[i].value,
8788
);
88-
ids[i] = id;
8989
parent_indices[i] = parent_index;
90-
entry_types[i] = entry_type;
91-
}
92-
93-
let mut identity_to_json_map: [Field; MaxNumValues] = [0; MaxNumValues];
94-
// 6.5 gates per iteration
95-
for i in 0..MaxNumValues {
96-
let id = ids[i];
97-
let entry_type = entry_types[i];
9890
// 2 gates
91+
// update is 1 for end of object/array, 0 for other
9992
let update = TOKEN_ENDS_OBJECT_OR_ARRAY[cast_num_to_u32(entry_type)];
10093
// NOTE THIS RELIES ON MaxNumValues ACTUALLY DESCRIBING NUMMaxNumValues + 1
94+
// index = id if update = 1, else MaxNumValues -1
10195
// 1 gate
10296
let index = (id - (MaxNumValues as Field - 1)) * update + (MaxNumValues as Field - 1);
10397
// 3.5 gates
10498
identity_to_json_map[cast_num_to_u32(index)] = i as Field;
10599
}
106-
107100
// 13.5 gates per iteration
108101
let mut parent_identity_pre = parent_indices[0];
109102
for i in 1..MaxNumValues {
110103
let parent_identity_post = parent_indices[i];
111104
// if the parent identity changes,
112105
// 3.5 gate
113-
// the list is sorted according to parent_ideneity,
106+
// the list is sorted according to parent_identity,
114107
// n.b. parent_identity_post - parent_identity_pre is not neccessarily 0 or 1 (can be larger)
115108
// due to empty objects and arrays increasing identity value without creating associated child json entries
116-
let new_parent = lt_field_16_bit(parent_identity_pre, parent_identity_post) as Field;
117109
// let new_parent = (parent_identity_post as u32 > parent_identity_pre as u32) as Field;
110+
let new_parent = lt_field_16_bit(parent_identity_pre, parent_identity_post) as Field;
118111
// 3.5 gates
119112
let index_of_parent = identity_to_json_map[cast_num_to_u32(parent_identity_post)];
120113
// 1 gate + 3.5 gates
@@ -127,11 +120,12 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
127120
// 1 gate
128121
let index = (index_of_parent * new_parent);
129122
// 3.5 gates
123+
//index is just 0 if new_parent is false, so sorted_entries[0] is useless info
130124
sorted_entries[cast_num_to_u32(index)] = JSONEntryPacked { value: updated };
131125

132126
parent_identity_pre = parent_identity_post;
133127
}
134-
sorted_entries[0] = JSONEntryPacked::default(); // TODO document why we want to always make 0 a dead entry
128+
sorted_entries[0] = JSONEntryPacked::default();
135129
self.unsorted_json_entries_packed = self.json_entries_packed;
136130
self.json_entries_packed = sorted_entries;
137131
self.key_hashes = sort_result.sorted;

src/token_flags.nr

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
pub(crate) struct TokenFlags {
2-
pub(crate) create_json_entry: Field,
2+
pub(crate) create_json_entry: bool,
33
pub(crate) is_end_of_object_or_array: Field,
44
pub(crate) is_start_of_object_or_array: Field,
55
pub(crate) new_context: Field,
@@ -12,7 +12,7 @@ impl TokenFlags {
1212

1313
unconstrained fn __from_field(f: Field) -> Self {
1414
let bytes: [u8; 7] = f.to_be_bytes();
15-
let create_json_entry = bytes[0] as Field;
15+
let create_json_entry = bytes[0] != 0;
1616
let is_end_of_object_or_array = bytes[1] as Field;
1717
let is_start_of_object_or_array = bytes[2] as Field;
1818
let new_context = bytes[3] as Field;
@@ -37,7 +37,6 @@ impl TokenFlags {
3737
let r = unsafe { TokenFlags::__from_field(f) };
3838

3939
// checks that the flags are binary
40-
assert(r.create_json_entry * r.create_json_entry == r.create_json_entry);
4140
assert(
4241
r.is_end_of_object_or_array * r.is_end_of_object_or_array
4342
== r.is_end_of_object_or_array,
@@ -64,12 +63,12 @@ impl TokenFlags {
6463
+ self.new_context * 0x1000000
6564
+ self.is_start_of_object_or_array * 0x100000000
6665
+ self.is_end_of_object_or_array * 0x10000000000
67-
+ self.create_json_entry * 0x1000000000000
66+
+ self.create_json_entry as Field * 0x1000000000000
6867
}
6968

7069
pub(crate) fn default() -> Self {
7170
TokenFlags {
72-
create_json_entry: 0,
71+
create_json_entry: false,
7372
is_end_of_object_or_array: 0,
7473
is_start_of_object_or_array: 0,
7574
new_context: 0,

0 commit comments

Comments
 (0)