If you are deriving a key to look up in-circuit and you do not know the maximum length of the key, all query methods have a version with a `_var` suffix (e.g. `JSON::get_string_var`), which accepts the key as a `BoundedVec`.
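
Below is a minimal usage sketch of a variable-length key lookup. Only `get_string_var` and the `BoundedVec` key come from the description above; the `JSON::parse` constructor, the buffer sizes, and the surrounding function are hypothetical placeholders for however the JSON object is constructed in your circuit.

```rust
// Sketch only: `JSON::parse` and the fixed sizes are hypothetical placeholders;
// the documented behaviour is that `_var` query methods accept the key as a BoundedVec.
fn lookup_with_runtime_key(json_bytes: [u8; 1024], key_bytes: [u8; 64], key_length: u32) {
    let json = JSON::parse(json_bytes); // hypothetical constructor

    // Build the key as a BoundedVec because only an upper bound on its length is known.
    let mut key: BoundedVec<u8, 64> = BoundedVec::new();
    for i in 0..64 {
        if i < key_length {
            key.push(key_bytes[i]);
        }
    }

    let _value = json.get_string_var(key); // the `_var` variant of `JSON::get_string`
}
```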

# Architecture

### Overview

The JSON parser uses 5 steps to parse and index JSON data efficiently:

1. **build_transcript** - Convert the raw bytes into a transcript of tokens using the state machine defined by `JSON_CAPTURE_TABLE`, categorizing each character as part of a string, number, literal, or grammar token.
2. **capture_missing_tokens & keyswap** - Fix missing tokens and correctly identify keys: a second scan of the transcript checks for missing tokens (e.g. commas after literals) and relabels strings that act as object keys as key tokens.
3. **compute_json_packed** - Pack the JSON bytes into Field elements for efficient substring extraction (see the sketch after this list).
4. **create_json_entries** - Create structured JSON entries with parent-child relationships.
5. **compute_keyhash_and_sort_json_entries** - Sort entries by key hash for efficient lookups.
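
To make step 3 concrete, here is a minimal sketch of the packing idea. It is not the library's actual `compute_json_packed`; the 31-bytes-per-Field chunk size and the fixed array sizes are assumptions chosen so that each chunk fits inside a BN254 Field element.

```rust
global NUM_BYTES: u32 = 62;       // example input size: two full chunks
global BYTES_PER_FIELD: u32 = 31; // 31 bytes fit comfortably in a BN254 Field element
global NUM_FIELDS: u32 = 2;       // NUM_BYTES / BYTES_PER_FIELD

// Illustrative packing: each 31-byte window of the input becomes one Field element,
// so substring comparisons can operate on a handful of Fields instead of raw bytes.
fn pack_bytes_into_fields(bytes: [u8; NUM_BYTES]) -> [Field; NUM_FIELDS] {
    let mut packed: [Field; NUM_FIELDS] = [0; NUM_FIELDS];
    for i in 0..NUM_BYTES {
        // Every new byte shifts the current chunk left by 8 bits (big-endian packing).
        packed[i / BYTES_PER_FIELD] = packed[i / BYTES_PER_FIELD] * 256 + (bytes[i] as Field);
    }
    packed
}
```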

### Key Design Patterns

- **Table lookups**: the parser uses many lookup tables instead of branching logic, which reduces circuit size.
- **Packing data into Field elements**: multiple fields that encode different features are combined into a single Field element for comparison (see the sketch below).
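
As an illustration of the packing pattern, the sketch below combines several small values into one Field element using fixed place values, so two packed states can be compared with a single equality check. The field layout here is made up for the example; the table-lookup pattern is illustrated in the Table Generation sketches below.

```rust
// Illustrative only: pack several small features into one Field using fixed place
// values (token in the low byte, context in the next byte, depth above that).
fn pack_state(token: Field, context: Field, depth: Field) -> Field {
    token + context * 256 + depth * 65536
}

#[test]
fn packed_states_compare_in_one_check() {
    let a = pack_state(3, 0, 2);
    let b = pack_state(3, 0, 2);
    // One Field equality replaces three separate comparisons.
    assert(a == b);
}
```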

### Table Generation

The JSON parser uses several lookup tables to avoid branching logic and reduce gate count. These tables are generated from `src/_table_generation/make_tables.nr` and include:

- `TOKEN_FLAGS_TABLE`: state transitions for token processing
- `JSON_CAPTURE_TABLE`: per-character scanning actions for the byte-scanning state machine

### Generation Process

Tables are generated by simulating all possible input combinations from basic hardcoded tables and recording the expected outputs.
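
As a toy illustration of this process (not `make_tables.nr` itself), the sketch below builds a 256-entry classification table by running simple, branch-heavy logic over every possible byte value and recording the result. Per the description above, the real tables are produced the same way from their hardcoded base tables, just over more input combinations.

```rust
// Illustrative only: enumerate every possible input, compute the expected output with
// straightforward branchy logic, and record it in a flat array. The circuit then uses
// the finished array as a pure lookup, with no branching at query time.
fn generate_grammar_table() -> [bool; 256] {
    let mut table: [bool; 256] = [false; 256];
    for i in 0..256 {
        let c = i as u8;
        table[i] =
            (c == 0x7b) | (c == 0x7d) | (c == 0x5b) | (c == 0x5d) | (c == 0x3a) | (c == 0x2c);
    }
    table
}
```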

### TOKEN_FLAGS_TABLE

Maps (token, context) pairs to parsing flags (a lookup sketch follows the list):

- `create_json_entry`: whether to create a JSON entry for this token; true if the token is a literal, number, string (but not a key), or the end of an object/array.
- `is_end_of_object_or_array`: whether the token ends an object/array.
- `is_start_of_object_or_array`: whether the token starts an object/array.
- `new_context`: which context to switch to (0 for object, 1 for array).
- `is_key_token`: whether the token is a key.
- `is_value_token`: whether the token is a value; true for `string_token`, `numeric_token`, and `literal_token`.
- `preserve_num_entries`: whether the current token preserves the existing count of entries at the current depth rather than resetting/incrementing it; 1 for tokens such as `NO_TOKEN`, `KEY_TOKEN`, `STRING_TOKEN`, `NUMERIC_TOKEN`, and `LITERAL_TOKEN`, and 0 for tokens such as `OBJECT_START_TOKEN`, `ARRAY_START_TOKEN`, `OBJECT_END_TOKEN`, and `ARRAY_END_TOKEN`.
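
The sketch below shows one way a (token, context) pair can index a flattened flags table. The layout and the flag struct are illustrative; the real `TOKEN_FLAGS_TABLE` encoding lives in `src/_table_generation/`, and the count of nine token kinds is taken from the names listed above.

```rust
// Illustrative only: one entry per (token, context) pair.
struct TokenFlags {
    create_json_entry: bool,
    is_end_of_object_or_array: bool,
    is_start_of_object_or_array: bool,
    new_context: Field, // 0 = object, 1 = array
    is_key_token: bool,
    is_value_token: bool,
    preserve_num_entries: bool,
}

// 9 token kinds * 2 contexts = 18 entries in this sketch.
fn lookup_token_flags(table: [TokenFlags; 18], token: u32, context: u32) -> TokenFlags {
    // context is 0 (object) or 1 (array), so the pair flattens to token * 2 + context.
    table[token * 2 + context]
}
```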

### JSON_CAPTURE_TABLE

Maps (escape_flag, scan_mode, ascii) tuples to scanning actions (a lookup sketch follows the list):

- `scan_token`: the next capture mode, based on the current capture mode and character; one of grammar_capture (`[`, `{`, `,`, `}`, `]`, `:`), string_capture, literal_capture, numeric_capture, or error_capture. For example, if we are in string_capture and the character is `"`, `scan_token` is set to grammar_capture because the string has ended and we return to grammar scanning. Likewise, if we are in numeric_capture and the character is not `0-9`, we return to grammar scanning because the number has ended.
- `push_transcript`: whether to add a token to the transcript. In grammar_capture this is true for all structural characters (`[`, `{`, `,`, `}`, `]`, `:`). In string_capture it is true for `"`, which signals the end of the string. In numeric_capture and literal_capture it is true for space, `\t`, `\n`, `\r`, `"`, and comma. Note that the first scan will not pick up numerics or literals because we do not know when they end, so we rely on the `capture_missing_tokens` function.
- `increase_length`: whether to extend the current token. Always false in grammar_capture; true for `0-9` in numeric_capture, for every character except `"` in string_capture, and for the letters of `true`, `false`, and `null` in literal_capture.
- `is_potential_escape_sequence`: true if the current character is `\` in string_capture mode.
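
The sketch below shows the shape of the lookup that drives the byte scan: every (escape_flag, scan_mode, ascii) combination maps to one precomputed action, so the scanner never branches on the character value itself. The struct, the mode count, and the flattened index layout are illustrative assumptions; the real `JSON_CAPTURE_TABLE` is defined in `src/_table_generation/`.

```rust
// Illustrative only: one precomputed action per (escape_flag, scan_mode, ascii) input.
struct CaptureAction {
    scan_token: Field,                  // next capture mode
    push_transcript: bool,              // emit a token into the transcript?
    increase_length: bool,              // extend the token currently being captured?
    is_potential_escape_sequence: bool, // saw '\' while inside a string?
}

global NUM_SCAN_MODES: u32 = 5; // grammar, string, literal, numeric, error capture

fn scan_byte(
    table: [CaptureAction; 2560], // 2 escape flags * 5 scan modes * 256 ascii values
    escape_flag: u32,             // 0 or 1
    scan_mode: u32,               // 0..NUM_SCAN_MODES
    ascii: u8
) -> CaptureAction {
    let index = (escape_flag * NUM_SCAN_MODES + scan_mode) * 256 + (ascii as u32);
    table[index]
}
```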