From 246e245695ba96c19e45089951a0090d8f12f499 Mon Sep 17 00:00:00 2001 From: jialinli Date: Thu, 31 Jul 2025 20:46:04 -0700 Subject: [PATCH 01/13] fix JSONEntry Eq and add documentation --- README.md | 21 ++++++++++++++ src/_table_generation/make_tables.nr | 18 ++++++------ src/_table_generation/table_generation.md | 25 ++++++++++++++++ src/json.nr | 35 ++++++++++++++++------- src/json_entry.nr | 2 +- src/keymap.nr | 24 ++++++---------- src/token_flags.nr | 9 +++--- 7 files changed, 93 insertions(+), 41 deletions(-) create mode 100644 src/_table_generation/table_generation.md diff --git a/README.md b/README.md index d9ff043..dcacbb7 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,27 @@ e.g. to take the existing 1kb JSON parameters, but also support 124-byte keys, us If you are deriving a key to look up in-circuit and you do not know the maximum length of the key, all query methods have a version with a `_var` suffix (e.g. `JSON::get_string_var`), which accepts the key as a `BoundedVec`
+# Architecture
+### Overview
+The JSON parser uses 5 steps to efficiently parse and index JSON data:
+
+1. **build_transcript** - Convert raw bytes to a transcript of tokens using a state machine defined by JSON_CAPTURE_TABLE. Each character is categorized as string, number, etc.
+2. **capture_missing_tokens & keyswap** - Fix missing tokens and correctly identify keys. Complete a second scan of the tokens, check for missing tokens (e.g. commas after literals), and relabel strings that are keys of an object as key tokens.
+3. **compute_json_packed** - Pack bytes into Field elements for efficient substring extraction
+4. **create_json_entries** - Create structured JSON entries with parent-child relationships
+5. **compute_keyhash_and_sort_json_entries** - Sort entries by key hash for efficient lookups
+
+### Key Design Patterns
+- **Using table lookups**: Uses many lookup tables to avoid branching logic and reduce circuit size
+- **Packing data to Field elements**: Combines multiple fields that encode different features into a single Field element for comparison (see the sketch below)
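+
+A minimal sketch of this packing pattern (simplified from `TokenFlags::to_field`; the field names and byte offsets here are illustrative, not the library's exact layout):
+
+```rust
+// Each flag occupies its own byte inside a single Field element, so two
+// flag sets can be compared with one equality check instead of several.
+struct Flags {
+    is_value: Field, // 0 or 1
+    is_key: Field,   // 0 or 1
+    context: Field,  // 0 = object, 1 = array
+}
+
+impl Flags {
+    fn to_field(self) -> Field {
+        // byte 0: is_value, byte 1: is_key, byte 2: context
+        self.is_value + self.is_key * 0x100 + self.context * 0x10000
+    }
+}
+
+#[test]
+fn test_flags_pack() {
+    let a = Flags { is_value: 1, is_key: 0, context: 1 };
+    let b = Flags { is_value: 1, is_key: 0, context: 1 };
+    // one comparison covers all three flags
+    assert(a.to_field() == b.to_field());
+}
+```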
+
+### Table Generation
+The parser uses several lookup tables generated from `src/_table_generation/`:
+- `TOKEN_FLAGS_TABLE`: State transitions for token processing
+- `JSON_CAPTURE_TABLE`: Character-by-character parsing rules
+- `TOKEN_VALIDATION_TABLE`: JSON grammar validation
+
+
 # Acknowledgements Many thanks to the authors of the OG noir json library https://github.com/RontoSOFT/noir-json-parser diff --git a/src/_table_generation/make_tables.nr b/src/_table_generation/make_tables.nr index 64e5921..a2a9086 100644 --- a/src/_table_generation/make_tables.nr +++ b/src/_table_generation/make_tables.nr @@ -384,7 +384,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] { let mut flags: [TokenFlags; NUM_TOKENS * 2] = [TokenFlags::default(); NUM_TOKENS * 2]; let mut no_token_flags: TokenFlags = TokenFlags { - create_json_entry: 0, + create_json_entry: false, is_end_of_object_or_array: 0, is_start_of_object_or_array: 0, new_context: OBJECT_LAYER as Field, @@ -393,7 +393,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] { preserve_num_entries: 1, }; let mut key_token_flags: TokenFlags = TokenFlags { - create_json_entry: 0, + create_json_entry: false, is_end_of_object_or_array: 0, is_start_of_object_or_array: 0, new_context: OBJECT_LAYER as Field, @@ -402,7 +402,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] { preserve_num_entries: 1, }; let begin_object_flags = TokenFlags { - create_json_entry: 0, + create_json_entry: false, is_end_of_object_or_array: 0, is_start_of_object_or_array: 1, new_context: OBJECT_LAYER as Field, @@ -412,7 +412,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] { }; let begin_array_flags = TokenFlags { - create_json_entry: 0, + create_json_entry: false, is_end_of_object_or_array: 0, is_start_of_object_or_array: 1, new_context: ARRAY_LAYER as Field, @@ -422,7 +422,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] { }; let mut end_object_flags = TokenFlags { - create_json_entry: 1, + create_json_entry: true, is_end_of_object_or_array: 1, is_start_of_object_or_array: 0, new_context: 0, @@ -432,7 +432,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] { }; let mut end_array_flags = TokenFlags { - create_json_entry: 1, + create_json_entry: true, is_end_of_object_or_array: 1, is_start_of_object_or_array: 0, new_context: 0, @@ -442,7 +442,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] { }; let mut string_flags = TokenFlags { - create_json_entry: 1, + create_json_entry: true, is_end_of_object_or_array: 0, is_start_of_object_or_array: 0, new_context: OBJECT_LAYER as Field, @@ -452,7 +452,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] { }; let mut numeric_flags = TokenFlags { - create_json_entry: 1, + create_json_entry: true, is_end_of_object_or_array: 0, is_start_of_object_or_array: 0, new_context: OBJECT_LAYER as Field, @@ -462,7 +462,7 @@ unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] { }; let mut literal_flags = TokenFlags { - create_json_entry: 1, + create_json_entry: true, is_end_of_object_or_array: 0, is_start_of_object_or_array: 0, new_context: OBJECT_LAYER as Field, diff --git a/src/_table_generation/table_generation.md b/src/_table_generation/table_generation.md new file mode 100644 index 0000000..2bd87fb --- /dev/null +++ b/src/_table_generation/table_generation.md @@ -0,0 +1,25 @@
+# Table Generation Documentation
+
+## Overview
+The JSON parser uses lookup tables to avoid branching logic and reduce gate count. These tables are generated from `src/_table_generation/make_tables.nr`.
+
+## Generation Process
+Tables are generated by simulating all possible input combinations from basic hardcoded tables and recording the expected outputs.
+
+## TOKEN_FLAGS_TABLE
+Maps (token, context) pairs to parsing flags:
+- `create_json_entry`: Whether to create a JSON entry for this token; set to true if the token is a literal/number/string (not a key) or the end of an array/object
+- `is_end_of_object_or_array`: Whether the token ends an object/array
+- `is_start_of_object_or_array`: Whether the token starts an object/array
+- `new_context`: What context to switch to (object is 0, array is 1)
+- `is_key_token`: Whether the token is a key
+- `is_value_token`: Whether the token is a value; set to true for string_token, numeric_token, and literal_token
+- `preserve_num_entries`: Boolean flag that controls whether the current token should preserve the existing count of entries at the current depth or reset/increment it. Set to 1 for tokens like NO_TOKEN, KEY_TOKEN, STRING_TOKEN, NUMERIC_TOKEN, LITERAL_TOKEN, and 0 for tokens like OBJECT_START_TOKEN, ARRAY_START_TOKEN, OBJECT_END_TOKEN, ARRAY_END_TOKEN.
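+
+To illustrate how a packed flags entry might be consumed, here is a hypothetical lookup sketch. The real table is produced by `generate_token_flags_table()` in make_tables.nr; the indexing scheme shown here (one row per token, repeated per context) is an assumption for illustration only:
+
+```rust
+global NUM_TOKENS: u32 = 8; // assumed token count, for illustration
+
+// Select the packed flag Field for a (token, context) pair. In the library
+// the packed Field is then unpacked by TokenFlags::from_field, which also
+// constrains every extracted flag to be binary.
+fn lookup_token_flags(table: [Field; NUM_TOKENS * 2], token: u32, context: u32) -> Field {
+    table[context * NUM_TOKENS + token]
+}
+```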
+
+## JSON_CAPTURE_TABLE
+Maps (escape_flag, scan_mode, ascii) to scanning actions:
+- `scan_token`: The next capture mode, given the current capture mode; one of grammar_capture ([, {, comma, }, ], :), string_capture, literal_capture, numeric_capture, or error_capture. For example, if we are in string capture and the character is ", scan_token is set to grammar_capture because the string has ended and we return to grammar scanning. If we are in numeric capture and the current character is not 0-9, we likewise return to grammar scanning because the number must have ended.
+- `push_transcript`: Whether to add the token to the transcript. In grammar mode, true for all structural elements [, {, comma, }, ], :. In string_capture, true for ", which signals the end of the string. In numeric/literal_capture, true for space, \t, \n, \r, ", and comma. Note that the first scan does not fully delimit numerics or literals because we don't know when they end, so we rely on the capture_missing_tokens function.
+- `increase_length`: Whether to extend the current token: always false in grammar_capture; true for 0-9 in numeric_capture, for all characters except " in string_capture, and for the letters of true, false, null in literal_capture
+- `is_potential_escape_sequence`: true if the current character is \ (backslash) in string_capture mode
diff --git a/src/json.nr b/src/json.nr index 660598d..7c497a8 100644 --- a/src/json.nr +++ b/src/json.nr @@ -77,8 +77,7 @@ impl(); +unconstrained fn __check_entry_ptr_bounds(entry_ptr: u32, max: u32) { // n.b. even though this assert is in an unconstrained function, an out of bounds error will be triggered when writing into self.key_data[entry_ptr] assert(entry_ptr as u32 < max - 1, "create_json_entries: MaxNumValues limit exceeded!"); } @@ -244,15 +243,18 @@ impl parent_identity_pre as u32) as Field; + let new_parent = lt_field_16_bit(parent_identity_pre, parent_identity_post) as Field; // 3.5 gates let index_of_parent = identity_to_json_map[cast_num_to_u32(parent_identity_post)]; // 1 gate + 3.5 gates @@ -127,11 +120,12 @@ impl Self { let bytes: [u8; 7] = f.to_be_bytes(); - let create_json_entry = bytes[0] as Field; + let create_json_entry = bytes[0] != 0; let is_end_of_object_or_array = bytes[1] as Field; let is_start_of_object_or_array = bytes[2] as Field; let new_context = bytes[3] as Field; @@ -37,7 +37,6 @@ impl TokenFlags { let r = unsafe { TokenFlags::__from_field(f) }; // checks that the flags are binary - assert(r.create_json_entry * r.create_json_entry == r.create_json_entry); assert( r.is_end_of_object_or_array * r.is_end_of_object_or_array == r.is_end_of_object_or_array, @@ -64,12 +63,12 @@ impl TokenFlags { + self.new_context * 0x1000000 + self.is_start_of_object_or_array * 0x100000000 + self.is_end_of_object_or_array * 0x10000000000 - + self.create_json_entry * 0x1000000000000 + + self.create_json_entry as Field * 0x1000000000000 } pub(crate) fn default() -> Self { TokenFlags { - create_json_entry: 0, + create_json_entry: false, is_end_of_object_or_array: 0, is_start_of_object_or_array: 0, new_context: 0, From c29d1793f0a808c98f6785fe21fd250c2526d12c Mon Sep 17 00:00:00 2001 From: zac-williamson Date: Mon, 4 Aug 2025 16:28:32 +0100 Subject: [PATCH 02/13] zac's documentation --- src/_string_tools/slice_packed_field.nr | 2 +- src/json.nr | 328 +++++++++++++++++------- src/token_flags.nr | 25 +- 3 files changed, 255 insertions(+), 100 deletions(-) diff --git
a/src/_string_tools/slice_packed_field.nr b/src/_string_tools/slice_packed_field.nr index 6c206f8..31348da 100644 --- a/src/_string_tools/slice_packed_field.nr +++ b/src/_string_tools/slice_packed_field.nr @@ -883,7 +883,7 @@ mod test { for j in 0..18 { let start_byte: u32 = text.len() - num_bytes - byte_positions[j]; let mut expected_slices: [Field; 3] = - // Safety: this is a test + // Safety: this is a test unsafe { build_slices_for_test(text, start_byte, num_bytes) }; let result_slices: [Field; 3] = slice_fields(slices, start_byte as Field, num_bytes as Field); diff --git a/src/json.nr b/src/json.nr index 7c497a8..af8e2d4 100644 --- a/src/json.nr +++ b/src/json.nr @@ -224,34 +224,62 @@ impl object, context == 1 => array) + // If current token is END_OBJECT_TOKEN or END_ARRAY_TOKEN, set context to the context value in previous_stack_entry + // (i.e. restore the context to whatever the parent of the object/array is) + // Pseudocode: + // if (is_end_of_object_or_array) { + // context = previous_stack_entry.context + // } else { + // context = new_context + // } // 1 gate // if `is_end_of_object_or_array == 1`, `new_context = 0` so we can do something cheaper than a conditional select: // If is_end_of_object_or_array is 1, then new_context is 0, so set context = previous_stack_entry.context @@ -372,6 +486,25 @@ impl Self { let bytes: [u8; 7] = f.to_be_bytes(); let create_json_entry = bytes[0] != 0; @@ -31,6 +51,7 @@ impl TokenFlags { } } + /// Convert a Field element that contains a packed TokenFlags object into a real TokenFlags object pub(crate) fn from_field(f: Field) -> Self { // 10 gates // Safety: check the comments below @@ -55,7 +76,8 @@ impl TokenFlags { r } - // 4 gates + /// Pack a TokenFlags object into a Field element + /// 4 gates pub(crate) fn to_field(self) -> Field { self.preserve_num_entries + self.is_value_token * 0x100 @@ -66,6 +88,7 @@ impl TokenFlags { + self.create_json_entry as Field * 0x1000000000000 } + /// Default constructor pub(crate) fn default() -> Self { TokenFlags { create_json_entry: false, From c51ca3a94a0901f7d889f581b802d2042526e08c Mon Sep 17 00:00:00 2001 From: jialinli Date: Tue, 5 Aug 2025 00:30:10 -0700 Subject: [PATCH 03/13] remove unused function --- src/_table_generation/make_tables.nr | 22 ------------ src/_table_generation/table_generation.md | 44 +++++++++++++++++++++++ 2 files changed, 44 insertions(+), 22 deletions(-) diff --git a/src/_table_generation/make_tables.nr b/src/_table_generation/make_tables.nr index a2a9086..b01dcba 100644 --- a/src/_table_generation/make_tables.nr +++ b/src/_table_generation/make_tables.nr @@ -46,28 +46,6 @@ global CAPTURE_ERROR_FLAG: [[bool; 128]; 4] = [ LITERAL_CAPTURE_ERROR_FLAG, ]; -unconstrained fn make_capture_table_full() -> [[Field; 128]; 4] { - let mut result: [[Field; 128]; 4] = [[0; 128]; 4]; - for i in 0..4 { - for j in 0..128 { - let table = CAPTURE_TABLE[i][j]; - let token = CAPTURE_TOKEN[i][j]; - let push_transcript = CAPTURE_PUSH_TRANSCRIPT[i][j] as Field; - let increase_length = CAPTURE_INCREASE_LENGTH[i][j] as Field; - let error = CAPTURE_ERROR_FLAG[i][j] as Field; - - let full = table - + token as Field * 0x100 - + push_transcript * 0x10000 - + increase_length * 0x1000000 - + error * 0x100000000; - result[i][j] = full; - } - } - - result -} - unconstrained fn make_ascii_to_token_table() -> [Field; 1024] { let mut result: [Field; 256 * 4] = [0; 256 * 4]; for i in 0..4 { diff --git a/src/_table_generation/table_generation.md b/src/_table_generation/table_generation.md index 2bd87fb..e81742b 
100644 --- a/src/_table_generation/table_generation.md +++ b/src/_table_generation/table_generation.md @@ -23,3 +23,47 @@ Maps (escape_flag, scan_mode, ascii) to scanning actions:
 - `push_transcript`: Whether to add the token to the transcript. In grammar mode, true for all structural elements [, {, comma, }, ], :. In string_capture, true for ", which signals the end of the string. In numeric/literal_capture, true for space, \t, \n, \r, ", and comma. Note that the first scan does not fully delimit numerics or literals because we don't know when they end, so we rely on the capture_missing_tokens function.
 - `increase_length`: Whether to extend the current token: always false in grammar_capture; true for 0-9 in numeric_capture, for all characters except " in string_capture, and for the letters of true, false, null in literal_capture
 - `is_potential_escape_sequence`: true if the current character is \ (backslash) in string_capture mode
+
+## Other tables
+While TOKEN_FLAGS_TABLE and JSON_CAPTURE_TABLE are the most important tables, they are built from foundational hardcoded tables in make_tables_subtables.nr:
+
+GRAMMAR_CAPTURE_TABLE: State transition table for grammar scan mode. Each entry specifies the next scan mode (GRAMMAR_CAPTURE, STRING_CAPTURE, NUMERIC_CAPTURE, LITERAL_CAPTURE, or ERROR_CAPTURE) based on the encountered ASCII character. For example, "f" is mapped to LITERAL_CAPTURE because it indicates the start of the literal false.
+STRING_CAPTURE_TABLE
+NUMERIC_CAPTURE_TABLE
+LITERAL_CAPTURE_TABLE
+
+GRAMMAR_CAPTURE_TOKEN: Maps characters in grammar mode to token types. Converts ASCII characters into the appropriate JSON token types for structural elements, values, and literals.
+- Structural characters ({, }, [, ], ,, :) → their respective structural tokens
+- Quote (") → STRING_TOKEN (start of string)
+- Digits (0-9) → NUMERIC_TOKEN (start of number)
+- Literal starters (f, t, n) → LITERAL_TOKEN (start of true/false/null)
+- Invalid characters → NO_TOKEN or error handling
+STRING_CAPTURE_TOKEN
+NUMERIC_CAPTURE_TOKEN
+LITERAL_CAPTURE_TOKEN
+
+STRING_CAPTURE_PUSH_TRANSCRIPT: Determines when to add tokens to the transcript while scanning inside a string. Only true for the closing quote ("). This signals the end of the string and triggers token creation. All other characters within the string (letters, numbers, punctuation, spaces) are false because they extend the current string token rather than creating new tokens.
+
+GRAMMAR_CAPTURE_PUSH_TRANSCRIPT: Determines when to add tokens to the transcript while scanning in grammar mode. True for the following characters:
+- Comma (,) → true (value separator)
+- Colon (:) → true (key-value separator)
+- All other characters → false (including digits, quotes, and literal starters)
+
+NUMERIC_CAPTURE_PUSH_TRANSCRIPT: Determines when to add the current numeric token to the transcript while scanning a number. True for the following characters:
+- Whitespace (space, tab, newline, carriage return) → true (end number)
+- Quote (") → true (end number, followed by string)
+- Comma (,) → true (end number, followed by next value)
+- All other characters → false (extend current number or error)
+
+LITERAL_CAPTURE_PUSH_TRANSCRIPT: Determines when to add the current literal token (true/false/null) to the transcript while scanning a literal. True for any grammar character: , [ ] { } " space tab newline. (This is only used in the first scan; in the second step, capture_missing_tokens, we are able to separate the literal from the value separator.)
+
+GRAMMAR_CAPTURE_INCREASE_LENGTH: Determines when to extend the current token length while scanning in grammar mode. True for digits (0-9), which start a numeric scan, and for the literal letters (f, t, n, r, u, e, a, l, s), which start a literal scan. For structural tokens we don't track a length (it is always 1). For string tokens we expect to see a " before seeing any letters.
+
+STRING_CAPTURE_INCREASE_LENGTH: Determines when to extend the current string token while scanning inside a string. True for all printable characters except the quote (which ends the string)
+NUMERIC_CAPTURE_INCREASE_LENGTH: True for 0-9
+LITERAL_CAPTURE_INCREASE_LENGTH: True for t, r, u, e, f, a, l, s, n
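+
+As a concrete illustration of how one of these sub-tables could be built, here is a sketch in the spirit of GRAMMAR_CAPTURE_TABLE, following the transition rules described above. The scan-mode constants and the table shape are assumptions for illustration; see make_tables_subtables.nr for the real definitions:
+
+```rust
+// Assumed scan-mode encodings, for illustration only
+global GRAMMAR_CAPTURE: Field = 0;
+global STRING_CAPTURE: Field = 1;
+global NUMERIC_CAPTURE: Field = 2;
+global LITERAL_CAPTURE: Field = 3;
+global ERROR_CAPTURE: Field = 4;
+
+// Map an ASCII byte seen in grammar mode to the next scan mode.
+unconstrained fn make_grammar_capture_table() -> [Field; 128] {
+    let mut table: [Field; 128] = [ERROR_CAPTURE; 128];
+    // whitespace and structural characters stay in grammar mode
+    table[9] = GRAMMAR_CAPTURE; // tab
+    table[10] = GRAMMAR_CAPTURE; // newline
+    table[13] = GRAMMAR_CAPTURE; // carriage return
+    table[32] = GRAMMAR_CAPTURE; // space
+    table[44] = GRAMMAR_CAPTURE; // ','
+    table[58] = GRAMMAR_CAPTURE; // ':'
+    table[91] = GRAMMAR_CAPTURE; // '['
+    table[93] = GRAMMAR_CAPTURE; // ']'
+    table[123] = GRAMMAR_CAPTURE; // '{'
+    table[125] = GRAMMAR_CAPTURE; // '}'
+    table[34] = STRING_CAPTURE; // '"' begins a string
+    for digit in 48..58 {
+        table[digit] = NUMERIC_CAPTURE; // '0'..'9' begin a number
+    }
+    table[102] = LITERAL_CAPTURE; // 'f' begins "false"
+    table[116] = LITERAL_CAPTURE; // 't' begins "true"
+    table[110] = LITERAL_CAPTURE; // 'n' begins "null"
+    table
+}
+```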
+
+GRAMMAR_CAPTURE_ERROR_FLAG
+STRING_CAPTURE_ERROR_FLAG
+NUMERIC_CAPTURE_ERROR_FLAG
+LITERAL_CAPTURE_ERROR_FLAG
\ No newline at end of file
From 5cb2b530bcc2d977c7d0cb9ee933a70476eedea5 Mon Sep 17 00:00:00 2001 From: jialinli Date: Tue, 5 Aug 2025 00:50:42 -0700 Subject: [PATCH 04/13] lint --- src/_string_tools/slice_packed_field.nr | 2 +- src/json.nr | 31 +++++++++---------------- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/src/_string_tools/slice_packed_field.nr b/src/_string_tools/slice_packed_field.nr index 31348da..6c206f8 100644 --- a/src/_string_tools/slice_packed_field.nr +++ b/src/_string_tools/slice_packed_field.nr @@ -883,7 +883,7 @@ mod test { for j in 0..18 { let start_byte: u32 = text.len() - num_bytes - byte_positions[j]; let mut expected_slices: [Field; 3] = - // Safety: this is a test + // Safety: this is a test unsafe { build_slices_for_test(text, start_byte, num_bytes) }; let result_slices: [Field; 3] = slice_fields(slices, start_byte as Field, num_bytes as Field); diff --git a/src/json.nr b/src/json.nr index af8e2d4..833ef1b 100644 --- a/src/json.nr +++ b/src/json.nr @@ -308,14 +308,13 @@ impl Date: Tue, 5 Aug 2025 00:56:56 -0700 Subject: [PATCH 05/13] remove unused function lt_field_8_bit --- README.md | 37 +++++++++++++++++++++++ src/_comparison_tools/lt.nr | 12 -------- src/_table_generation/table_generation.md | 4 ++- src/json.nr | 26 +++++++++++++--- 4 files changed, 61 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index dcacbb7..088402d 100644 --- a/README.md +++ b/README.md @@ -142,6 +142,43 @@ The parser uses several lookup tables generated from `src/_table_generation/`: - `JSON_CAPTURE_TABLE`: Character-by-character parsing rules - `TOKEN_VALIDATION_TABLE`: JSON grammar validation
+### Example walkthrough
+Let's walk through how the raw JSON text {"name": "Alice", "age": 30} is parsed.
+First, the parser reads the JSON one character at a time and uses lookup tables to decide what to do with each character. For the simplified object {"name": "Alice"}:
+Character: { → "Start scanning an object (grammar_capture)"
+Character: " → "Start scanning a string"
+Character: n → "Continue scanning the string"
+Character: a → "Continue scanning the string"
+Character: m → "Continue scanning the string"
+Character: e → "Continue scanning the string"
+Character: " → "End the string"
+Character: : → "Key-value separator"
+Character: " → "Start scanning a string"
+Character: A → "Continue scanning the string"
+Character: l → "Continue scanning the string"
+Character: i → "Continue scanning the string"
+Character: c → "Continue scanning the string"
+Character: e → "Continue scanning the string"
+Character: " → "End the string"
+Character: } → "End the object"
+
+The parser builds a list of "tokens", the basic building blocks of the JSON, which becomes:
+1. BEGIN_OBJECT_TOKEN ({)
+2. STRING_TOKEN ("name")
+3. KEY_SEPARATOR_TOKEN (:)
+4. STRING_TOKEN ("Alice")
+5. END_OBJECT_TOKEN (})
+
+The parser then converts tokens into structured entries with parent-child relationships.
+Each entry knows:
+- What type it is (object, string, number, etc.)
+- Who its parent is
+- How many children it has
+- Where it is in the original JSON
+
+Finally, the parser sorts entries by their key hashes for fast lookups.
+Original order: [{"name": "Alice"}, {"age": 30}]
+Sorted order: [{"age": 30}, {"name": "Alice"}]
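+
+To make the walkthrough concrete, here is a minimal parse call in the style of this library's own tests (the generic parameters here are sized loosely for this input; they follow the same pattern as the tests in src/json.nr):
+
+```rust
+#[test]
+fn test_walkthrough_example() {
+    let text = "{\"name\": \"Alice\", \"age\": 30}";
+    let _: JSON<30, 10, 20, 20, 2> = JSON::parse_json_from_string(text);
+}
+```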
 # Acknowledgements
diff --git a/src/_comparison_tools/lt.nr b/src/_comparison_tools/lt.nr index 0ba5739..ab97f6e 100644 --- a/src/_comparison_tools/lt.nr +++ b/src/_comparison_tools/lt.nr @@ -54,18 +54,6 @@ pub fn lt_field_16_bit(x: Field, y: Field) -> bool { predicate } -pub fn lt_field_8_bit(x: Field, y: Field) -> bool { - // Safety: check the comments below - let predicate = unsafe { get_lt_predicate_f(x, y) }; - let delta = y as Field - x as Field; - let lt_parameter = 2 * (predicate as Field) * delta - predicate as Field - delta; - // checks that the bit length of lt_parameter is 8 - // i.e. checks the sign of lt_parameter - lt_parameter.assert_max_bit_size::<8>(); - - predicate -} - pub fn assert_gt_240_bit(lhs: Field, rhs: Field) { // lhs > rhs // -> lhs - rhs > 0 diff --git a/src/_table_generation/table_generation.md b/src/_table_generation/table_generation.md index e81742b..a3b7753 100644 --- a/src/_table_generation/table_generation.md +++ b/src/_table_generation/table_generation.md @@ -66,4 +66,6 @@ LITERAL_CAPTURE_INCREASE_LENGTH: True for t, r, u, e, f, a, l, s, n
 GRAMMAR_CAPTURE_ERROR_FLAG
 STRING_CAPTURE_ERROR_FLAG
 NUMERIC_CAPTURE_ERROR_FLAG
-LITERAL_CAPTURE_ERROR_FLAG
\ No newline at end of file
+LITERAL_CAPTURE_ERROR_FLAG
+
+PROCESS_RAW_TRANSCRIPT_TABLE: This table is used to post-process the raw transcript and add missing grammar tokens that were not captured during the initial scanning in build_transcript. Input: the encoded_ascii of the last token in each entry (scan_mode + ascii character). Output: a packed value containing: token, the token type for this entry; new_grammar, whether to add a missing grammar token; and scan_token, the type of grammar token to add (if needed), such as END_OBJECT_TOKEN (}) or VALUE_SEPARATOR_TOKEN (comma).
\ No newline at end of file
diff --git a/src/json.nr b/src/json.nr index 833ef1b..492d80b 100644 --- a/src/json.nr +++ b/src/json.nr @@ -116,6 +116,7 @@ impl [Field; MaxNumTokens] { let mut raw_transcript: [Field; MaxNumTokens] = [0; MaxNumTokens]; let mut transcript_ptr: u32 = 0; + // We start in grammar capture mode, expecting to see a { or [.
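+    // Each subsequent loop iteration combines (escape flag, scan mode, ascii)
+    // into a lookup into JSON_CAPTURE_TABLE to decide whether to extend the
+    // current token or push a completed token onto the transcript
+    // (see src/_table_generation/table_generation.md).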
let mut scan_mode = GRAMMAR_CAPTURE; let mut length: Field = 0; let mut previous_was_potential_escape_sequence = 0; for i in 0..NumBytes { - // while this assert is in an unconstrained function, the out of bounds accesss `raw_transcript[transcript_ptr]` in build_transcript also generates failing constraints + // while this assert is in an unconstrained function, the out of bounds access `raw_transcript[transcript_ptr]` in build_transcript also generates failing constraints assert(transcript_ptr < MaxNumTokens, "build_transcript: MaxNumTokens limit exceeded!"); let ascii = self.json[i]; @@ -552,18 +556,23 @@ impl(); + scan_mode.assert_max_bit_size::<2>(); JSON { json: self.json, @@ -689,19 +699,24 @@ impl Date: Wed, 6 Aug 2025 00:36:31 -0700 Subject: [PATCH 06/13] add literal validations --- src/json.nr | 57 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/src/json.nr b/src/json.nr index 492d80b..4c1aea1 100644 --- a/src/json.nr +++ b/src/json.nr @@ -759,6 +759,32 @@ impl(); + if length == 5 { + let is_false = (self.json[index_as_u32] == 102) // 'f' + & (self.json[index_as_u32 + 1] == 97) // 'a' + & (self.json[index_as_u32 + 2] == 108) // 'l' + & (self.json[index_as_u32 + 3] == 115) // 's' + & (self.json[index_as_u32 + 4] == 101); // 'e' + assert(is_false, "invalid literal"); + } else if length == 4 { + let is_true = (self.json[index_as_u32] == 116) // 't' + & (self.json[index_as_u32 + 1] == 114) // 'r' + & (self.json[index_as_u32 + 2] == 117) // 'u' + & (self.json[index_as_u32 + 3] == 101); // 'e' + + let is_null = (self.json[index_as_u32] == 110) // 'n' + & (self.json[index_as_u32 + 1] == 117) // 'u' + & (self.json[index_as_u32 + 2] == 108) // 'l' + & (self.json[index_as_u32 + 3] == 108); // 'l' + assert(is_null | is_true, "invalid literal"); + } else { + assert(false, "invalid literal"); + } + } // 2 gates let diff = updated_transcript[cast_num_to_u32(transcript_ptr)] - entry; std::as_witness(diff); @@ -1075,13 +1101,6 @@ fn test_json_char_outside_of_string_fails() { let _: JSON<26, 10, 20, 20, 2> = JSON::parse_json_from_string(text); } -#[test(should_fail_with = "ValidationFlags: grammar error")] -fn test_json_char_outside_of_string_fails_2() { - // n could be the start of the literal "null", so this passes the ScanData check but fails ValidationFlags - let text = "{ \"hello \", \"world\" n}"; - let _: JSON<26, 10, 20, 20, 2> = JSON::parse_json_from_string(text); -} - #[test(should_fail_with = "ValidationFlags: grammar error")] fn test_json_array_with_invalid_tokens_fails() { // n could be the start of the literal "null", so this passes the ScanData check but fails ValidationFlags @@ -1133,3 +1152,27 @@ fn key_is_not_a_key() { let json_string = "{1\n:0}"; let _: JSON<26, 10, 20, 20, 2> = JSON::parse_json_from_string(json_string); } + +#[test(should_fail_with = "invalid literal")] +fn test_invalid_literal() { + let text = "{ \"name\":fal }"; + let _: JSON<153, 10, 60, 60, 2> = JSON::parse_json_from_string(text); +} + +#[test(should_fail_with = "invalid literal")] +fn test_invalid_literal_2() { + let text = "{ \"name\":treu}"; + let _: JSON<153, 10, 60, 60, 2> = JSON::parse_json_from_string(text); +} + +#[test(should_fail_with = "invalid literal")] +fn test_invalid_literal_3() { + let text = "{ \"name\":truea }"; + let _: JSON<153, 10, 60, 60, 2> = JSON::parse_json_from_string(text); +} + +#[test(should_fail_with = "invalid literal")] +fn test_invalid_literal_4() { + let text = "{ \"hello \", \"world\" n}"; + let _: 
JSON<26, 10, 20, 20, 2> = JSON::parse_json_from_string(text); +} From 61918a131c899b7900ce9185eacf84e5d0189299 Mon Sep 17 00:00:00 2001 From: Tom French <15848336+TomAFrench@users.noreply.github.com> Date: Wed, 6 Aug 2025 18:44:51 +0000 Subject: [PATCH 07/13] chore: fix broken merge --- src/json.nr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/json.nr b/src/json.nr index 43e218e..f4930a4 100644 --- a/src/json.nr +++ b/src/json.nr @@ -326,7 +326,7 @@ impl Date: Wed, 6 Aug 2025 11:45:56 -0700 Subject: [PATCH 08/13] merge conflict --- src/json.nr | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/json.nr b/src/json.nr index 43e218e..6607024 100644 --- a/src/json.nr +++ b/src/json.nr @@ -326,7 +326,6 @@ impl Date: Wed, 6 Aug 2025 12:45:22 -0700 Subject: [PATCH 09/13] remove unnecessary code --- src/json.nr | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/json.nr b/src/json.nr index 3d94ba3..7634e0b 100644 --- a/src/json.nr +++ b/src/json.nr @@ -361,10 +361,7 @@ impl Date: Thu, 7 Aug 2025 18:44:39 -0700 Subject: [PATCH 10/13] merge conflict --- src/json.nr | 4 ++-- src/keymap.nr | 27 --------------------------- 2 files changed, 2 insertions(+), 29 deletions(-) diff --git a/src/json.nr b/src/json.nr index 20923c4..43d21f2 100644 --- a/src/json.nr +++ b/src/json.nr @@ -126,7 +126,7 @@ impl>>>>>> 6d190ca4391e4682a576af7666a99885a39ada4b for i in 0..MaxNumValues { // 11.75 + 3.5 = 15.25 gates per iteration let (id, parent_index, entry_type) = JSONEntry::extract_entry_type_id_and_parent_index_from_field( sorted_entries[i].value, ); -<<<<<<< HEAD - parent_indices[i] = parent_index; -======= parent_indices[i] = cast_num_to_u32(parent_index); ->>>>>>> 6d190ca4391e4682a576af7666a99885a39ada4b // 2 gates // update is 1 for end of object/array, 0 for other let update = TOKEN_ENDS_OBJECT_OR_ARRAY[cast_num_to_u32(entry_type)]; // NOTE THIS RELIES ON MaxNumValues ACTUALLY DESCRIBING NUMMaxNumValues + 1 -<<<<<<< HEAD - // index = id if update = 1, else MaxNumValues -1 - // 1 gate - let index = (id - (MaxNumValues as Field - 1)) * update + (MaxNumValues as Field - 1); -======= let index = if update { cast_num_to_u32(id) } else { MaxNumValues - 1 }; ->>>>>>> 6d190ca4391e4682a576af7666a99885a39ada4b // 3.5 gates identity_to_json_map[index] = i; } @@ -121,16 +105,9 @@ impl parent_identity_pre as u32) as Field; - let new_parent = lt_field_16_bit(parent_identity_pre, parent_identity_post) as Field; -======= // n.b. 
parent_identity_post - parent_identity_pre is not necessarily 0 or 1 (can be larger) // due to empty objects and arrays increasing identity value without creating associated child json entries let new_parent = parent_identity_pre < parent_identity_post; ->>>>>>> 6d190ca4391e4682a576af7666a99885a39ada4b // 3.5 gates let index_of_parent = identity_to_json_map[parent_identity_post]; // 1 gate + 3.5 gates @@ -144,11 +121,7 @@ impl>>>>>> 6d190ca4391e4682a576af7666a99885a39ada4b parent_identity_pre = parent_identity_post; } From 78c67ffc3367de037996cc8e701c6f4952f88e35 Mon Sep 17 00:00:00 2001 From: jialinli Date: Thu, 7 Aug 2025 19:15:54 -0700 Subject: [PATCH 11/13] format --- src/json.nr | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/json.nr b/src/json.nr index 43d21f2..6beb106 100644 --- a/src/json.nr +++ b/src/json.nr @@ -643,8 +643,9 @@ impl Date: Thu, 7 Aug 2025 19:15:54 -0700 Subject: [PATCH 12/13] format --- src/json.nr | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/json.nr b/src/json.nr index 43d21f2..7471f02 100644 --- a/src/json.nr +++ b/src/json.nr @@ -431,10 +431,10 @@ impl `preserve_num_entries = 0` (start new object/array) + // - When `is_end_of_object_or_array = 1` -> `preserve_num_entries = 0` (end object/array) + // - When `preserve_num_entries = 1` -> both flags = 0 (normal token) + // // 4 gates { let old = current_identity_value; @@ -643,8 +648,9 @@ impl Date: Wed, 13 Aug 2025 09:09:07 -0700 Subject: [PATCH 13/13] lint --- src/json.nr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/json.nr b/src/json.nr index d66d50b..754165e 100644 --- a/src/json.nr +++ b/src/json.nr @@ -449,7 +449,7 @@ impl `preserve_num_entries = 0` (start new object/array) - // - When `is_end_of_object_or_array = 1` -> `preserve_num_entries = 0` (end object/array) + // - When `is_end_of_object_or_array = 1` -> `preserve_num_entries = 0` (end object/array) // - When `preserve_num_entries = 1` -> both flags = 0 (normal token) // // 4 gates
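
As a footnote to the flag invariants documented in the last two patches above, here is a minimal sketch (not the library's code) of how mutually exclusive binary flags can be constrained in-circuit, in the same style as the binary checks in `TokenFlags::from_field`:

```rust
// Constrain three flags to be binary and mutually exclusive, mirroring the
// invariant described above: when preserve_num_entries = 1, both the
// start-of-object and end-of-object flags must be 0 (and vice versa).
fn assert_flags_consistent(is_start: Field, is_end: Field, preserve: Field) {
    // x * x == x forces x to be 0 or 1
    assert(is_start * is_start == is_start);
    assert(is_end * is_end == is_end);
    assert(preserve * preserve == preserve);
    // a product of two binary flags is 0 iff they are never both 1
    assert(is_start * is_end == 0);
    assert(is_start * preserve == 0);
    assert(is_end * preserve == 0);
}
```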