Commit b494199

chore: remove scan_mode + remove optimization + add table generation tests (#59)
1 parent dc8ad05 · commit b494199

7 files changed (+74, -66 lines)

src/_table_generation/make_tables.nr

Lines changed: 44 additions & 40 deletions
@@ -1,18 +1,16 @@
 //! Contains methods used to generate tables in `json_tables.nr`. These table generation methods shouldn't be used inside of actual circuits.
-
-pub(crate) mod CaptureMode {
-    pub(crate) global GRAMMAR_CAPTURE: Field = 0;
-    pub(crate) global STRING_CAPTURE: Field = 1;
-    pub(crate) global NUMERIC_CAPTURE: Field = 2;
-    pub(crate) global LITERAL_CAPTURE: Field = 3;
-    pub(crate) global ERROR_CAPTURE: Field = 4;
-}
+use crate::enums::CaptureMode::STRING_CAPTURE;
 use crate::enums::Layer::{ARRAY_LAYER, OBJECT_LAYER, SINGLE_VALUE_LAYER};
 use crate::enums::Token::{
     BEGIN_ARRAY_TOKEN, BEGIN_OBJECT_TOKEN, END_ARRAY_TOKEN, END_OBJECT_TOKEN, KEY_SEPARATOR_TOKEN,
     KEY_TOKEN, LITERAL_TOKEN, NO_TOKEN, NUM_TOKENS, NUMERIC_TOKEN, STRING_TOKEN,
     VALUE_SEPARATOR_TOKEN,
 };
+use crate::json_tables::ASCII_TO_TOKEN_TABLE;
+use crate::json_tables::JSON_CAPTURE_TABLE;
+use crate::json_tables::PROCESS_RAW_TRANSCRIPT_TABLE;
+use crate::json_tables::TOKEN_FLAGS_TABLE;
+use crate::json_tables::TOKEN_VALIDATION_TABLE;
 use crate::token_flags::TokenFlags;
 use crate::transcript_entry::ValidationFlags;
 use super::make_tables_subtables::{
@@ -24,7 +22,6 @@ use super::make_tables_subtables::{
     STRING_CAPTURE_ERROR_FLAG, STRING_CAPTURE_INCREASE_LENGTH, STRING_CAPTURE_PUSH_TRANSCRIPT,
     STRING_CAPTURE_TABLE, STRING_CAPTURE_TOKEN, TOKEN_IS_NUMERIC_OR_LITERAL,
 };
-use CaptureMode::STRING_CAPTURE;
 
 global CAPTURE_TABLE: [[Field; 128]; 4] =
     [GRAMMAR_CAPTURE_TABLE, STRING_CAPTURE_TABLE, NUMERIC_CAPTURE_TABLE, LITERAL_CAPTURE_TABLE];
@@ -324,16 +321,6 @@ unconstrained fn make_token_validation_table() -> [Field; NUM_TOKENS * NUM_TOKEN
     flattened_flags
 }
 
-// #[test]
-// fn test_make_validation_flags() {
-//     let f = make_token_validation_table();
-//     println(f"global TOKEN_VALIDATION_TABLE: [Field; 363] = {f};");
-// }
-// #[test]
-// fn test_make_ascii_to_token_table() {
-//     let r = make_ascii_to_token_table();
-//     println(f"table = {r}");
-// }
 unconstrained fn make_json_capture_table() -> [Field; 2048] {
     let backslash: u32 = "\\".as_bytes()[0] as u32;
     let quotes: u32 = "\"".as_bytes()[0] as u32;
@@ -373,11 +360,6 @@ unconstrained fn make_json_capture_table() -> [Field; 2048] {
 
     result
 }
-// #[test]
-// fn test_make_JSON_CAPTURE_TABLE() {
-//     let r = make_JSON_CAPTURE_TABLE();
-//     println(f"table = {r}");
-// }
 
 unconstrained fn make_process_raw_transcript_table() -> [Field; 1024] {
     let mut result: [Field; 1024] = [0; 1024];
@@ -397,11 +379,6 @@ unconstrained fn make_process_raw_transcript_table() -> [Field; 1024] {
     }
     result
 }
-// #[test]
-// fn test_make_process_raw_transcript_table() {
-//     let r = make_process_raw_transcript_table();
-//     println(f"table = {r}");
-// }
 
 unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] {
     let mut flags: [TokenFlags; NUM_TOKENS * 2] = [TokenFlags::default(); NUM_TOKENS * 2];
@@ -531,14 +508,41 @@
     result
 }
 
-// #[test]
-// fn test_generate_token_flags_table() {
-//     let r = generate_token_flags_table();
-//     println(f"global TOKEN_FLAGS_TABLE: [Field; NUM_TOKENS_MUL_2] = {r};");
-// }
-
-// #[test]
-// fn test_make_json_capture_table() {
-//     let r = make_json_capture_table();
-//     println(f"global JSON_CAPTURE_TABLE: [Field; 2048] = {r};");
-// }
+#[test]
+fn test_generate_token_flags_table() {
+    unsafe {
+        let r = generate_token_flags_table();
+        assert(r == TOKEN_FLAGS_TABLE);
+    }
+}
+
+#[test]
+fn test_make_json_capture_table() {
+    unsafe {
+        let r = make_json_capture_table();
+        assert(r == JSON_CAPTURE_TABLE);
+    }
+}
+
+#[test]
+fn test_make_validation_flags() {
+    unsafe {
+        let f = make_token_validation_table();
+        assert(f == TOKEN_VALIDATION_TABLE);
+    }
+}
+#[test]
+fn test_make_ascii_to_token_table() {
+    unsafe {
+        let r = make_ascii_to_token_table();
+        assert(r == ASCII_TO_TOKEN_TABLE);
+    }
+}
+
+#[test]
+fn test_make_process_raw_transcript_table() {
+    unsafe {
+        let r = make_process_raw_transcript_table();
+        assert(r == PROCESS_RAW_TRANSCRIPT_TABLE);
+    }
+}
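Each formerly commented-out generator is now a real regression test: it re-runs the unconstrained table builder inside an `unsafe` block and asserts the output equals the corresponding global baked into `json_tables.nr`, so a stale table makes `nargo test` fail. The same pattern in miniature, with illustrative names (DOUBLE_TABLE and make_double_table are not part of this crate):

global DOUBLE_TABLE: [Field; 4] = [0, 2, 4, 6];

unconstrained fn make_double_table() -> [Field; 4] {
    // regenerate the table from scratch
    let mut result: [Field; 4] = [0; 4];
    for i in 0..4 {
        result[i] = (i as Field) * 2;
    }
    result
}

#[test]
fn test_make_double_table() {
    // unconstrained functions may only be called from an unsafe block
    unsafe {
        let r = make_double_table();
        assert(r == DOUBLE_TABLE);
    }
}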

src/_table_generation/make_tables_subtables.nr

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-use crate::_table_generation::make_tables::CaptureMode::{
+use crate::enums::CaptureMode::{
     ERROR_CAPTURE, GRAMMAR_CAPTURE, LITERAL_CAPTURE, NUMERIC_CAPTURE, STRING_CAPTURE,
 };
 use crate::enums::Token::{

src/benchmarks/benchmarks.nr

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@ use crate::{JSON16kb, JSON512b, json::JSON};
 comptime fn make_bench(m: Module, params: Quoted) -> Quoted {
     let module_name = m.name();
     let parse_json_from_string = f"parse_json_from_string_{module_name}".quoted_contents();
-    let parse_json = f"parse_json_{module_name}".quoted_contents();
     let get_array = f"get_array_{module_name}".quoted_contents();
     let get_object = f"get_object_{module_name}".quoted_contents();
     let get_literal = f"get_literal_{module_name}".quoted_contents();
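For context, `make_bench` is a comptime function that stamps out per-module benchmark wrappers by building identifiers from the module name; the `parse_json` binding is deleted here, presumably because the generated code no longer references it. A hedged sketch of the idiom (make_hello is illustrative, not from this repo; assumes Noir's `$` splicing inside `quote`):

comptime fn make_hello(m: Module) -> Quoted {
    let module_name = m.name();
    // build an identifier such as `hello_foo` from the module name
    let fn_name = f"hello_{module_name}".quoted_contents();
    quote {
        fn $fn_name() {}
    }
}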

src/enums.nr

Lines changed: 6 additions & 2 deletions
@@ -1,5 +1,9 @@
-pub(crate) mod ScanMode {
-    pub(crate) global GRAMMAR_SCAN: Field = 0;
+pub(crate) mod CaptureMode {
+    pub(crate) global GRAMMAR_CAPTURE: Field = 0;
+    pub(crate) global STRING_CAPTURE: Field = 1;
+    pub(crate) global NUMERIC_CAPTURE: Field = 2;
+    pub(crate) global LITERAL_CAPTURE: Field = 3;
+    pub(crate) global ERROR_CAPTURE: Field = 4;
 }
 
 pub(crate) mod Token {
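The renamed constants keep their 0 through 4 values, so they can still be used directly as row indices, as CAPTURE_TABLE in make_tables.nr does. A sketch of that indexing (capture_row is a hypothetical helper, assuming the 4-row layout shown above):

// rows 0-3 hold the GRAMMAR/STRING/NUMERIC/LITERAL capture tables;
// ERROR_CAPTURE (4) is deliberately out of range, matching the parser's
// documented trick of turning error states into out-of-bounds lookups
fn capture_row(table: [[Field; 128]; 4], mode: Field) -> [Field; 128] {
    table[mode as u32]
}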

src/json.nr

Lines changed: 20 additions & 18 deletions
@@ -1,6 +1,6 @@
 use crate::_comparison_tools::bounds_checker::get_validity_flags;
+use crate::enums::CaptureMode::GRAMMAR_CAPTURE;
 use crate::enums::Layer::{ARRAY_LAYER, OBJECT_LAYER, SINGLE_VALUE_LAYER};
-use crate::enums::ScanMode::GRAMMAR_SCAN;
 use crate::enums::Token::{
     BEGIN_ARRAY_TOKEN, BEGIN_OBJECT_TOKEN, KEY_SEPARATOR_TOKEN, KEY_TOKEN, LITERAL_TOKEN,
     NUM_TOKENS, NUMERIC_TOKEN, STRING_TOKEN,
@@ -77,8 +77,8 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
         }
     }
 
-    // TODO: casting entry_ptr to u16 is kind of expensive when generating witnesses, can we fix?
     unconstrained fn __check_entry_ptr_bounds(entry_ptr: Field, max: u32) {
+        entry_ptr.assert_max_bit_size::<32>();
         // n.b. even though this assert is in an unconstrained function, an out of bounds error will be triggered when writing into self.key_data[entry_ptr]
         assert(entry_ptr as u32 < max - 1, "create_json_entries: MaxNumValues limit exceeded!");
     }
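The added `assert_max_bit_size::<32>()` makes the later `entry_ptr as u32` cast safe: Field-to-integer casts truncate, so an oversized `entry_ptr` could otherwise wrap past the bounds check. A minimal sketch of the guarded cast (checked_cast is illustrative):

unconstrained fn checked_cast(ptr: Field) -> u32 {
    // without the bit-size assertion, e.g. ptr = 0x100000001 (2^32 + 1)
    // would truncate to 1 and sail past any `< max` comparison
    ptr.assert_max_bit_size::<32>();
    ptr as u32
}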
@@ -409,7 +409,7 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
     unconstrained fn __build_transcript(self) -> [Field; MaxNumTokens] {
         let mut raw_transcript: [Field; MaxNumTokens] = [0; MaxNumTokens];
         let mut transcript_ptr: u32 = 0;
-        let mut scan_mode = GRAMMAR_SCAN as Field;
+        let mut scan_mode = GRAMMAR_CAPTURE;
         let mut length: Field = 0;
         let mut previous_was_potential_escape_sequence = 0;
         for i in 0..NumBytes {
@@ -421,17 +421,18 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
                 previous_was_potential_escape_sequence * 1024 + scan_mode * 256 + ascii as Field;
             let ScanData { scan_token, push_transcript, increase_length, is_potential_escape_sequence } =
                 ScanData::from_field(JSON_CAPTURE_TABLE[cast_num_to_u32(encoded_ascii)]);
-            let mut push_transcript = push_transcript;
-            let mut scan_token = scan_token;
-            let mut increase_length = increase_length;
 
-            let new_entry = RawTranscriptEntry::to_field(
-                RawTranscriptEntry { encoded_ascii, index: i as Field - length, length },
-            );
+            if push_transcript == 1 {
+                let new_entry = RawTranscriptEntry::to_field(
+                    RawTranscriptEntry { encoded_ascii, index: i as Field - length, length },
+                );
 
-            raw_transcript[transcript_ptr] = new_entry;
-            length = length * (1 - push_transcript) + increase_length;
-            transcript_ptr += (push_transcript != 0) as u32;
+                raw_transcript[transcript_ptr] = new_entry;
+                transcript_ptr += 1;
+                length = increase_length;
+            } else {
+                length += increase_length;
+            }
 
             previous_was_potential_escape_sequence = is_potential_escape_sequence;
 
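This is the "remove optimization" half of the commit title. The deleted branchless update made sense in constrained code, where an `if` is expensive, but `__build_transcript` is unconstrained, so a plain branch is both clearer and cheaper when generating witnesses. The general form of the deleted idiom, in isolation (select is illustrative, not from the repo):

// arithmetic select: yields `reset` when flag == 1 and `keep` when flag == 0,
// with no branch; useful under constraints, pointless in unconstrained code
fn select(flag: Field, reset: Field, keep: Field) -> Field {
    keep * (1 - flag) + reset * flag
}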
@@ -441,7 +442,7 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
         // if we end in a scan mode where we're searching for a number, string or a literal (true/false/null), we have an incomplete token and this is invalid JSON
         // NOTE: if we upgrade this parser to be able to process single-value JSON (e,g, "999" or ""hello" : "world"" this logic needs to be upgraded)
         assert(
-            scan_mode == GRAMMAR_SCAN as Field,
+            scan_mode == GRAMMAR_CAPTURE as Field,
             "build_transcript: incomplete token (number, string or literal)",
         );
 
@@ -455,28 +456,28 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
     * @details JSON_CAPTURE_TABLE takes the following as input:
     * 1. the ascii byte at the current location in the json
     * 2. the current scan mode (are we searching for grammar, strings, numbers or literals?)
-    * 3. could this byte potentially be an escape sequence? (i.e. the previous byte was a backslash character "\" and scan_mode == STRING_SCAN)
+    * 3. could this byte potentially be an escape sequence? (i.e. the previous byte was a backslash character "\" and scan_mode == STRING_CAPTURE)
     * The table outputs the following flags:
     * 1. what token have we scanned? (listed in enums::Token)
     * 2. should we push this token to the transcript (no push if token == NO_TOKEN)
     * 3. should we increase the length of the current entry we're evaluating?
-    *    (i.e. if token == STRING_TOKEN and scan_mode == STRING_SCAN, then increase the length because we're in the process of scanning a string)
-    * 4. is this scanned ascii character a potential escape sequence? i.e. scan_mode == STRING_SCAN and ascii = "\"
+    *    (i.e. if token == STRING_TOKEN and scan_mode == STRING_CAPTURE, then increase the length because we're in the process of scanning a string)
+    * 4. is this scanned ascii character a potential escape sequence? i.e. scan_mode == STRING_CAPTURE and ascii = "\"
     * 5. have we entered an error state? (i.e. invalid grammar e.g. ":" is followed by "}")
     *
     * NOTE: we represent error states in a nonstandard way to reduce gate count. Instead of handling an error flag,
     * an error state will increase the value of `scan_token` by 0x100000000. This will cause the next access into `JSON_CAPTURE_TABLE` to trigger an out of bounds error
     *
     * NOTE: the scanned transcript will be missing some edge cases that are caught via `swap_keys` and `capture_missing_tokens`:
-    * 1. If the scan mode is NUMERIC_SCAN or LITERAL_SCAN and the next character is a "," or "}" or "]",
+    * 1. If the scan mode is NUMERIC_CAPTURE or LITERAL_CAPTURE and the next character is a "," or "}" or "]",
     *    we will push a NUMERIC_TOKEN or LITERAL_TOKEN into the transcript but we will MISS the VALUE_SEPARATOR_TOKEN, END_OBJECT_TOKEN or END_ARRAY_TOKEN
     *    (accomodating this edge case requires conditionally pushing two transcript entries per iteration, so we do this in a separate step where we iterate over the transcript and not the json bytes)
     * 2. We can't yet tell if an entry is a KEY_TOKEN or a STRING_TOKEN. All keys are represented as STRING_TOKEN. This gets fixed after `swap_keys` is evaluated
     **/
     fn build_transcript(self) -> Self {
         let mut raw_transcript: [Field; MaxNumTokens] = [0; MaxNumTokens];
         let mut transcript_ptr: Field = 0;
-        let mut scan_mode = GRAMMAR_SCAN;
+        let mut scan_mode = GRAMMAR_CAPTURE;
         let mut length: Field = 0;
 
         // Safety: check the comments below
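As a worked reading of the table input encoding documented above (index = previous_was_potential_escape_sequence * 1024 + scan_mode * 256 + ascii, from `__build_transcript`; STRING_CAPTURE = 1 from enums.nr, ASCII 34 for the quote character):

#[test]
fn encoded_ascii_worked_example() {
    let quote_in_string = 0 * 1024 + 1 * 256 + 34; // '"' while scanning a string
    let escaped_quote = 1 * 1024 + 1 * 256 + 34;   // same byte right after a backslash
    assert(quote_in_string == 290);
    assert(escaped_quote == 1314);
    // distinct rows of the 2048-entry JSON_CAPTURE_TABLE, so the two cases
    // can emit different flags
}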
@@ -507,6 +508,7 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
                 RawTranscriptEntry { encoded_ascii, index: i as Field - length, length },
             );
             std::as_witness(diff);
+
             // 1 gate
             assert(diff * push_transcript == 0);
 
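For context on the constrained sibling of `__build_transcript` shown here: `std::as_witness(diff)` pins `diff` to a single witness so the multiplication that follows costs one gate. The conditional-equality pattern in isolation (assert_eq_if is illustrative, not from the repo):

// enforce `a == b` only when flag == 1 (flag is assumed boolean)
fn assert_eq_if(a: Field, b: Field, flag: Field) {
    let diff = a - b;
    std::as_witness(diff);
    assert(diff * flag == 0);
}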
src/json_tables.nr

Lines changed: 2 additions & 1 deletion
@@ -2255,10 +2255,11 @@ pub(crate) global TOKEN_VALIDATION_TABLE: [Field; 363] = [
     0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000,
     0x00, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000,
     0x01000000, 0x01000000, 0x01000000, 0x00, 0x01000000, 0x01000000, 0x01000000, 0x01000000,
+    0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x00, 0x01000000,
     0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000,
-    0x01000000, 0x01000000, 0x01000000, 0x00, 0x01000000, 0x01000000, 0x01000000, 0x01000000,
     0x01000000,
 ];
+
 pub(crate) global ASCII_TO_NUMBER: [u8; 128] = [
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     /* */ 0, /*"!"*/ 0, /* " */ 0, /*"#"*/ 0, /*"$"*/ 0, /*"%"*/ 0, /*"&"*/ 0, /*"'"*/ 0, /*"("*/ 0,
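The regenerated TOKEN_VALIDATION_TABLE moves one 0x00 (invalid transition) entry to a different slot, and the new test_make_validation_flags now pins this exact layout. How the flattened [Field; 363] is indexed is not spelled out in this diff, but assuming NUM_TOKENS = 11 so that 363 = 3 x 11 x 11 (one block per layer), a hypothetical index helper would look like:

// assumed layer-major layout over 11 x 11 (current, next) token pairs
fn validation_index(layer: u32, current_token: u32, next_token: u32) -> u32 {
    layer * 121 + current_token * 11 + next_token
}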

src/transcript_entry.nr

Lines changed: 1 addition & 3 deletions
@@ -116,14 +116,12 @@ impl ScanData {
     unconstrained fn __from_field(f: Field) -> Self {
         let bytes: [u8; 6] = f.to_le_bytes();
 
-        let mut scan_token = bytes[0] as Field;
+        let scan_token: Field = bytes[0] as Field;
         let push_transcript = bytes[1] as Field;
         let increase_length = bytes[2] as Field;
         let is_potential_escape_sequence = bytes[3] as Field;
         let error = bytes[4] as Field * 0x100 + bytes[5] as Field;
         assert(error == 0, "ScanData: Invalid token");
-        // TODO document this
-        scan_token = scan_token + error * 0x100000000;
         ScanData { scan_token, push_transcript, increase_length, is_potential_escape_sequence }
     }
 
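The deleted addition was dead code: `error` is asserted to be zero on the line above, so `scan_token + error * 0x100000000` always equaled `scan_token`. The byte layout that `__from_field` unpacks, as a sketch with illustrative values:

#[test]
fn scan_data_layout_example() {
    // little-endian packing: byte 0 = scan_token, byte 1 = push_transcript,
    // byte 2 = increase_length, byte 3 = is_potential_escape_sequence,
    // bytes 4-5 = error (must be zero)
    let packed: Field = 2 + 1 * 0x100; // scan_token = 2, push_transcript = 1
    let bytes: [u8; 6] = packed.to_le_bytes();
    assert(bytes[0] == 2);
    assert(bytes[1] == 1);
    assert(bytes[4] as Field * 0x100 + bytes[5] as Field == 0);
}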