Commit b494199

chore: remove scan_mode + remove optimization + add table generation tests (#59)
1 parent dc8ad05 · commit b494199

7 files changed (+74, -66 lines)

src/_table_generation/make_tables.nr

Lines changed: 44 additions & 40 deletions
@@ -1,18 +1,16 @@
 //! Contains methods used to generate tables in `json_tables.nr`. These table generation methods shouldn't be used inside of actual circuits.
-
-pub(crate) mod CaptureMode {
-    pub(crate) global GRAMMAR_CAPTURE: Field = 0;
-    pub(crate) global STRING_CAPTURE: Field = 1;
-    pub(crate) global NUMERIC_CAPTURE: Field = 2;
-    pub(crate) global LITERAL_CAPTURE: Field = 3;
-    pub(crate) global ERROR_CAPTURE: Field = 4;
-}
+use crate::enums::CaptureMode::STRING_CAPTURE;
 use crate::enums::Layer::{ARRAY_LAYER, OBJECT_LAYER, SINGLE_VALUE_LAYER};
 use crate::enums::Token::{
     BEGIN_ARRAY_TOKEN, BEGIN_OBJECT_TOKEN, END_ARRAY_TOKEN, END_OBJECT_TOKEN, KEY_SEPARATOR_TOKEN,
     KEY_TOKEN, LITERAL_TOKEN, NO_TOKEN, NUM_TOKENS, NUMERIC_TOKEN, STRING_TOKEN,
     VALUE_SEPARATOR_TOKEN,
 };
+use crate::json_tables::ASCII_TO_TOKEN_TABLE;
+use crate::json_tables::JSON_CAPTURE_TABLE;
+use crate::json_tables::PROCESS_RAW_TRANSCRIPT_TABLE;
+use crate::json_tables::TOKEN_FLAGS_TABLE;
+use crate::json_tables::TOKEN_VALIDATION_TABLE;
 use crate::token_flags::TokenFlags;
 use crate::transcript_entry::ValidationFlags;
 use super::make_tables_subtables::{
@@ -24,7 +22,6 @@ use super::make_tables_subtables::{
     STRING_CAPTURE_ERROR_FLAG, STRING_CAPTURE_INCREASE_LENGTH, STRING_CAPTURE_PUSH_TRANSCRIPT,
     STRING_CAPTURE_TABLE, STRING_CAPTURE_TOKEN, TOKEN_IS_NUMERIC_OR_LITERAL,
 };
-use CaptureMode::STRING_CAPTURE;
 
 global CAPTURE_TABLE: [[Field; 128]; 4] =
     [GRAMMAR_CAPTURE_TABLE, STRING_CAPTURE_TABLE, NUMERIC_CAPTURE_TABLE, LITERAL_CAPTURE_TABLE];
@@ -324,16 +321,6 @@ unconstrained fn make_token_validation_table() -> [Field; NUM_TOKENS * NUM_TOKEN
     flattened_flags
 }
 
-// #[test]
-// fn test_make_validation_flags() {
-//     let f = make_token_validation_table();
-//     println(f"global TOKEN_VALIDATION_TABLE: [Field; 363] = {f};");
-// }
-// #[test]
-// fn test_make_ascii_to_token_table() {
-//     let r = make_ascii_to_token_table();
-//     println(f"table = {r}");
-// }
 unconstrained fn make_json_capture_table() -> [Field; 2048] {
     let backslash: u32 = "\\".as_bytes()[0] as u32;
     let quotes: u32 = "\"".as_bytes()[0] as u32;
@@ -373,11 +360,6 @@ unconstrained fn make_json_capture_table() -> [Field; 2048] {
 
     result
 }
-// #[test]
-// fn test_make_JSON_CAPTURE_TABLE() {
-//     let r = make_JSON_CAPTURE_TABLE();
-//     println(f"table = {r}");
-// }
 
 unconstrained fn make_process_raw_transcript_table() -> [Field; 1024] {
     let mut result: [Field; 1024] = [0; 1024];
@@ -397,11 +379,6 @@ unconstrained fn make_process_raw_transcript_table() -> [Field; 1024] {
     }
     result
 }
-// #[test]
-// fn test_make_process_raw_transcript_table() {
-//     let r = make_process_raw_transcript_table();
-//     println(f"table = {r}");
-// }
 
 unconstrained fn generate_token_flags_table() -> [Field; NUM_TOKENS * 2] {
     let mut flags: [TokenFlags; NUM_TOKENS * 2] = [TokenFlags::default(); NUM_TOKENS * 2];
@@ -531,14 +508,41 @@
     result
 }
 
-// #[test]
-// fn test_generate_token_flags_table() {
-//     let r = generate_token_flags_table();
-//     println(f"global TOKEN_FLAGS_TABLE: [Field; NUM_TOKENS_MUL_2] = {r};");
-// }
-
-// #[test]
-// fn test_make_json_capture_table() {
-//     let r = make_json_capture_table();
-//     println(f"global JSON_CAPTURE_TABLE: [Field; 2048] = {r};");
-// }
+#[test]
+fn test_generate_token_flags_table() {
+    unsafe {
+        let r = generate_token_flags_table();
+        assert(r == TOKEN_FLAGS_TABLE);
+    }
+}
+
+#[test]
+fn test_make_json_capture_table() {
+    unsafe {
+        let r = make_json_capture_table();
+        assert(r == JSON_CAPTURE_TABLE);
+    }
+}
+
+#[test]
+fn test_make_validation_flags() {
+    unsafe {
+        let f = make_token_validation_table();
+        assert(f == TOKEN_VALIDATION_TABLE);
+    }
+}
+#[test]
+fn test_make_ascii_to_token_table() {
+    unsafe {
+        let r = make_ascii_to_token_table();
+        assert(r == ASCII_TO_TOKEN_TABLE);
+    }
+}
+
+#[test]
+fn test_make_process_raw_transcript_table() {
+    unsafe {
+        let r = make_process_raw_transcript_table();
+        assert(r == PROCESS_RAW_TRANSCRIPT_TABLE);
+    }
+}
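Each formerly commented-out generator is now a real regression test: it re-runs the unconstrained table builder inside an `unsafe` block and asserts the output equals the corresponding global baked into `json_tables.nr`, so a stale table makes `nargo test` fail. The same pattern in miniature, with illustrative names (DOUBLE_TABLE and make_double_table are not part of this crate):

global DOUBLE_TABLE: [Field; 4] = [0, 2, 4, 6];

unconstrained fn make_double_table() -> [Field; 4] {
    // regenerate the table from scratch
    let mut result: [Field; 4] = [0; 4];
    for i in 0..4 {
        result[i] = (i as Field) * 2;
    }
    result
}

#[test]
fn test_make_double_table() {
    // unconstrained functions may only be called from an unsafe block
    unsafe {
        let r = make_double_table();
        assert(r == DOUBLE_TABLE);
    }
}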

src/_table_generation/make_tables_subtables.nr

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-use crate::_table_generation::make_tables::CaptureMode::{
+use crate::enums::CaptureMode::{
     ERROR_CAPTURE, GRAMMAR_CAPTURE, LITERAL_CAPTURE, NUMERIC_CAPTURE, STRING_CAPTURE,
 };
 use crate::enums::Token::{

src/benchmarks/benchmarks.nr

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@ use crate::{JSON16kb, JSON512b, json::JSON};
 comptime fn make_bench(m: Module, params: Quoted) -> Quoted {
     let module_name = m.name();
     let parse_json_from_string = f"parse_json_from_string_{module_name}".quoted_contents();
-    let parse_json = f"parse_json_{module_name}".quoted_contents();
     let get_array = f"get_array_{module_name}".quoted_contents();
     let get_object = f"get_object_{module_name}".quoted_contents();
     let get_literal = f"get_literal_{module_name}".quoted_contents();
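For context, `make_bench` is a comptime function that stamps out per-module benchmark wrappers by building identifiers from the module name; the `parse_json` binding is deleted here, presumably because the generated code no longer references it. A hedged sketch of the idiom (make_hello is illustrative, not from this repo; assumes Noir's `$` splicing inside `quote`):

comptime fn make_hello(m: Module) -> Quoted {
    let module_name = m.name();
    // build an identifier such as `hello_foo` from the module name
    let fn_name = f"hello_{module_name}".quoted_contents();
    quote {
        fn $fn_name() {}
    }
}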

src/enums.nr

Lines changed: 6 additions & 2 deletions
@@ -1,5 +1,9 @@
-pub(crate) mod ScanMode {
-    pub(crate) global GRAMMAR_SCAN: Field = 0;
+pub(crate) mod CaptureMode {
+    pub(crate) global GRAMMAR_CAPTURE: Field = 0;
+    pub(crate) global STRING_CAPTURE: Field = 1;
+    pub(crate) global NUMERIC_CAPTURE: Field = 2;
+    pub(crate) global LITERAL_CAPTURE: Field = 3;
+    pub(crate) global ERROR_CAPTURE: Field = 4;
 }
 
 pub(crate) mod Token {
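The renamed constants keep their 0 through 4 values, so they can still be used directly as row indices, as CAPTURE_TABLE in make_tables.nr does. A sketch of that indexing (capture_row is a hypothetical helper, assuming the 4-row layout shown above):

// rows 0-3 hold the GRAMMAR/STRING/NUMERIC/LITERAL capture tables;
// ERROR_CAPTURE (4) is deliberately out of range, matching the parser's
// documented trick of turning error states into out-of-bounds lookups
fn capture_row(table: [[Field; 128]; 4], mode: Field) -> [Field; 128] {
    table[mode as u32]
}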

src/json.nr

Lines changed: 20 additions & 18 deletions
@@ -1,6 +1,6 @@
 use crate::_comparison_tools::bounds_checker::get_validity_flags;
+use crate::enums::CaptureMode::GRAMMAR_CAPTURE;
 use crate::enums::Layer::{ARRAY_LAYER, OBJECT_LAYER, SINGLE_VALUE_LAYER};
-use crate::enums::ScanMode::GRAMMAR_SCAN;
 use crate::enums::Token::{
     BEGIN_ARRAY_TOKEN, BEGIN_OBJECT_TOKEN, KEY_SEPARATOR_TOKEN, KEY_TOKEN, LITERAL_TOKEN,
     NUM_TOKENS, NUMERIC_TOKEN, STRING_TOKEN,
@@ -77,8 +77,8 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
         }
     }
 
-    // TODO: casting entry_ptr to u16 is kind of expensive when generating witnesses, can we fix?
     unconstrained fn __check_entry_ptr_bounds(entry_ptr: Field, max: u32) {
+        entry_ptr.assert_max_bit_size::<32>();
         // n.b. even though this assert is in an unconstrained function, an out of bounds error will be triggered when writing into self.key_data[entry_ptr]
         assert(entry_ptr as u32 < max - 1, "create_json_entries: MaxNumValues limit exceeded!");
     }
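The added `assert_max_bit_size::<32>()` makes the later `entry_ptr as u32` cast safe: Field-to-integer casts truncate, so an oversized `entry_ptr` could otherwise wrap past the bounds check. A minimal sketch of the guarded cast (checked_cast is illustrative):

unconstrained fn checked_cast(ptr: Field) -> u32 {
    // without the bit-size assertion, e.g. ptr = 0x100000001 (2^32 + 1)
    // would truncate to 1 and sail past any `< max` comparison
    ptr.assert_max_bit_size::<32>();
    ptr as u32
}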
@@ -409,7 +409,7 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
     unconstrained fn __build_transcript(self) -> [Field; MaxNumTokens] {
         let mut raw_transcript: [Field; MaxNumTokens] = [0; MaxNumTokens];
         let mut transcript_ptr: u32 = 0;
-        let mut scan_mode = GRAMMAR_SCAN as Field;
+        let mut scan_mode = GRAMMAR_CAPTURE;
         let mut length: Field = 0;
         let mut previous_was_potential_escape_sequence = 0;
         for i in 0..NumBytes {
@@ -421,17 +421,18 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
                 previous_was_potential_escape_sequence * 1024 + scan_mode * 256 + ascii as Field;
             let ScanData { scan_token, push_transcript, increase_length, is_potential_escape_sequence } =
                 ScanData::from_field(JSON_CAPTURE_TABLE[cast_num_to_u32(encoded_ascii)]);
-            let mut push_transcript = push_transcript;
-            let mut scan_token = scan_token;
-            let mut increase_length = increase_length;
 
-            let new_entry = RawTranscriptEntry::to_field(
-                RawTranscriptEntry { encoded_ascii, index: i as Field - length, length },
-            );
+            if push_transcript == 1 {
+                let new_entry = RawTranscriptEntry::to_field(
+                    RawTranscriptEntry { encoded_ascii, index: i as Field - length, length },
+                );
 
-            raw_transcript[transcript_ptr] = new_entry;
-            length = length * (1 - push_transcript) + increase_length;
-            transcript_ptr += (push_transcript != 0) as u32;
+                raw_transcript[transcript_ptr] = new_entry;
+                transcript_ptr += 1;
+                length = increase_length;
+            } else {
+                length += increase_length;
+            }
 
             previous_was_potential_escape_sequence = is_potential_escape_sequence;
 
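This is the "remove optimization" half of the commit title. The deleted branchless update made sense in constrained code, where an `if` is expensive, but `__build_transcript` is unconstrained, so a plain branch is both clearer and cheaper when generating witnesses. The general form of the deleted idiom, in isolation (select is illustrative, not from the repo):

// arithmetic select: yields `reset` when flag == 1 and `keep` when flag == 0,
// with no branch; useful under constraints, pointless in unconstrained code
fn select(flag: Field, reset: Field, keep: Field) -> Field {
    keep * (1 - flag) + reset * flag
}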
@@ -441,7 +442,7 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
         // if we end in a scan mode where we're searching for a number, string or a literal (true/false/null), we have an incomplete token and this is invalid JSON
         // NOTE: if we upgrade this parser to be able to process single-value JSON (e,g, "999" or ""hello" : "world"" this logic needs to be upgraded)
         assert(
-            scan_mode == GRAMMAR_SCAN as Field,
+            scan_mode == GRAMMAR_CAPTURE as Field,
             "build_transcript: incomplete token (number, string or literal)",
         );
 
@@ -455,28 +456,28 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
     * @details JSON_CAPTURE_TABLE takes the following as input:
     * 1. the ascii byte at the current location in the json
     * 2. the current scan mode (are we searching for grammar, strings, numbers or literals?)
-    * 3. could this byte potentially be an escape sequence? (i.e. the previous byte was a backslash character "\" and scan_mode == STRING_SCAN)
+    * 3. could this byte potentially be an escape sequence? (i.e. the previous byte was a backslash character "\" and scan_mode == STRING_CAPTURE)
     * The table outputs the following flags:
     * 1. what token have we scanned? (listed in enums::Token)
     * 2. should we push this token to the transcript (no push if token == NO_TOKEN)
     * 3. should we increase the length of the current entry we're evaluating?
-    *    (i.e. if token == STRING_TOKEN and scan_mode == STRING_SCAN, then increase the length because we're in the process of scanning a string)
-    * 4. is this scanned ascii character a potential escape sequence? i.e. scan_mode == STRING_SCAN and ascii = "\"
+    *    (i.e. if token == STRING_TOKEN and scan_mode == STRING_CAPTURE, then increase the length because we're in the process of scanning a string)
+    * 4. is this scanned ascii character a potential escape sequence? i.e. scan_mode == STRING_CAPTURE and ascii = "\"
     * 5. have we entered an error state? (i.e. invalid grammar e.g. ":" is followed by "}")
     *
     * NOTE: we represent error states in a nonstandard way to reduce gate count. Instead of handling an error flag,
     * an error state will increase the value of `scan_token` by 0x100000000. This will cause the next access into `JSON_CAPTURE_TABLE` to trigger an out of bounds error
     *
     * NOTE: the scanned transcript will be missing some edge cases that are caught via `swap_keys` and `capture_missing_tokens`:
-    * 1. If the scan mode is NUMERIC_SCAN or LITERAL_SCAN and the next character is a "," or "}" or "]",
+    * 1. If the scan mode is NUMERIC_CAPTURE or LITERAL_CAPTURE and the next character is a "," or "}" or "]",
     *    we will push a NUMERIC_TOKEN or LITERAL_TOKEN into the transcript but we will MISS the VALUE_SEPARATOR_TOKEN, END_OBJECT_TOKEN or END_ARRAY_TOKEN
     *    (accomodating this edge case requires conditionally pushing two transcript entries per iteration, so we do this in a separate step where we iterate over the transcript and not the json bytes)
     * 2. We can't yet tell if an entry is a KEY_TOKEN or a STRING_TOKEN. All keys are represented as STRING_TOKEN. This gets fixed after `swap_keys` is evaluated
     **/
     fn build_transcript(self) -> Self {
         let mut raw_transcript: [Field; MaxNumTokens] = [0; MaxNumTokens];
         let mut transcript_ptr: Field = 0;
-        let mut scan_mode = GRAMMAR_SCAN;
+        let mut scan_mode = GRAMMAR_CAPTURE;
         let mut length: Field = 0;
 
         // Safety: check the comments below
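As a worked reading of the table input encoding documented above (index = previous_was_potential_escape_sequence * 1024 + scan_mode * 256 + ascii, from `__build_transcript`; STRING_CAPTURE = 1 from enums.nr, ASCII 34 for the quote character):

#[test]
fn encoded_ascii_worked_example() {
    let quote_in_string = 0 * 1024 + 1 * 256 + 34; // '"' while scanning a string
    let escaped_quote = 1 * 1024 + 1 * 256 + 34;   // same byte right after a backslash
    assert(quote_in_string == 290);
    assert(escaped_quote == 1314);
    // distinct rows of the 2048-entry JSON_CAPTURE_TABLE, so the two cases
    // can emit different flags
}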
@@ -507,6 +508,7 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
                 RawTranscriptEntry { encoded_ascii, index: i as Field - length, length },
             );
             std::as_witness(diff);
+
             // 1 gate
             assert(diff * push_transcript == 0);
 
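For context on the constrained sibling of `__build_transcript` shown here: `std::as_witness(diff)` pins `diff` to a single witness so the multiplication that follows costs one gate. The conditional-equality pattern in isolation (assert_eq_if is illustrative, not from the repo):

// enforce `a == b` only when flag == 1 (flag is assumed boolean)
fn assert_eq_if(a: Field, b: Field, flag: Field) {
    let diff = a - b;
    std::as_witness(diff);
    assert(diff * flag == 0);
}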
src/json_tables.nr

Lines changed: 2 additions & 1 deletion
@@ -2255,10 +2255,11 @@ pub(crate) global TOKEN_VALIDATION_TABLE: [Field; 363] = [
     0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000,
     0x00, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000,
     0x01000000, 0x01000000, 0x01000000, 0x00, 0x01000000, 0x01000000, 0x01000000, 0x01000000,
+    0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x00, 0x01000000,
     0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000,
-    0x01000000, 0x01000000, 0x01000000, 0x00, 0x01000000, 0x01000000, 0x01000000, 0x01000000,
     0x01000000,
 ];
+
 pub(crate) global ASCII_TO_NUMBER: [u8; 128] = [
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     /* */ 0, /*"!"*/ 0, /* " */ 0, /*"#"*/ 0, /*"$"*/ 0, /*"%"*/ 0, /*"&"*/ 0, /*"'"*/ 0, /*"("*/ 0,
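The regenerated TOKEN_VALIDATION_TABLE moves one 0x00 (invalid transition) entry to a different slot, and the new test_make_validation_flags now pins this exact layout. How the flattened [Field; 363] is indexed is not spelled out in this diff, but assuming NUM_TOKENS = 11 so that 363 = 3 x 11 x 11 (one block per layer), a hypothetical index helper would look like:

// assumed layer-major layout over 11 x 11 (current, next) token pairs
fn validation_index(layer: u32, current_token: u32, next_token: u32) -> u32 {
    layer * 121 + current_token * 11 + next_token
}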

src/transcript_entry.nr

Lines changed: 1 addition & 3 deletions
@@ -116,14 +116,12 @@ impl ScanData {
     unconstrained fn __from_field(f: Field) -> Self {
         let bytes: [u8; 6] = f.to_le_bytes();
 
-        let mut scan_token = bytes[0] as Field;
+        let scan_token: Field = bytes[0] as Field;
         let push_transcript = bytes[1] as Field;
         let increase_length = bytes[2] as Field;
         let is_potential_escape_sequence = bytes[3] as Field;
         let error = bytes[4] as Field * 0x100 + bytes[5] as Field;
         assert(error == 0, "ScanData: Invalid token");
-        // TODO document this
-        scan_token = scan_token + error * 0x100000000;
         ScanData { scan_token, push_transcript, increase_length, is_potential_escape_sequence }
     }
 
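The deleted addition was dead code: `error` is asserted to be zero on the line above, so `scan_token + error * 0x100000000` always equaled `scan_token`. The byte layout that `__from_field` unpacks, as a sketch with illustrative values:

#[test]
fn scan_data_layout_example() {
    // little-endian packing: byte 0 = scan_token, byte 1 = push_transcript,
    // byte 2 = increase_length, byte 3 = is_potential_escape_sequence,
    // bytes 4-5 = error (must be zero)
    let packed: Field = 2 + 1 * 0x100; // scan_token = 2, push_transcript = 1
    let bytes: [u8; 6] = packed.to_le_bytes();
    assert(bytes[0] == 2);
    assert(bytes[1] == 1);
    assert(bytes[4] as Field * 0x100 + bytes[5] as Field == 0);
}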