Skip to content

Commit 7dc7bda

Browse files
committed
remove unused function lt_field_8_bit
1 parent 5cb2b53 commit 7dc7bda

File tree

4 files changed

+55
-19
lines changed

4 files changed

+55
-19
lines changed

README.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,43 @@ The parser uses several lookup tables generated from `src/_table_generation/`:
142142
- `JSON_CAPTURE_TABLE`: Character-by-character parsing rules
143143
- `TOKEN_VALIDATION_TABLE`: JSON grammar validation
144144

145+
### Example walkthrough
146+
We can take a look at the raw JSON text {"name": "Alice", "age": 30} and how it is parsed.
147+
First, the parser reads the JSON one character at a time and uses lookup tables to decide what to do with each character. For {"name": "Alice"}:
148+
Character: { → "Start scanning an object (grammar_capture)"
149+
Character: " → "Start scanning a string"
150+
Character: n → "Continue scanning the string"
151+
Character: a → "Continue scanning the string"
152+
Character: m → "Continue scanning the string"
153+
Character: e → "Continue scanning the string"
154+
Character: " → "End the string"
155+
Character: : → "Key-value separator"
156+
Character: " → "Start scanning a string"
157+
Character: A → "Continue scanning the string"
158+
Character: l → "Continue scanning the string"
159+
Character: i → "Continue scanning the string"
160+
Character: c → "Continue scanning the string"
161+
Character: e → "Continue scanning the string"
162+
Character: " → "End the string"
163+
Character: } → "End the object"
164+
165+
The parser builds a list of "tokens", the basic building blocks of the JSON, which becomes
166+
1. BEGIN_OBJECT_TOKEN ({)
167+
2. STRING_TOKEN ("name")
168+
3. KEY_SEPARATOR_TOKEN (:)
169+
4. STRING_TOKEN ("Alice")
170+
5. END_OBJECT_TOKEN (})
171+
172+
The parser converts tokens into structured entries with parent-child relationships.
173+
Each entry knows:
174+
What type it is (object, string, number, etc.)
175+
Who its parent is
176+
How many children it has
177+
Where it is in the original JSON
178+
179+
Finally, the parser sorts entries by their key hashes for fast lookups.
180+
Original order: [{"name": "Alice"}, {"age": 30}]
181+
Sorted order: [{"age": 30}, {"name": "Alice"}]
145182

146183
# Acknowledgements
147184

src/_comparison_tools/lt.nr

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -54,18 +54,6 @@ pub fn lt_field_16_bit(x: Field, y: Field) -> bool {
5454
predicate
5555
}
5656

57-
pub fn lt_field_8_bit(x: Field, y: Field) -> bool {
58-
// Safety: check the comments below
59-
let predicate = unsafe { get_lt_predicate_f(x, y) };
60-
let delta = y as Field - x as Field;
61-
let lt_parameter = 2 * (predicate as Field) * delta - predicate as Field - delta;
62-
// checks that the bit length of lt_parameter is 8
63-
// i.e. checks the sign of lt_parameter
64-
lt_parameter.assert_max_bit_size::<8>();
65-
66-
predicate
67-
}
68-
6957
pub fn assert_gt_240_bit(lhs: Field, rhs: Field) {
7058
// lhs > rhs
7159
// -> lhs - rhs > 0

src/_table_generation/table_generation.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,4 +66,6 @@ LITERAL_CAPTURE_INCREASE_LENGTH: True for t,r,u,e,f,a,l,s,n
6666
GRAMMAR_CAPTURE_ERROR_FLAG
6767
STRING_CAPTURE_ERROR_FLAG
6868
NUMERIC_CAPTURE_ERROR_FLAG
69-
LITERAL_CAPTURE_ERROR_FLAG
69+
LITERAL_CAPTURE_ERROR_FLAG
70+
71+
PROCESS_RAW_TRANSCRIPT_TABLE: This table is used to post-process the raw transcript and add missing grammar tokens that were not captured during the initial scanning in build_transcript. Input: the encoded_ascii of the last token in each entry (scan_mode + ascii character). Output: a value containing: token, the token type for this entry; new_grammar, whether to add a missing grammar token; and scan_token, the type of grammar token to add (if needed), such as END_OBJECT_TOKEN } or VALUE_SEPARATOR_TOKEN comma.

src/json.nr

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -540,30 +540,36 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
540540
unconstrained fn __build_transcript(self) -> [Field; MaxNumTokens] {
541541
let mut raw_transcript: [Field; MaxNumTokens] = [0; MaxNumTokens];
542542
let mut transcript_ptr: u32 = 0;
543+
// We start in grammar capture mode, expecting to see a { or [.
543544
let mut scan_mode = GRAMMAR_CAPTURE;
544545
let mut length: Field = 0;
545546
let mut previous_was_potential_escape_sequence = 0;
546547
for i in 0..NumBytes {
547-
// while this assert is in an unconstrained function, the out of bounds accesss `raw_transcript[transcript_ptr]` in build_transcript also generates failing constraints
548+
// while this assert is in an unconstrained function, the out of bounds access `raw_transcript[transcript_ptr]` in build_transcript also generates failing constraints
548549
assert(transcript_ptr < MaxNumTokens, "build_transcript: MaxNumTokens limit exceeded!");
549550
let ascii = self.json[i];
550551

551552
let encoded_ascii =
552553
previous_was_potential_escape_sequence * 1024 + scan_mode * 256 + ascii as Field;
553554
let ScanData { scan_token, push_transcript, increase_length, is_potential_escape_sequence } =
554555
ScanData::from_field(JSON_CAPTURE_TABLE[cast_num_to_u32(encoded_ascii)]);
556+
// increase_length and push_transcript are contradictory
557+
// increase_length = true means "extend the current token"
558+
// push_transcript = true means "start a new token"
555559
let mut push_transcript = push_transcript;
556560
let mut scan_token = scan_token;
557561
let mut increase_length = increase_length;
558562

559563
if push_transcript == 1 {
560564
let new_entry = RawTranscriptEntry::to_field(
565+
// index is where the token starts in the original JSON
561566
RawTranscriptEntry { encoded_ascii, index: i as Field - length, length },
562567
);
563568

564569
raw_transcript[transcript_ptr] = new_entry;
565570
transcript_ptr += 1;
566-
length = increase_length;
571+
// reset length to 0 as we're starting a new token
572+
length = 0;
567573
} else {
568574
length += increase_length;
569575
}
@@ -616,7 +622,8 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
616622

617623
// Safety: check the comments below
618624
let raw_transcript = unsafe { self.__build_transcript() };
619-
625+
626+
// steps to verify the transcript is correct
620627
// 14 gates per iteration, plus fixed cost for initing 2,048 size lookup table (4,096 gates)
621628
let mut previous_was_potential_escape_sequence = 0;
622629
for i in 0..NumBytes {
@@ -659,10 +666,10 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
659666
scan_mode = scan_token;
660667
}
661668

662-
// we encode error flag into the scan_token value, which must be less than 4
669+
// we encode error flag into the scan_token value, which must be less than 4 (object, array, string, literal)
663670
// the lookup into JSON_CAPTURE_TABLE applies an implicit 2-bit range check on `scan_token`
664671
// however this does not get triggered if the final byte scanned produces an error state
665-
length.assert_max_bit_size::<2>();
672+
scan_mode.assert_max_bit_size::<2>();
666673

667674
JSON {
668675
json: self.json,
@@ -689,6 +696,7 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
689696
let mut transcript_ptr: u32 = 0;
690697
// TODO: do we need a null transcript value?!?!
691698
for i in 0..MaxNumTokens {
699+
// encoded_ascii is the encoded scan_mode + ascii character of the LAST token in the entry
692700
let RawTranscriptEntry { encoded_ascii, index, length } =
693701
RawTranscriptEntry::from_field(self.raw_transcript[i]);
694702

@@ -722,7 +730,8 @@ impl<let NumBytes: u32, let NumPackedFields: u32, let MaxNumTokens: u32, let Max
722730
* @brief Check for missing tokens that we could have missed in `build_transcript`
723731
* @details If we had a json string where a NUMERIC_TOKEN or LITERAL_TOKEN is directly succeeded by a VALUE_SEPARATOR_TOKEN, END_OBJECT_TOKEN, END_ARRAY_TOKEN,
724732
* we will have missed the latter token.
725-
* We pick these up via the lookup table PROCESS_RAW_TRANSCRIPT_TABLE
733+
* We pick these up via the lookup table PROCESS_RAW_TRANSCRIPT_TABLE.
734+
* The entries in self.raw_transcript currently look like false}, true], null, where the grammar tokens are counted as part of the token.
726735
**/
727736
fn capture_missing_tokens(&mut self) {
728737
let mut transcript_ptr: Field = 0;

0 commit comments

Comments
 (0)