Skip to content

Commit 12db7e8

Browse files
chore(parser): optimize the most common lexer matches into lookup tables (#814)
* Use lookup table for common ASCII values * Remove unnecessary lifetime limit --------- Co-authored-by: Renée <[email protected]>
1 parent e49b34d commit 12db7e8

File tree

3 files changed

+129
-91
lines changed

3 files changed

+129
-91
lines changed
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
use crate::TokenKind;
2+
3+
static PUNCTUATION_CHARS: [Option<TokenKind>; 256] = punctuation_lut();
4+
static NAMESTART_CHARS: [bool; 256] = namestart_lut();
5+
6+
#[inline]
7+
pub(crate) fn punctuation_kind(c: char) -> Option<TokenKind> {
8+
if c.is_ascii() {
9+
PUNCTUATION_CHARS[c as usize]
10+
} else {
11+
None
12+
}
13+
}
14+
15+
#[inline]
16+
pub(crate) fn is_namestart(c: char) -> bool {
17+
c.is_ascii() && NAMESTART_CHARS[c as usize]
18+
}
19+
20+
const fn punctuation_lut() -> [Option<TokenKind>; 256] {
21+
let mut lut = [None; 256];
22+
lut[b'{' as usize] = Some(TokenKind::LCurly);
23+
lut[b'}' as usize] = Some(TokenKind::RCurly);
24+
lut[b'!' as usize] = Some(TokenKind::Bang);
25+
lut[b'$' as usize] = Some(TokenKind::Dollar);
26+
lut[b'&' as usize] = Some(TokenKind::Amp);
27+
lut[b'(' as usize] = Some(TokenKind::LParen);
28+
lut[b')' as usize] = Some(TokenKind::RParen);
29+
lut[b':' as usize] = Some(TokenKind::Colon);
30+
lut[b',' as usize] = Some(TokenKind::Comma);
31+
lut[b'[' as usize] = Some(TokenKind::LBracket);
32+
lut[b']' as usize] = Some(TokenKind::RBracket);
33+
lut[b'=' as usize] = Some(TokenKind::Eq);
34+
lut[b'@' as usize] = Some(TokenKind::At);
35+
lut[b'|' as usize] = Some(TokenKind::Pipe);
36+
37+
lut
38+
}
39+
40+
/// <https://spec.graphql.org/October2021/#NameStart>
///
/// Builds a compile-time table marking every ASCII byte that can start a
/// GraphQL name: `A-Z`, `a-z`, and `_`.
const fn namestart_lut() -> [bool; 256] {
    let mut lut = [false; 256];

    // `const fn` cannot use `for` loops (iterators are not const-evaluable),
    // but `while` loops are allowed, so walk each letter range by byte value
    // instead of spelling out all 52 assignments.
    let mut b = b'a';
    while b <= b'z' {
        lut[b as usize] = true;
        b += 1;
    }

    let mut b = b'A';
    while b <= b'Z' {
        lut[b as usize] = true;
        b += 1;
    }

    lut[b'_' as usize] = true;

    lut
}

crates/apollo-parser/src/lexer/mod.rs

Lines changed: 28 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
mod cursor;
2+
mod lookup;
23
mod token;
34
mod token_kind;
45

@@ -146,6 +147,26 @@ impl<'a> Cursor<'a> {
146147
};
147148
match state {
148149
State::Start => {
150+
if let Some(t) = lookup::punctuation_kind(c) {
151+
token.kind = t;
152+
token.data = self.current_str();
153+
return Ok(token);
154+
}
155+
156+
if lookup::is_namestart(c) {
157+
token.kind = TokenKind::Name;
158+
state = State::Ident;
159+
160+
continue;
161+
}
162+
163+
if c != '0' && c.is_ascii_digit() {
164+
token.kind = TokenKind::Int;
165+
state = State::IntegerPart;
166+
167+
continue;
168+
}
169+
149170
match c {
150171
'"' => {
151172
token.kind = TokenKind::StringValue;
@@ -159,14 +180,6 @@ impl<'a> Cursor<'a> {
159180
token.kind = TokenKind::Spread;
160181
state = State::SpreadOperator;
161182
}
162-
c if is_whitespace_assimilated(c) => {
163-
token.kind = TokenKind::Whitespace;
164-
state = State::Whitespace;
165-
}
166-
c if is_name_start(c) => {
167-
token.kind = TokenKind::Name;
168-
state = State::Ident;
169-
}
170183
'-' => {
171184
token.kind = TokenKind::Int;
172185
state = State::MinusSign;
@@ -175,79 +188,9 @@ impl<'a> Cursor<'a> {
175188
token.kind = TokenKind::Int;
176189
state = State::LeadingZero;
177190
}
178-
c if c.is_ascii_digit() => {
179-
token.kind = TokenKind::Int;
180-
state = State::IntegerPart;
181-
}
182-
'!' => {
183-
token.kind = TokenKind::Bang;
184-
token.data = self.current_str();
185-
return Ok(token);
186-
}
187-
'$' => {
188-
token.kind = TokenKind::Dollar;
189-
token.data = self.current_str();
190-
return Ok(token);
191-
}
192-
'&' => {
193-
token.kind = TokenKind::Amp;
194-
token.data = self.current_str();
195-
return Ok(token);
196-
}
197-
'(' => {
198-
token.kind = TokenKind::LParen;
199-
token.data = self.current_str();
200-
return Ok(token);
201-
}
202-
')' => {
203-
token.kind = TokenKind::RParen;
204-
token.data = self.current_str();
205-
return Ok(token);
206-
}
207-
':' => {
208-
token.kind = TokenKind::Colon;
209-
token.data = self.current_str();
210-
return Ok(token);
211-
}
212-
',' => {
213-
token.kind = TokenKind::Comma;
214-
token.data = self.current_str();
215-
return Ok(token);
216-
}
217-
'=' => {
218-
token.kind = TokenKind::Eq;
219-
token.data = self.current_str();
220-
return Ok(token);
221-
}
222-
'@' => {
223-
token.kind = TokenKind::At;
224-
token.data = self.current_str();
225-
return Ok(token);
226-
}
227-
'[' => {
228-
token.kind = TokenKind::LBracket;
229-
token.data = self.current_str();
230-
return Ok(token);
231-
}
232-
']' => {
233-
token.kind = TokenKind::RBracket;
234-
token.data = self.current_str();
235-
return Ok(token);
236-
}
237-
'{' => {
238-
token.kind = TokenKind::LCurly;
239-
token.data = self.current_str();
240-
return Ok(token);
241-
}
242-
'|' => {
243-
token.kind = TokenKind::Pipe;
244-
token.data = self.current_str();
245-
return Ok(token);
246-
}
247-
'}' => {
248-
token.kind = TokenKind::RCurly;
249-
token.data = self.current_str();
250-
return Ok(token);
191+
c if is_whitespace_assimilated(c) => {
192+
token.kind = TokenKind::Whitespace;
193+
state = State::Whitespace;
251194
}
252195
c => {
253196
return Err(Error::new(
@@ -412,7 +355,7 @@ impl<'a> Cursor<'a> {
412355
self.current_str().to_string(),
413356
));
414357
}
415-
_ if is_name_start(c) => {
358+
_ if lookup::is_namestart(c) => {
416359
return Err(Error::new(
417360
format!("Unexpected character `{c}` as integer suffix"),
418361
self.current_str().to_string(),
@@ -433,7 +376,7 @@ impl<'a> Cursor<'a> {
433376
token.kind = TokenKind::Float;
434377
state = State::ExponentIndicator;
435378
}
436-
_ if is_name_start(c) => {
379+
_ if lookup::is_namestart(c) => {
437380
return Err(Error::new(
438381
format!("Unexpected character `{c}` as integer suffix"),
439382
self.current_str().to_string(),
@@ -460,7 +403,7 @@ impl<'a> Cursor<'a> {
460403
'e' | 'E' => {
461404
state = State::ExponentIndicator;
462405
}
463-
_ if c == '.' || is_name_start(c) => {
406+
_ if c == '.' || lookup::is_namestart(c) => {
464407
return Err(Error::new(
465408
format!("Unexpected character `{c}` as float suffix"),
466409
self.current_str().to_string(),
@@ -500,7 +443,7 @@ impl<'a> Cursor<'a> {
500443
_ if c.is_ascii_digit() => {
501444
state = State::ExponentDigit;
502445
}
503-
_ if c == '.' || is_name_start(c) => {
446+
_ if c == '.' || lookup::is_namestart(c) => {
504447
return Err(Error::new(
505448
format!("Unexpected character `{c}` as float suffix"),
506449
self.current_str().to_string(),
@@ -641,11 +584,6 @@ fn is_whitespace_assimilated(c: char) -> bool {
641584
)
642585
}
643586

644-
/// <https://spec.graphql.org/October2021/#NameStart>
645-
fn is_name_start(c: char) -> bool {
646-
matches!(c, 'a'..='z' | 'A'..='Z' | '_')
647-
}
648-
649587
/// <https://spec.graphql.org/October2021/#NameContinue>
650588
fn is_name_continue(c: char) -> bool {
651589
matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')

crates/apollo-parser/src/lexer/token.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ impl<'a> Token<'a> {
1717
}
1818

1919
/// Get a reference to the token's data.
20-
pub fn data(&self) -> &str {
20+
pub fn data(&self) -> &'a str {
2121
self.data
2222
}
2323

0 commit comments

Comments
 (0)