Skip to content

Commit 8dcf599

Browse files
optimize lexer with memchr and byte-level whitespace parsing (#290)
1 parent 7b15fd9 commit 8dcf599

File tree

4 files changed

+58
-40
lines changed

4 files changed

+58
-40
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ clap = { version = "4.5", features = ["derive"] }
 config = { version ="0.15", features = ["toml"] }
 dashmap = "6.1"
 directories = "6.0"
+memchr = "2.7"
 notify = "8.2"
 percent-encoding = "2.3"
 rustc-hash = "2.1"

crates/djls-templates/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ djls-source = { workspace = true }
 djls-workspace = { workspace = true }

 anyhow = { workspace = true }
+memchr = { workspace = true }
 salsa = { workspace = true }
 serde = { workspace = true }
 thiserror = { workspace = true }

crates/djls-templates/src/lexer.rs

Lines changed: 55 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,6 @@
 use djls_source::Span;
+use memchr::memchr3;
+use memchr::memmem;

 use crate::tokens::TagDelimiter;
 use crate::tokens::Token;
@@ -68,7 +70,7 @@ impl Lexer {

         self.consume_n(TagDelimiter::LENGTH);

-        match self.consume_until(delimiter.closer()) {
+        match self.consume_until_delimiter(delimiter.closer()) {
             Ok(text) => {
                 let len = text.len();
                 let span = Span::saturating_from_parts_usize(content_start, len);
@@ -91,41 +93,38 @@ impl Lexer {
     }

     fn lex_whitespace(&mut self, c: char) -> Token {
+        self.consume();
+
         if c == '\n' || c == '\r' {
-            self.consume(); // \r or \n
             if c == '\r' && self.peek() == '\n' {
-                self.consume(); // \n of \r\n
-            }
-            let span = Span::saturating_from_bounds_usize(self.start, self.current);
-            Token::Newline { span }
-        } else {
-            self.consume(); // Consume the first whitespace
-            while !self.is_at_end() && self.peek().is_whitespace() {
-                if self.peek() == '\n' || self.peek() == '\r' {
-                    break;
-                }
                 self.consume();
             }
             let span = Span::saturating_from_bounds_usize(self.start, self.current);
-            Token::Whitespace { span }
+            return Token::Newline { span };
         }
-    }
-
-    fn lex_text(&mut self) -> Token {
-        let text_start = self.current;

         while !self.is_at_end() {
-            let remaining = self.remaining_source();
-            if (self.peek() == TagDelimiter::CHAR_OPEN
-                && TagDelimiter::from_input(remaining).is_some())
-                || remaining.starts_with('\n')
-                || remaining.starts_with('\r')
-            {
-                break;
+            let remaining = self.remaining_source().as_bytes();
+
+            match remaining.first() {
+                Some(&b'\n' | &b'\r') | None => break,
+                Some(&b' ' | &b'\t') => self.current += 1,
+                Some(_) => {
+                    if !self.peek().is_whitespace() {
+                        break;
+                    }
+                    self.consume();
+                }
             }
-            self.consume();
         }

+        let span = Span::saturating_from_bounds_usize(self.start, self.current);
+        Token::Whitespace { span }
+    }
+
+    fn lex_text(&mut self) -> Token {
+        let text_start = self.current;
+        self.current += self.consume_until_stop_char();
         let text = self.consumed_source_from(text_start);
         let span = Span::saturating_from_bounds_usize(self.start, self.current);
         Token::Text {
@@ -167,29 +166,45 @@ impl Lexer {
         }
     }

-    fn consume_until(&mut self, delimiter: &str) -> Result<String, String> {
+    fn consume_until_delimiter(&mut self, delimiter: &str) -> Result<String, String> {
         let offset = self.current;
-        let mut fallback: Option<usize> = None;

-        while self.current < self.source.len() {
-            let remaining = self.remaining_source();
+        if let Some(pos) = memmem::find(self.remaining_source().as_bytes(), delimiter.as_bytes()) {
+            self.current += pos;
+            return Ok(self.consumed_source_from(offset).to_string());
+        }

-            if remaining.starts_with(delimiter) {
-                return Ok(self.consumed_source_from(offset).to_string());
-            }
+        self.current += self.consume_until_stop_char();
+        Err(self.consumed_source_from(offset).to_string())
+    }
+
+    fn consume_until_stop_char(&self) -> usize {
+        let mut offset = 0;
+        let max = self.source.len() - self.current;

-            if fallback.is_none() {
-                let ch = self.peek();
-                if TagDelimiter::from_input(remaining).is_some() || matches!(ch, '\n' | '\r') {
-                    fallback = Some(self.current);
+        while offset < max {
+            let remaining = &self.remaining_source()[offset..];
+
+            match memchr3(b'{', b'\n', b'\r', remaining.as_bytes()) {
+                None => {
+                    offset = max;
+                    break;
                 }
-            }
+                Some(pos) => {
+                    let is_newline = matches!(remaining.as_bytes()[pos], b'\n' | b'\r');
+                    let is_django_delimiter = TagDelimiter::from_input(&remaining[pos..]).is_some();

-            self.consume();
+                    if is_newline || is_django_delimiter {
+                        offset += pos;
+                        break;
+                    }
+
+                    offset += pos + 1;
+                }
+            }
         }

-        self.current = fallback.unwrap_or(self.current);
-        Err(self.consumed_source_from(offset).to_string())
+        offset
     }
 }


0 commit comments

Comments
 (0)