
Commit 36ae2f5

Refactor lexer to use direct string operations (#219)

1 parent: 472a040

13 files changed: +274 -523 lines

crates/djls-templates/src/ast.rs (1 addition, 1 deletion)

```diff
@@ -153,7 +153,7 @@ impl Span {
 
     #[must_use]
     pub fn from_token(token: &Token<'_>, db: &dyn TemplateDb) -> Self {
-        let start = token.start().unwrap_or(0);
+        let start = token.offset().unwrap_or(0);
         let length = token.length(db);
         Span::new(start, length)
     }
```
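The token API change above swaps start() for offset(): a token now reports a byte offset into the source, and a Span is just that offset plus a byte length. A minimal sketch of the relationship, using simplified stand-in types rather than the crate's actual definitions:

```rust
// Simplified stand-ins for illustration; the crate's Span and Token
// are richer (token content goes through the TemplateDb, for one).
struct Span {
    start: usize,  // byte offset into the source
    length: usize, // length in bytes
}

impl Span {
    fn new(start: usize, length: usize) -> Self {
        Span { start, length }
    }

    /// Recover the spanned text by byte-slicing the original source.
    fn slice<'a>(&self, source: &'a str) -> &'a str {
        &source[self.start..self.start + self.length]
    }
}

fn main() {
    let source = "Hello {{ name }}!";
    // Hypothetical span for the variable's content, "name".
    let span = Span::new(9, 4);
    assert_eq!(span.slice(source), "name");
}
```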

crates/djls-templates/src/lexer.rs (57 additions, 70 deletions)
```diff
@@ -2,10 +2,16 @@ use crate::db::Db as TemplateDb;
 use crate::tokens::Token;
 use crate::tokens::TokenContent;
 
+const BLOCK_TAG_START: &str = "{%";
+const BLOCK_TAG_END: &str = "%}";
+const VARIABLE_TAG_START: &str = "{{";
+const VARIABLE_TAG_END: &str = "}}";
+const COMMENT_TAG_START: &str = "{#";
+const COMMENT_TAG_END: &str = "#}";
+
 pub struct Lexer<'db> {
     db: &'db dyn TemplateDb,
     source: String,
-    chars: Vec<char>,
     start: usize,
     current: usize,
     line: usize,
```
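Dropping the chars: Vec<char> field is the heart of the refactor: the old lexer materialized a second, char-by-char copy of every template just to do indexed lookahead, while the new one slices the source String directly. A standalone before/after sketch of the two matching styles (not the crate's code):

```rust
fn main() {
    let source = "{% if user %}";

    // Old style: allocate a parallel Vec<char>, then compare char slices.
    let chars: Vec<char> = source.chars().collect();
    let old_hit = chars.starts_with(&['{', '%']);

    // New style: slice the source and use str::starts_with directly,
    // with no intermediate allocation.
    let new_hit = source[0..].starts_with("{%");

    assert_eq!(old_hit, new_hit);
}
```

Beyond the allocation, the old layout effectively kept one cursor in two units (a char index into chars, also used to byte-slice source), which the byte-only cursor makes consistent.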
```diff
@@ -17,7 +23,6 @@ impl<'db> Lexer<'db> {
         Lexer {
             db,
             source: String::from(source),
-            chars: source.chars().collect(),
             start: 0,
             current: 0,
             line: 1,
```
```diff
@@ -32,22 +37,14 @@ impl<'db> Lexer<'db> {
 
         let token = match self.peek() {
             '{' => match self.peek_next() {
-                '%' => self.lex_django_construct("%}", |content, line, start| Token::Block {
-                    content,
-                    line,
-                    start,
+                '%' => self.lex_django_construct(BLOCK_TAG_END, |content, offset| {
+                    Token::Block { content, offset }
                 }),
-                '{' => {
-                    self.lex_django_construct("}}", |content, line, start| Token::Variable {
-                        content,
-                        line,
-                        start,
-                    })
-                }
-                '#' => self.lex_django_construct("#}", |content, line, start| Token::Comment {
-                    content,
-                    line,
-                    start,
+                '{' => self.lex_django_construct(VARIABLE_TAG_END, |content, offset| {
+                    Token::Variable { content, offset }
+                }),
+                '#' => self.lex_django_construct(COMMENT_TAG_END, |content, offset| {
+                    Token::Comment { content, offset }
                 }),
                 _ => self.lex_text(),
             },
```
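Dispatch needs exactly two characters of lookahead: a '{' followed by '%', '{', or '#' selects the construct and its closing delimiter. A free-function sketch of that classification, mirroring the match above rather than the crate's API:

```rust
/// Simplified dispatch: classify which Django construct opens at `pos`.
fn construct_at(source: &str, pos: usize) -> Option<&'static str> {
    let rest = &source[pos..];
    if rest.starts_with("{%") {
        Some("block")
    } else if rest.starts_with("{{") {
        Some("variable")
    } else if rest.starts_with("{#") {
        Some("comment")
    } else {
        None
    }
}

fn main() {
    let template = "{% if user %}{{ user.name }}{# note #}";
    assert_eq!(construct_at(template, 0), Some("block"));
    assert_eq!(construct_at(template, 13), Some("variable"));
    assert_eq!(construct_at(template, 28), Some("comment"));
}
```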
```diff
@@ -69,49 +66,43 @@ impl<'db> Lexer<'db> {
             tokens.push(token);
         }
 
-        tokens.push(Token::Eof { line: self.line });
+        tokens.push(Token::Eof);
 
         tokens
     }
 
     fn lex_django_construct(
         &mut self,
         end: &str,
-        token_fn: impl FnOnce(TokenContent<'db>, usize, usize) -> Token<'db>,
+        token_fn: impl FnOnce(TokenContent<'db>, usize) -> Token<'db>,
     ) -> Token<'db> {
-        let line = self.line;
-        let start = self.start + 3;
+        let offset = self.start + 3;
 
         self.consume_n(2);
 
         match self.consume_until(end) {
             Ok(text) => {
                 self.consume_n(2);
                 let content = TokenContent::new(self.db, text);
-                token_fn(content, line, start)
+                token_fn(content, offset)
             }
             Err(err_text) => {
                 self.synchronize();
                 let content = TokenContent::new(self.db, err_text);
-                Token::Error {
-                    content,
-                    line,
-                    start,
-                }
+                Token::Error { content, offset }
             }
         }
     }
 
     fn lex_whitespace(&mut self, c: char) -> Token<'db> {
-        let line = self.line;
-        let start = self.start;
+        let offset = self.start;
 
         if c == '\n' || c == '\r' {
            self.consume(); // \r or \n
            if c == '\r' && self.peek() == '\n' {
                self.consume(); // \n of \r\n
            }
-            Token::Newline { line, start }
+            Token::Newline { offset }
         } else {
             self.consume(); // Consume the first whitespace
             while !self.is_at_end() && self.peek().is_whitespace() {
```
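lex_django_construct records the content offset as self.start + 3, skipping the two-byte opener plus, presumably, the conventional single space after it, then scans for the closing delimiter and degrades to an Error token when it never appears. A self-contained sketch of that scan-and-trim contract, using str::find in place of the lexer's char-by-char consume loop:

```rust
/// Scan from `from` to the closing `delimiter`, returning the trimmed
/// content and the position just past the delimiter. A sketch of the
/// consume_until() contract; Err carries whatever was consumed.
fn consume_until(source: &str, from: usize, delimiter: &str) -> Result<(String, usize), String> {
    if let Some(rel) = source[from..].find(delimiter) {
        let content = source[from..from + rel].trim().to_string();
        Ok((content, from + rel + delimiter.len()))
    } else {
        Err(source[from..].trim().to_string())
    }
}

fn main() {
    let template = "{% if user %} hello";
    // Content starts after the "{%" opener (byte offset 2 here).
    let (content, next) = consume_until(template, 2, "%}").unwrap();
    assert_eq!(content, "if user");
    assert_eq!(&template[next..], " hello");

    // An unterminated construct yields Err with the remaining text.
    assert!(consume_until("{% broken", 2, "%}").is_err());
}
```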
```diff
@@ -121,67 +112,64 @@ impl<'db> Lexer<'db> {
                 self.consume();
             }
             let count = self.current - self.start;
-            Token::Whitespace { count, line, start }
+            Token::Whitespace { count, offset }
         }
     }
 
     fn lex_text(&mut self) -> Token<'db> {
-        let line = self.line;
-        let start = self.start;
+        let text_start = self.current;
 
-        let mut text = String::new();
         while !self.is_at_end() {
-            let c = self.peek();
-
-            if c == '{' {
-                let next = self.peek_next();
-                if next == '%' || next == '{' || next == '#' {
-                    break;
-                }
-            } else if c == '\n' {
+            if self.source[self.current..].starts_with(BLOCK_TAG_START)
+                || self.source[self.current..].starts_with(VARIABLE_TAG_START)
+                || self.source[self.current..].starts_with(COMMENT_TAG_START)
+                || self.source[self.current..].starts_with('\n')
+            {
                 break;
             }
-
-            text.push(c);
             self.consume();
         }
 
-        let content = TokenContent::new(self.db, text);
+        let text = &self.source[text_start..self.current];
+        let content = TokenContent::new(self.db, text.to_string());
         Token::Text {
             content,
-            line,
-            start,
+            offset: self.start,
         }
     }
 
+    #[inline]
     fn peek(&self) -> char {
-        self.peek_at(0)
+        self.source[self.current..].chars().next().unwrap_or('\0')
     }
 
     fn peek_next(&self) -> char {
-        self.peek_at(1)
+        let mut chars = self.source[self.current..].chars();
+        chars.next(); // Skip current
+        chars.next().unwrap_or('\0')
     }
 
     fn peek_previous(&self) -> char {
-        self.peek_at(-1)
-    }
-
-    fn peek_at(&self, offset: isize) -> char {
-        let Some(index) = self.current.checked_add_signed(offset) else {
+        if self.current == 0 {
             return '\0';
-        };
-        self.chars.get(index).copied().unwrap_or('\0')
+        }
+        let mut pos = self.current - 1;
+        while !self.source.is_char_boundary(pos) && pos > 0 {
+            pos -= 1;
+        }
+        self.source[pos..].chars().next().unwrap_or('\0')
     }
 
+    #[inline]
     fn is_at_end(&self) -> bool {
         self.current >= self.source.len()
     }
 
+    #[inline]
     fn consume(&mut self) {
-        if self.is_at_end() {
-            return;
+        if let Some(ch) = self.source[self.current..].chars().next() {
+            self.current += ch.len_utf8();
         }
-        self.current += 1;
     }
 
     fn consume_n(&mut self, count: usize) {
```
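With self.current now a byte index, every cursor move has to respect UTF-8: consume() advances by ch.len_utf8(), and peek_previous() walks backwards until source.is_char_boundary(pos) holds. A standalone demonstration of that discipline on multibyte input:

```rust
fn main() {
    // 'a' (1 byte), 'é' (2 bytes), '😀' (4 bytes), 'b' (1 byte) = 8 bytes.
    let source = "a\u{e9}\u{1f600}b";
    let mut current = 0usize;

    // Advance one char at a time by its UTF-8 width, as consume() does.
    while current < source.len() {
        let ch = source[current..].chars().next().unwrap();
        current += ch.len_utf8();
    }
    assert_eq!(current, source.len());

    // Step back to the previous char boundary, as peek_previous() does.
    let mut pos = current - 1;
    while !source.is_char_boundary(pos) && pos > 0 {
        pos -= 1;
    }
    assert_eq!(source[pos..].chars().next(), Some('b'));
}
```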
```diff
@@ -190,25 +178,24 @@ impl<'db> Lexer<'db> {
         }
     }
 
-    fn consume_until(&mut self, s: &str) -> Result<String, String> {
-        let start = self.current;
-        while !self.is_at_end() {
-            if self.chars[self.current..self.chars.len()]
-                .starts_with(s.chars().collect::<Vec<_>>().as_slice())
-            {
-                return Ok(self.source[start..self.current].trim().to_string());
+    fn consume_until(&mut self, delimiter: &str) -> Result<String, String> {
+        let offset = self.current;
+
+        while self.current < self.source.len() {
+            if self.source[self.current..].starts_with(delimiter) {
+                return Ok(self.source[offset..self.current].trim().to_string());
             }
             self.consume();
         }
-        Err(self.source[start..self.current].trim().to_string())
+
+        Err(self.source[offset..self.current].trim().to_string())
     }
 
     fn synchronize(&mut self) {
-        let sync_chars = &['{', '\n', '\r'];
+        const SYNC_POINTS: &[u8] = b"{\n\r";
 
         while !self.is_at_end() {
-            let current_char = self.peek();
-            if sync_chars.contains(&current_char) {
+            if SYNC_POINTS.contains(&self.source.as_bytes()[self.current]) {
                 return;
             }
             self.consume();
```
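synchronize() can scan raw bytes because its three sync points are ASCII, and UTF-8 guarantees that no byte within a multibyte character's encoding equals an ASCII byte. A quick check of why the byte scan cannot land mid-character:

```rust
fn main() {
    const SYNC_POINTS: &[u8] = b"{\n\r";

    // Multibyte chars before the sync point must not trigger false hits:
    // 'é' encodes as 0xC3 0xA9 and '😀' as 0xF0 0x9F 0x98 0x80, all >= 0x80.
    let source = "caf\u{e9} \u{1f600} {% tag %}";
    let bytes = source.as_bytes();

    let pos = bytes
        .iter()
        .position(|b| SYNC_POINTS.contains(b))
        .unwrap();

    // The first '{' is found at the correct byte offset, and it falls on
    // a char boundary, so resuming the lexer there is safe.
    assert_eq!(bytes[pos], b'{');
    assert!(source.is_char_boundary(pos));
}
```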

crates/djls-templates/src/parser.rs (11 additions, 14 deletions)
```diff
@@ -37,9 +37,9 @@ impl<'db> Parser<'db> {
         let tokens = self.tokens.stream(self.db);
         for token in tokens {
             if matches!(token, Token::Newline { .. }) {
-                let start = token.start();
-                if let Some(start) = start {
-                    line_offsets.add_line(start + 1);
+                let offset = token.offset();
+                if let Some(offset) = offset {
+                    line_offsets.add_line(offset + 1);
                 }
             }
         }
```
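The parser registers offset + 1, the byte just past each newline, as the start of the next line. The crate's LineOffsets type isn't shown in this diff, but a table built on that convention could resolve any byte offset to a line number with a binary search; a hypothetical sketch:

```rust
/// Hypothetical line table: line_starts[i] is the byte offset where
/// line i begins; line 0 always begins at offset 0.
struct LineOffsets {
    line_starts: Vec<usize>,
}

impl LineOffsets {
    fn new() -> Self {
        LineOffsets { line_starts: vec![0] }
    }

    /// Called with newline_offset + 1, per the parser loop above.
    fn add_line(&mut self, offset: usize) {
        self.line_starts.push(offset);
    }

    /// Map a byte offset to a 0-based line number.
    fn line_of(&self, offset: usize) -> usize {
        match self.line_starts.binary_search(&offset) {
            Ok(line) => line,
            Err(insert) => insert - 1,
        }
    }
}

fn main() {
    let mut offsets = LineOffsets::new();
    // For "ab\ncd\n": newlines sit at byte offsets 2 and 5.
    offsets.add_line(3);
    offsets.add_line(6);
    assert_eq!(offsets.line_of(1), 0); // 'b' is on line 0
    assert_eq!(offsets.line_of(4), 1); // 'd' is on line 1
}
```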
```diff
@@ -91,16 +91,13 @@ impl<'db> Parser<'db> {
         let token = self.peek_previous()?;
 
         if let Token::Error {
-            content,
-            line: _,
-            start,
-            ..
+            content, offset, ..
         } = token
         {
             let error_text = content.text(self.db).clone();
 
             Err(ParserError::MalformedConstruct {
-                position: start,
+                position: offset,
                 content: error_text,
             })
         } else {
```
```diff
@@ -152,8 +149,8 @@ impl<'db> Parser<'db> {
             return self.next_node();
         }
 
-        let start = first_token.start().unwrap_or(0);
-        let mut end_position = start + first_token.length(self.db);
+        let offset = first_token.offset().unwrap_or(0);
+        let mut end_position = offset + first_token.length(self.db);
 
         while let Ok(token) = self.peek() {
             match token {
```
```diff
@@ -164,16 +161,16 @@ impl<'db> Parser<'db> {
                 | Token::Eof { .. } => break, // Stop at Django constructs
                 Token::Text { .. } | Token::Whitespace { .. } | Token::Newline { .. } => {
                     // Update end position
-                    let token_start = token.start().unwrap_or(end_position);
+                    let token_offset = token.offset().unwrap_or(end_position);
                     let token_length = token.length(self.db);
-                    end_position = token_start + token_length;
+                    end_position = token_offset + token_length;
                     self.consume()?;
                 }
             }
         }
 
-        let length = end_position - start;
-        let span = Span::new(start, length);
+        let length = end_position - offset;
+        let span = Span::new(offset, length);
 
         Ok(Node::Text { span })
     }
```
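The loop above coalesces consecutive Text, Whitespace, and Newline tokens into one Text node: the span starts at the first token's offset and ends after the last token consumed. A worked micro-example of the arithmetic, with hypothetical (offset, length) pairs:

```rust
fn main() {
    // Hypothetical (offset, length) pairs for Text, Whitespace, Text tokens.
    let tokens = [(10, 5), (15, 1), (16, 4)];

    let offset = tokens[0].0;
    let mut end_position = tokens[0].0 + tokens[0].1;
    for &(tok_offset, tok_length) in &tokens[1..] {
        end_position = tok_offset + tok_length;
    }

    // The coalesced node spans offset..end_position.
    let length = end_position - offset;
    assert_eq!((offset, length), (10, 10));
}
```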
