Skip to content

Commit 4b45af0

Browse files
authored
Update lexer::tokenize()
1 parent bb8b5e9 commit 4b45af0

File tree

1 file changed

+27
-19
lines changed

1 file changed

+27
-19
lines changed

src/lexer.rs

Lines changed: 27 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -79,7 +79,7 @@ impl Lexer {
7979
Ok(tokens)
8080
}
8181

82-
/// Get a string slice cut but the scanner and return the coreesponding token(s)
82+
/// Get a string slice cut by the scanner and return the corresponding token(s)
8383
fn tokenize(slice: &str) -> Result<Vec<Token>, LexerError> {
8484
let mut starting_chars = slice.trim_matches(' ').chars().take(2);
8585
return match (starting_chars.next(), starting_chars.next()) {
@@ -88,48 +88,55 @@ impl Lexer {
8888
'{' | '}' | '\\' => {
8989
// Handle escaped chars
9090
let tail = slice.get(1..).unwrap_or("");
91-
return Ok(vec![Token::PlainText(tail)]); // No recursive tokenize here, juste some plain text because the char is escaped
91+
Ok(vec![Token::PlainText(tail)]) // Escaped single char -> plain text
9292
}
9393
'\'' => {
94-
// Escaped unicode in hex value : \'f0
94+
// Escaped unicode hex value: \'f0
9595
let tail = slice.get(1..).unwrap_or("");
9696
if tail.len() < 2 {
9797
return Err(LexerError::InvalidUnicode(tail.into()));
9898
}
99-
let byte = u8::from_str_radix(&tail[1..3], 16)?; // f0
99+
let byte = u8::from_str_radix(&tail[1..3], 16)?;
100100
let mut ret = vec![Token::ControlSymbol((ControlWord::Unicode, Property::Value(byte as i32)))];
101101
recursive_tokenize!(&tail[3..], ret);
102-
return Ok(ret);
102+
Ok(ret)
103103
}
104104
'\n' => {
105105
// CRLF
106106
let mut ret = vec![Token::CRLF];
107107
if let Some(tail) = slice.get(2..) {
108108
recursive_tokenize!(tail, ret);
109109
}
110-
return Ok(ret);
110+
Ok(ret)
111111
}
112112
'a'..='z' => {
113113
// Identify control word
114-
// ex: parse "\b Words in bold" -> (Token::ControlWord(ControlWord::Bold), Token::ControlWordArgument("Words in bold")
115114
let (mut ident, tail) = slice.split_first_whitespace();
116-
// if ident end with semicolon, strip it for correct value parsing
117-
ident = if ident.chars().last().unwrap_or(' ') == ';' { &ident[0..ident.len() - 1] } else { ident };
118-
let control_word = ControlWord::from(ident)?;
115+
ident = if ident.ends_with(';') { &ident[..ident.len() - 1] } else { ident };
116+
117+
// Try parse control word, fallback for symbols like "-" in \pntext
118+
let control_word = match ControlWord::from(ident) {
119+
Ok(cw) => cw,
120+
Err(_) => {
121+
// Treat as plain text if it cannot be parsed as control word
122+
return Ok(vec![Token::PlainText(slice)]);
123+
}
124+
};
125+
119126
let mut ret = vec![Token::ControlSymbol(control_word)];
120127
recursive_tokenize!(tail, ret);
121-
122-
// \u1234 \u1234 is ok, but \u1234 \u1234 is lost a space, \u1234 \u1234 lost two spaces, and so on
123-
// \u1234 1 -> No need to walk in here, it will enter plain text
124-
if control_word.0 == ControlWord::Unicode && tail.len() > 0 && tail.trim() == "" {
128+
129+
// Handle special case for \u1234 and trailing spaces
130+
if control_word.0 == ControlWord::Unicode && !tail.trim().is_empty() && tail.trim().chars().all(|ch| ch.is_whitespace()) {
125131
ret.push(Token::PlainText(tail));
126132
}
127-
return Ok(ret);
133+
134+
Ok(ret)
128135
}
129136
'*' => Ok(vec![Token::IgnorableDestination]),
130137
_ => Ok(vec![]),
131138
},
132-
(Some('\n'), Some(_)) => recursive_tokenize!(&slice[1..]), // Ignore the CRLF if it's not escaped
139+
(Some('\n'), Some(_)) => recursive_tokenize!(&slice[1..]), // Ignore CRLF if not escaped
133140
// Handle brackets
134141
(Some('{'), None) => Ok(vec![Token::OpeningBracket]),
135142
(Some('}'), None) => Ok(vec![Token::ClosingBracket]),
@@ -139,10 +146,11 @@ impl Lexer {
139146
// Else, it's plain text
140147
_ => {
141148
let text = slice.trim();
142-
if text == "" {
143-
return Ok(vec![]);
149+
if text.is_empty() {
150+
Ok(vec![])
151+
} else {
152+
Ok(vec![Token::PlainText(slice)])
144153
}
145-
return Ok(vec![Token::PlainText(slice)]);
146154
}
147155
};
148156
}

0 commit comments

Comments (0)