@@ -79,7 +79,7 @@ impl Lexer {
         Ok(tokens)
     }
 
-    /// Get a string slice cut but the scanner and return the coreesponding token(s)
+    /// Get a string slice cut by the scanner and return the corresponding token(s)
     fn tokenize(slice: &str) -> Result<Vec<Token>, LexerError> {
         let mut starting_chars = slice.trim_matches(' ').chars().take(2);
         return match (starting_chars.next(), starting_chars.next()) {
@@ -88,48 +88,55 @@ impl Lexer {
                 '{' | '}' | '\\' => {
                     // Handle escaped chars
                     let tail = slice.get(1..).unwrap_or("");
-                    return Ok(vec![Token::PlainText(tail)]); // No recursive tokenize here, juste some plain text because the char is escaped
+                    Ok(vec![Token::PlainText(tail)]) // Escaped single char -> plain text
                 }
                 '\'' => {
-                    // Escaped unicode in hex value : \'f0
+                    // Escaped unicode hex value: \'f0
                     let tail = slice.get(1..).unwrap_or("");
                     if tail.len() < 2 {
                         return Err(LexerError::InvalidUnicode(tail.into()));
                     }
-                    let byte = u8::from_str_radix(&tail[1..3], 16)?; // f0
+                    let byte = u8::from_str_radix(&tail[1..3], 16)?;
                     let mut ret = vec![Token::ControlSymbol((ControlWord::Unicode, Property::Value(byte as i32)))];
                     recursive_tokenize!(&tail[3..], ret);
-                    return Ok(ret);
+                    Ok(ret)
                 }
                 '\n' => {
                     // CRLF
                     let mut ret = vec![Token::CRLF];
                     if let Some(tail) = slice.get(2..) {
                         recursive_tokenize!(tail, ret);
                     }
-                    return Ok(ret);
+                    Ok(ret)
                 }
                 'a'..='z' => {
                     // Identify control word
-                    // ex: parse "\b Words in bold" -> (Token::ControlWord(ControlWord::Bold), Token::ControlWordArgument("Words in bold")
                     let (mut ident, tail) = slice.split_first_whitespace();
-                    // if ident end with semicolon, strip it for correct value parsing
-                    ident = if ident.chars().last().unwrap_or(' ') == ';' { &ident[0..ident.len() - 1] } else { ident };
-                    let control_word = ControlWord::from(ident)?;
+                    ident = if ident.ends_with(';') { &ident[..ident.len() - 1] } else { ident };
+
+                    // Try parse control word, fallback for symbols like "-" in \pntext
+                    let control_word = match ControlWord::from(ident) {
+                        Ok(cw) => cw,
+                        Err(_) => {
+                            // Treat as plain text if it cannot be parsed as control word
+                            return Ok(vec![Token::PlainText(slice)]);
+                        }
+                    };
+
                     let mut ret = vec![Token::ControlSymbol(control_word)];
                     recursive_tokenize!(tail, ret);
-
-                    // \u1234 \u1234 is ok, but \u1234  \u1234 is lost a space, \u1234   \u1234 lost two spaces, and so on
-                    // \u1234 1 -> No need to walk in here, it will enter plain text
-                    if control_word.0 == ControlWord::Unicode && tail.len() > 0 && tail.trim() == "" {
+
+                    // Handle special case for \u1234 and trailing spaces
+                    if control_word.0 == ControlWord::Unicode && !tail.is_empty() && tail.chars().all(|ch| ch.is_whitespace()) {
                         ret.push(Token::PlainText(tail));
                     }
-                    return Ok(ret);
+
+                    Ok(ret)
                 }
                 '*' => Ok(vec![Token::IgnorableDestination]),
                 _ => Ok(vec![]),
             },
-            (Some('\n'), Some(_)) => recursive_tokenize!(&slice[1..]), // Ignore the CRLF if it's not escaped
+            (Some('\n'), Some(_)) => recursive_tokenize!(&slice[1..]), // Ignore CRLF if not escaped
             // Handle brackets
             (Some('{'), None) => Ok(vec![Token::OpeningBracket]),
             (Some('}'), None) => Ok(vec![Token::ClosingBracket]),
@@ -139,10 +146,11 @@ impl Lexer {
             // Else, it's plain text
             _ => {
                 let text = slice.trim();
-                if text == "" {
-                    return Ok(vec![]);
+                if text.is_empty() {
+                    Ok(vec![])
+                } else {
+                    Ok(vec![Token::PlainText(slice)])
                 }
-                return Ok(vec![Token::PlainText(slice)]);
             }
         };
     }
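
Note on the new control-word fallback: with this change, an identifier that ControlWord::from cannot parse no longer aborts the lex through the `?` operator; the whole slice degrades to Token::PlainText instead. Below is a minimal standalone sketch of that fallback pattern. The Token enum, the parse_control_word helper, and the set of accepted idents are hypothetical stand-ins for illustration, not the crate's real items.

#[derive(Debug, PartialEq)]
enum Token<'a> {
    ControlSymbol(&'a str),
    PlainText(&'a str),
}

// Stand-in for ControlWord::from: only a few known words parse successfully.
fn parse_control_word(ident: &str) -> Result<&str, ()> {
    match ident {
        "b" | "i" | "u" => Ok(ident),
        _ => Err(()),
    }
}

fn tokenize(slice: &str) -> Vec<Token<'_>> {
    let ident = slice.trim_start_matches('\\');
    match parse_control_word(ident) {
        Ok(cw) => vec![Token::ControlSymbol(cw)],
        // Unknown symbol (e.g. the "-" emitted inside \pntext groups):
        // keep the slice as plain text instead of failing the whole lex,
        // mirroring the match introduced in the hunk above.
        Err(_) => vec![Token::PlainText(slice)],
    }
}

fn main() {
    assert_eq!(tokenize(r"\b"), vec![Token::ControlSymbol("b")]);
    assert_eq!(tokenize("-"), vec![Token::PlainText("-")]);
}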