@@ -55,6 +55,8 @@ pub enum Token {
     EscapedStringLiteral(String),
     /// Hexadecimal string literal: i.e.: X'deadbeef'
     HexStringLiteral(String),
+    /// Unicode escaped string: U&'d\0061t\+000061' (data)
+    UnicodeEscapedStringLiteral(String),
     /// Comma
     Comma,
     /// Whitespace (space, tab, etc)
@@ -156,6 +158,7 @@ impl fmt::Display for Token {
             Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
             Token::EscapedStringLiteral(ref s) => write!(f, "E'{}'", s),
             Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
+            Token::UnicodeEscapedStringLiteral(ref s) => write!(f, "U&'{}'", s),
             Token::Comma => f.write_str(","),
             Token::Whitespace(ws) => write!(f, "{}", ws),
             Token::DoubleEq => f.write_str("=="),
@@ -415,6 +418,28 @@ impl<'a> Tokenizer<'a> {
                     }
                 }
             }
+            x @ 'u' | x @ 'U' => {
+                chars.next(); // consume the 'u'/'U' so we can inspect what follows
+                let mut look_ahead_chars = chars.clone();
+                if look_ahead_chars.next_if_eq(&'&').is_some() {
+                    match look_ahead_chars.peek() {
+                        Some('\'') => {
+                            // Advance chars past the '&' to catch up with look_ahead_chars
+                            chars.next();
+                            // U&'...' - a <Unicode character string literal>
+                            let s = self.tokenize_single_quoted_string(chars)?;
+                            Ok(Some(Token::UnicodeEscapedStringLiteral(s)))
+                        }
+                        _ => {
+                            // U& not followed by a quote: tokenize as a regular word
+                            let s = self.tokenize_word(x, chars);
+                            Ok(Some(Token::make_word(&s, None)))
+                        }
+                    }
+                } else {
+                    // No '&' after the 'u'/'U': tokenize as a regular word
+                    let s = self.tokenize_word(x, chars);
+                    Ok(Some(Token::make_word(&s, None)))
+                }
+            }
             // identifier or keyword
             ch if self.dialect.is_identifier_start(ch) => {
                 chars.next(); // consume the first char
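The new arm only commits to the Unicode-literal path after seeing both `&` and `'`; otherwise the `u`/`U` falls back to a plain word and the `&` stays in the stream for the next token. Cloning the `Peekable` iterator is what makes this two-character lookahead non-destructive. A minimal standalone sketch of that pattern, with an illustrative helper name and `main` that are not part of the crate:

use std::iter::Peekable;
use std::str::Chars;

// Hypothetical helper isolating the clone-based lookahead used above:
// only the clone is advanced, the real iterator is left untouched.
fn starts_unicode_literal(chars: &Peekable<Chars<'_>>) -> bool {
    let mut look_ahead = chars.clone();
    // `next_if_eq` consumes the '&' from the clone only.
    look_ahead.next_if_eq(&'&').is_some() && look_ahead.peek() == Some(&'\'')
}

fn main() {
    // Input as seen right after the leading 'u'/'U' has been consumed:
    assert!(starts_unicode_literal(&"&'data'".chars().peekable())); // U&'data'
    assert!(!starts_unicode_literal(&"&a".chars().peekable()));     // U&a -> word, '&', word
    assert!(!starts_unicode_literal(&"ser".chars().peekable()));    // user -> plain word
}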
@@ -1417,4 +1442,36 @@ mod tests {
         //println!("------------------------------");
         assert_eq!(expected, actual);
     }
+    #[test]
+    fn tokenize_unicode_escaped_literal() {
+        let sql = r#"U&'aaa'"#;
+        let dialect = GenericDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, sql);
+        let tokens = tokenizer.tokenize().unwrap();
+        let expected = vec![Token::UnicodeEscapedStringLiteral("aaa".to_string())];
+        compare(expected, tokens);
+
+        let sql = r#"U&a"#;
+        let dialect = GenericDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, sql);
+        let tokens = tokenizer.tokenize().unwrap();
+        let expected = vec![
+            Token::make_word("U", None),
+            Token::Ampersand,
+            Token::make_word("a", None),
+        ];
+        compare(expected, tokens);
+        let sql = r#"U & 'aaa'"#;
+        let dialect = GenericDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, sql);
+        let tokens = tokenizer.tokenize().unwrap();
+        let expected = vec![
+            Token::make_word("U", None),
+            Token::Whitespace(Whitespace::Space),
+            Token::Ampersand,
+            Token::Whitespace(Whitespace::Space),
+            Token::SingleQuotedString("aaa".to_string()),
+        ];
+        compare(expected, tokens);
+    }
 }