@@ -319,7 +319,30 @@ impl<'a> Lexer<'a> {
319
319
// {HexDigit[list]}
320
320
// HexDigit HexDigit HexDigit HexDigit
321
321
'u' if escaped => {
322
- self . scan_escaped_unicode ( & old_pos) ?;
322
+ let mut code_point = self . scan_escaped_unicode ( & old_pos) ?;
323
+ if code_point. is_high_surrogate ( ) {
324
+ let new_pos = self . position ;
325
+ let ( Some ( ( _, '\\' ) ) , Some ( ( _, 'u' ) ) ) =
326
+ ( self . next_char ( ) , self . next_char ( ) )
327
+ else {
328
+ return Err ( Spanning :: zero_width (
329
+ & old_pos,
330
+ LexerError :: UnknownEscapeSequence ( code_point. to_string ( ) ) ,
331
+ ) ) ;
332
+ } ;
333
+ let trailing_code_point = self . scan_escaped_unicode ( & new_pos) ?;
334
+ if !trailing_code_point. is_low_surrogate ( ) {
335
+ return Err ( Spanning :: zero_width (
336
+ & old_pos,
337
+ LexerError :: UnknownEscapeSequence ( code_point. to_string ( ) ) ,
338
+ ) ) ;
339
+ }
340
+ code_point =
341
+ UnicodeCodePoint :: from_surrogate_pair ( code_point, trailing_code_point) ;
342
+ }
343
+ _ = code_point
344
+ . try_into_char ( )
345
+ . map_err ( |e| Spanning :: zero_width ( & old_pos, e) ) ?;
323
346
escaped = false ;
324
347
}
325
348
c if escaped => {
@@ -419,7 +442,7 @@ impl<'a> Lexer<'a> {
419
442
fn scan_escaped_unicode (
420
443
& mut self ,
421
444
start_pos : & SourcePosition ,
422
- ) -> Result < ( ) , Spanning < LexerError > > {
445
+ ) -> Result < UnicodeCodePoint , Spanning < LexerError > > {
423
446
// EscapedUnicode ::
424
447
// {HexDigit[list]}
425
448
// HexDigit HexDigit HexDigit HexDigit
@@ -474,6 +497,13 @@ impl<'a> Lexer<'a> {
474
497
) ) ,
475
498
) ) ;
476
499
}
500
+ // `\u{10FFFF}` is the maximum code point, so more than 6 hex digits between the braces can never be valid
501
+ if escape. len ( ) - 2 > 6 {
502
+ return Err ( Spanning :: zero_width (
503
+ start_pos,
504
+ LexerError :: UnknownEscapeSequence ( format ! ( r"\u{}" , & escape[ ..escape. len( ) ] ) ) ,
505
+ ) ) ;
506
+ }
477
507
u32:: from_str_radix ( & escape[ 1 ..escape. len ( ) - 1 ] , 16 )
478
508
} else {
479
509
if len != 4 {
@@ -491,16 +521,10 @@ impl<'a> Lexer<'a> {
491
521
)
492
522
} ) ?;
493
523
494
- // TODO: Support surrogate.
495
-
496
- char:: from_u32 ( code_point)
497
- . ok_or_else ( || {
498
- Spanning :: zero_width (
499
- start_pos,
500
- LexerError :: UnknownEscapeSequence ( format ! ( r"\u{escape}" ) ) ,
501
- )
502
- } )
503
- . map ( drop)
524
+ Ok ( UnicodeCodePoint {
525
+ code : code_point,
526
+ is_variable_width,
527
+ } )
504
528
}
505
529
506
530
fn scan_number ( & mut self ) -> LexerResult < ' a > {
@@ -725,6 +749,8 @@ impl UnicodeCodePoint {
725
749
///
726
750
/// [0]: https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs
727
751
pub ( crate ) fn from_surrogate_pair ( high : Self , low : Self ) -> Self {
752
+ debug_assert ! ( high. is_high_surrogate( ) , "`{high}` is not a high surrogate" ) ;
753
+ debug_assert ! ( low. is_low_surrogate( ) , "`{low}` is not a low surrogate" ) ;
728
754
Self {
729
755
code : 0x10000 + ( ( high. code & 0x03FF ) << 10 ) + ( low. code & 0x03FF ) ,
730
756
is_variable_width : true ,
@@ -967,6 +993,83 @@ mod test {
967
993
) ,
968
994
) ;
969
995
996
+ assert_eq ! (
997
+ tokenize_single( r#""string with unicode escape outside BMP \u{1F600}""# ) ,
998
+ Spanning :: start_end(
999
+ & SourcePosition :: new( 0 , 0 , 0 ) ,
1000
+ & SourcePosition :: new( 50 , 0 , 50 ) ,
1001
+ Token :: Scalar ( ScalarToken :: String ( Quoted (
1002
+ r#""string with unicode escape outside BMP \u{1F600}""# ,
1003
+ ) ) ) ,
1004
+ ) ,
1005
+ ) ;
1006
+
1007
+ assert_eq ! (
1008
+ tokenize_single( r#""string with minimal unicode escape \u{0}""# ) ,
1009
+ Spanning :: start_end(
1010
+ & SourcePosition :: new( 0 , 0 , 0 ) ,
1011
+ & SourcePosition :: new( 42 , 0 , 42 ) ,
1012
+ Token :: Scalar ( ScalarToken :: String ( Quoted (
1013
+ r#""string with minimal unicode escape \u{0}""# ,
1014
+ ) ) ) ,
1015
+ ) ,
1016
+ ) ;
1017
+
1018
+ assert_eq ! (
1019
+ tokenize_single( r#""string with maximal unicode escape \u{10FFFF}""# ) ,
1020
+ Spanning :: start_end(
1021
+ & SourcePosition :: new( 0 , 0 , 0 ) ,
1022
+ & SourcePosition :: new( 47 , 0 , 47 ) ,
1023
+ Token :: Scalar ( ScalarToken :: String ( Quoted (
1024
+ r#""string with maximal unicode escape \u{10FFFF}""# ,
1025
+ ) ) ) ,
1026
+ ) ,
1027
+ ) ;
1028
+
1029
+ assert_eq ! (
1030
+ tokenize_single( r#""string with maximal minimal unicode escape \u{000000}""# ) ,
1031
+ Spanning :: start_end(
1032
+ & SourcePosition :: new( 0 , 0 , 0 ) ,
1033
+ & SourcePosition :: new( 55 , 0 , 55 ) ,
1034
+ Token :: Scalar ( ScalarToken :: String ( Quoted (
1035
+ r#""string with maximal minimal unicode escape \u{000000}""# ,
1036
+ ) ) ) ,
1037
+ ) ,
1038
+ ) ;
1039
+
1040
+ assert_eq ! (
1041
+ tokenize_single( r#""string with unicode surrogate pair escape \uD83D\uDE00""# ) ,
1042
+ Spanning :: start_end(
1043
+ & SourcePosition :: new( 0 , 0 , 0 ) ,
1044
+ & SourcePosition :: new( 56 , 0 , 56 ) ,
1045
+ Token :: Scalar ( ScalarToken :: String ( Quoted (
1046
+ r#""string with unicode surrogate pair escape \uD83D\uDE00""# ,
1047
+ ) ) ) ,
1048
+ ) ,
1049
+ ) ;
1050
+
1051
+ assert_eq ! (
1052
+ tokenize_single( r#""string with minimal surrogate pair escape \uD800\uDC00""# ) ,
1053
+ Spanning :: start_end(
1054
+ & SourcePosition :: new( 0 , 0 , 0 ) ,
1055
+ & SourcePosition :: new( 56 , 0 , 56 ) ,
1056
+ Token :: Scalar ( ScalarToken :: String ( Quoted (
1057
+ r#""string with minimal surrogate pair escape \uD800\uDC00""# ,
1058
+ ) ) ) ,
1059
+ ) ,
1060
+ ) ;
1061
+
1062
+ assert_eq ! (
1063
+ tokenize_single( r#""string with maximal surrogate pair escape \uDBFF\uDFFF""# ) ,
1064
+ Spanning :: start_end(
1065
+ & SourcePosition :: new( 0 , 0 , 0 ) ,
1066
+ & SourcePosition :: new( 56 , 0 , 56 ) ,
1067
+ Token :: Scalar ( ScalarToken :: String ( Quoted (
1068
+ r#""string with maximal surrogate pair escape \uDBFF\uDFFF""# ,
1069
+ ) ) ) ,
1070
+ ) ,
1071
+ ) ;
1072
+
970
1073
assert_eq ! (
971
1074
tokenize_single( "\" contains unescaped \u{0007} control char\" " ) ,
972
1075
Spanning :: start_end(
@@ -1089,18 +1192,98 @@ mod test {
1089
1192
) ;
1090
1193
1091
1194
assert_eq ! (
1092
- tokenize_error( r#""bad \u{DEAD } esc""# ) ,
1195
+ tokenize_error( r#""bad \u{FXXX } esc""# ) ,
1093
1196
Spanning :: zero_width(
1094
1197
& SourcePosition :: new( 6 , 0 , 6 ) ,
1095
- LexerError :: UnknownEscapeSequence ( r"\u{DEAD}" . into( ) ) ,
1198
+ LexerError :: UnknownEscapeSequence ( r"\u{FXXX}" . into( ) ) ,
1199
+ ) ,
1200
+ ) ;
1201
+
1202
+ assert_eq ! (
1203
+ tokenize_error( r#""bad \u{FFFF esc""# ) ,
1204
+ Spanning :: zero_width(
1205
+ & SourcePosition :: new( 6 , 0 , 6 ) ,
1206
+ LexerError :: UnknownEscapeSequence ( r"\u{FFFF" . into( ) ) ,
1207
+ ) ,
1208
+ ) ;
1209
+
1210
+ assert_eq ! (
1211
+ tokenize_error( r#""bad \u{FFF esc""# ) ,
1212
+ Spanning :: zero_width(
1213
+ & SourcePosition :: new( 6 , 0 , 6 ) ,
1214
+ LexerError :: UnknownEscapeSequence ( r"\u{FFF" . into( ) ) ,
1215
+ ) ,
1216
+ ) ;
1217
+
1218
+ assert_eq ! (
1219
+ tokenize_error( r#""bad \u{FFFF""# ) ,
1220
+ Spanning :: zero_width(
1221
+ & SourcePosition :: new( 6 , 0 , 6 ) ,
1222
+ LexerError :: UnknownEscapeSequence ( r"\u{FFFF" . into( ) ) ,
1096
1223
) ,
1097
1224
) ;
1098
1225
1099
1226
assert_eq ! (
1100
- tokenize_error( r#""bad \u{DEA esc""# ) ,
1227
+ tokenize_error( r#""bad \u{} esc""# ) ,
1101
1228
Spanning :: zero_width(
1102
1229
& SourcePosition :: new( 6 , 0 , 6 ) ,
1103
- LexerError :: UnknownEscapeSequence ( r"\u{DEA" . into( ) ) ,
1230
+ LexerError :: UnknownEscapeSequence ( r"\u{}" . into( ) ) ,
1231
+ ) ,
1232
+ ) ;
1233
+
1234
+ assert_eq ! (
1235
+ tokenize_error( r#""too high \u{110000} esc""# ) ,
1236
+ Spanning :: zero_width(
1237
+ & SourcePosition :: new( 11 , 0 , 11 ) ,
1238
+ LexerError :: UnknownEscapeSequence ( r"\u{110000}" . into( ) ) ,
1239
+ ) ,
1240
+ ) ;
1241
+
1242
+ assert_eq ! (
1243
+ tokenize_error( r#""way too high \u{12345678} esc""# ) ,
1244
+ Spanning :: zero_width(
1245
+ & SourcePosition :: new( 15 , 0 , 15 ) ,
1246
+ LexerError :: UnknownEscapeSequence ( r"\u{12345678}" . into( ) ) ,
1247
+ ) ,
1248
+ ) ;
1249
+
1250
+ assert_eq ! (
1251
+ tokenize_error( r#""too long \u{000000000} esc""# ) ,
1252
+ Spanning :: zero_width(
1253
+ & SourcePosition :: new( 11 , 0 , 11 ) ,
1254
+ LexerError :: UnknownEscapeSequence ( r"\u{000000000}" . into( ) ) ,
1255
+ ) ,
1256
+ ) ;
1257
+
1258
+ assert_eq ! (
1259
+ tokenize_error( r#""bad surrogate \uDEAD esc""# ) ,
1260
+ Spanning :: zero_width(
1261
+ & SourcePosition :: new( 16 , 0 , 16 ) ,
1262
+ LexerError :: UnknownEscapeSequence ( r"\uDEAD" . into( ) ) ,
1263
+ ) ,
1264
+ ) ;
1265
+
1266
+ assert_eq ! (
1267
+ tokenize_error( r#""bad surrogate \u{DEAD} esc""# ) ,
1268
+ Spanning :: zero_width(
1269
+ & SourcePosition :: new( 16 , 0 , 16 ) ,
1270
+ LexerError :: UnknownEscapeSequence ( r"\u{DEAD}" . into( ) ) ,
1271
+ ) ,
1272
+ ) ;
1273
+
1274
+ assert_eq ! (
1275
+ tokenize_error( r#""bad high surrogate pair \uDEAD\uDEAD esc""# ) ,
1276
+ Spanning :: zero_width(
1277
+ & SourcePosition :: new( 26 , 0 , 26 ) ,
1278
+ LexerError :: UnknownEscapeSequence ( r"\uDEAD" . into( ) ) ,
1279
+ ) ,
1280
+ ) ;
1281
+
1282
+ assert_eq ! (
1283
+ tokenize_error( r#""bad low surrogate pair \uD800\uD800 esc""# ) ,
1284
+ Spanning :: zero_width(
1285
+ & SourcePosition :: new( 25 , 0 , 25 ) ,
1286
+ LexerError :: UnknownEscapeSequence ( r"\uD800" . into( ) ) ,
1104
1287
) ,
1105
1288
) ;
1106
1289
0 commit comments