@@ -253,7 +253,7 @@ class Scanner {
253
253
null => false ,
254
254
LF || CR || BOM => false ,
255
255
TAB || NEL => true ,
256
- _ => _isStandardCharacter (char ),
256
+ _ => _isStandardCharacterAt ( 0 ),
257
257
};
258
258
}
259
259
@@ -267,7 +267,7 @@ class Scanner {
267
267
null => false ,
268
268
LF || CR || BOM || SP => false ,
269
269
NEL => true ,
270
- _ => _isStandardCharacter (char ),
270
+ _ => _isStandardCharacterAt ( 0 ),
271
271
};
272
272
}
273
273
@@ -614,9 +614,9 @@ class Scanner {
614
614
615
615
// Consume the indicator token.
616
616
var start = _scanner.state;
617
- _scanner.readChar ();
618
- _scanner.readChar ();
619
- _scanner.readChar ();
617
+ _scanner.readCodePoint ();
618
+ _scanner.readCodePoint ();
619
+ _scanner.readCodePoint ();
620
620
621
621
_tokens.add (Token (type, _scanner.spanFrom (start)));
622
622
}
@@ -732,7 +732,7 @@ class Scanner {
732
732
/// The span of the new token is the current character.
733
733
void _addCharToken(TokenType type) {
  // Remember where the token begins so its span can be computed afterwards.
  final tokenStart = _scanner.state;

  // Consume one whole code point rather than a single UTF-16 code unit.
  _scanner.readCodePoint();

  final tokenSpan = _scanner.spanFrom(tokenStart);
  _tokens.add(Token(type, tokenSpan));
}
738
738
@@ -836,7 +836,7 @@ class Scanner {
836
836
// libyaml doesn't support unknown directives, but the spec says to ignore
837
837
// them and warn: http://yaml.org/spec/1.2/spec.html#id2781147.
838
838
while (! _isBreakOrEnd) {
839
- _scanner.readChar ();
839
+ _scanner.readCodePoint ();
840
840
}
841
841
842
842
return null ;
@@ -866,7 +866,7 @@ class Scanner {
866
866
// disagrees: http://yaml.org/spec/1.2/spec.html#ns-directive-name.
867
867
var start = _scanner.position;
868
868
while (_isNonSpace) {
869
- _scanner.readChar ();
869
+ _scanner.readCodePoint ();
870
870
}
871
871
872
872
var name = _scanner.substring (start);
@@ -941,13 +941,13 @@ class Scanner {
941
941
var start = _scanner.state;
942
942
943
943
// Eat the indicator character.
944
- _scanner.readChar ();
944
+ _scanner.readCodePoint ();
945
945
946
946
// libyaml only allows word characters in anchor names, but the spec
947
947
// disagrees: http://yaml.org/spec/1.2/spec.html#ns-anchor-char.
948
948
var startPosition = _scanner.position;
949
949
while (_isAnchorChar) {
950
- _scanner.readChar ();
950
+ _scanner.readCodePoint ();
951
951
}
952
952
var name = _scanner.substring (startPosition);
953
953
@@ -1032,7 +1032,7 @@ class Scanner {
1032
1032
buffer.write (_scanner.substring (start));
1033
1033
1034
1034
if (_scanner.peekChar () == EXCLAMATION ) {
1035
- buffer.writeCharCode (_scanner.readChar ());
1035
+ buffer.writeCharCode (_scanner.readCodePoint ());
1036
1036
} else {
1037
1037
// It's either the '!' tag or not really a tag handle. If it's a %TAG
1038
1038
// directive, it's an error. If it's a tag token, it must be part of a
@@ -1083,15 +1083,15 @@ class Scanner {
1083
1083
var start = _scanner.state;
1084
1084
1085
1085
// Eat the indicator '|' or '>'.
1086
- _scanner.readChar ();
1086
+ _scanner.readCodePoint ();
1087
1087
1088
1088
// Check for a chomping indicator.
1089
1089
var chomping = _Chomping .clip;
1090
1090
var increment = 0 ;
1091
1091
var char = _scanner.peekChar ();
1092
1092
if (char == PLUS || char == HYPHEN ) {
1093
1093
chomping = char == PLUS ? _Chomping .keep : _Chomping .strip;
1094
- _scanner.readChar ();
1094
+ _scanner.readCodePoint ();
1095
1095
1096
1096
// Check for an indentation indicator.
1097
1097
if (_isDigit) {
@@ -1101,7 +1101,7 @@ class Scanner {
1101
1101
_scanner.spanFrom (start));
1102
1102
}
1103
1103
1104
- increment = _scanner.readChar () - NUMBER_0 ;
1104
+ increment = _scanner.readCodePoint () - NUMBER_0 ;
1105
1105
}
1106
1106
} else if (_isDigit) {
1107
1107
// Do the same as above, but in the opposite order.
@@ -1110,12 +1110,12 @@ class Scanner {
1110
1110
_scanner.spanFrom (start));
1111
1111
}
1112
1112
1113
- increment = _scanner.readChar () - NUMBER_0 ;
1113
+ increment = _scanner.readCodePoint () - NUMBER_0 ;
1114
1114
1115
1115
char = _scanner.peekChar ();
1116
1116
if (char == PLUS || char == HYPHEN ) {
1117
1117
chomping = char == PLUS ? _Chomping .keep : _Chomping .strip;
1118
- _scanner.readChar ();
1118
+ _scanner.readCodePoint ();
1119
1119
}
1120
1120
}
1121
1121
@@ -1182,7 +1182,7 @@ class Scanner {
1182
1182
1183
1183
var startPosition = _scanner.position;
1184
1184
while (! _isBreakOrEnd) {
1185
- _scanner.readChar ();
1185
+ _scanner.readCodePoint ();
1186
1186
}
1187
1187
buffer.write (_scanner.substring (startPosition));
1188
1188
end = _scanner.state;
@@ -1373,7 +1373,7 @@ class Scanner {
1373
1373
buffer.writeCharCode (value);
1374
1374
}
1375
1375
} else {
1376
- buffer.writeCharCode (_scanner.readChar ());
1376
+ buffer.writeCharCode (_scanner.readCodePoint ());
1377
1377
}
1378
1378
}
1379
1379
@@ -1462,7 +1462,7 @@ class Scanner {
1462
1462
// 1.2's. We use [_isPlainChar] instead of libyaml's character here.
1463
1463
var startPosition = _scanner.position;
1464
1464
while (_isPlainChar) {
1465
- _scanner.readChar ();
1465
+ _scanner.readCodePoint ();
1466
1466
}
1467
1467
buffer.write (_scanner.substring (startPosition));
1468
1468
end = _scanner.state;
@@ -1587,15 +1587,28 @@ class Scanner {
1587
1587
_inBlockContext,
1588
1588
SP || TAB || LF || CR || BOM => false ,
1589
1589
NEL => true ,
1590
- _ => _isStandardCharacter (char )
1590
+ _ => _isStandardCharacterAt (offset )
1591
1591
};
1592
1592
}
1593
1593
1594
/// Whether the character that `_scanner.peekChar(offset)` starts is a
/// standard character.
///
/// Returns `false` when there is no code unit at [offset]. A high surrogate
/// is accepted only when the next code unit is a low surrogate; every other
/// code unit is classified by [_isStandardCharacter].
bool _isStandardCharacterAt(int offset) {
  final leading = _scanner.peekChar(offset);
  if (leading == null) return false;

  if (!isHighSurrogate(leading)) return _isStandardCharacter(leading);

  // A complete surrogate pair encodes a code point from U+10000 to U+10FFFF,
  // all of which are standard characters, so only the pairing needs checking.
  final trailing = _scanner.peekChar(offset + 1);
  return trailing != null && isLowSurrogate(trailing);
}
1607
+
1594
1608
/// Whether the code unit [char] lies in one of the accepted ranges
/// U+0020–U+007E, U+00A0–U+D7FF or U+E000–U+FFFD.
///
/// Surrogate code units (U+D800–U+DFFF) are rejected here; surrogate pairs
/// are handled by [_isStandardCharacterAt].
bool _isStandardCharacter(int char) {
  // Walk the range boundaries in ascending order.
  if (char < 0x0020) return false;
  if (char <= 0x007E) return true;
  if (char < 0x00A0) return false;
  if (char <= 0xD7FF) return true;
  return char >= 0xE000 && char <= 0xFFFD;
}
1599
1612
1600
1613
/// Returns the hexadecimal value of [char].
1601
1614
int _asHex (int char) {
0 commit comments