Skip to content

Commit a0f9e57

Browse files
authored
Handle surrogate pairs during scanning (dart-archive/yaml#159)
Change back to readChar() whenever possible; remove the need to decode the surrogate for further checking
1 parent 7440807 commit a0f9e57

File tree

5 files changed

+54
-29
lines changed

5 files changed

+54
-29
lines changed

pkgs/yaml/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
## 3.1.3-wip
22

33
* Require Dart 3.4
4+
* Fix UTF-16 surrogate pair handling in plain scalars.
45

56
## 3.1.2
67

pkgs/yaml/lib/src/scanner.dart

Lines changed: 37 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ class Scanner {
253253
null => false,
254254
LF || CR || BOM => false,
255255
TAB || NEL => true,
256-
_ => _isStandardCharacter(char),
256+
_ => _isStandardCharacterAt(0),
257257
};
258258
}
259259

@@ -267,7 +267,7 @@ class Scanner {
267267
null => false,
268268
LF || CR || BOM || SP => false,
269269
NEL => true,
270-
_ => _isStandardCharacter(char),
270+
_ => _isStandardCharacterAt(0),
271271
};
272272
}
273273

@@ -614,9 +614,9 @@ class Scanner {
614614

615615
// Consume the indicator token.
616616
var start = _scanner.state;
617-
_scanner.readChar();
618-
_scanner.readChar();
619-
_scanner.readChar();
617+
_scanner.readCodePoint();
618+
_scanner.readCodePoint();
619+
_scanner.readCodePoint();
620620

621621
_tokens.add(Token(type, _scanner.spanFrom(start)));
622622
}
@@ -732,7 +732,7 @@ class Scanner {
732732
/// The span of the new token is the current character.
733733
void _addCharToken(TokenType type) {
734734
var start = _scanner.state;
735-
_scanner.readChar();
735+
_scanner.readCodePoint();
736736
_tokens.add(Token(type, _scanner.spanFrom(start)));
737737
}
738738

@@ -836,7 +836,7 @@ class Scanner {
836836
// libyaml doesn't support unknown directives, but the spec says to ignore
837837
// them and warn: http://yaml.org/spec/1.2/spec.html#id2781147.
838838
while (!_isBreakOrEnd) {
839-
_scanner.readChar();
839+
_scanner.readCodePoint();
840840
}
841841

842842
return null;
@@ -866,7 +866,7 @@ class Scanner {
866866
// disagrees: http://yaml.org/spec/1.2/spec.html#ns-directive-name.
867867
var start = _scanner.position;
868868
while (_isNonSpace) {
869-
_scanner.readChar();
869+
_scanner.readCodePoint();
870870
}
871871

872872
var name = _scanner.substring(start);
@@ -941,13 +941,13 @@ class Scanner {
941941
var start = _scanner.state;
942942

943943
// Eat the indicator character.
944-
_scanner.readChar();
944+
_scanner.readCodePoint();
945945

946946
// libyaml only allows word characters in anchor names, but the spec
947947
// disagrees: http://yaml.org/spec/1.2/spec.html#ns-anchor-char.
948948
var startPosition = _scanner.position;
949949
while (_isAnchorChar) {
950-
_scanner.readChar();
950+
_scanner.readCodePoint();
951951
}
952952
var name = _scanner.substring(startPosition);
953953

@@ -1032,7 +1032,7 @@ class Scanner {
10321032
buffer.write(_scanner.substring(start));
10331033

10341034
if (_scanner.peekChar() == EXCLAMATION) {
1035-
buffer.writeCharCode(_scanner.readChar());
1035+
buffer.writeCharCode(_scanner.readCodePoint());
10361036
} else {
10371037
// It's either the '!' tag or not really a tag handle. If it's a %TAG
10381038
// directive, it's an error. If it's a tag token, it must be part of a
@@ -1083,15 +1083,15 @@ class Scanner {
10831083
var start = _scanner.state;
10841084

10851085
// Eat the indicator '|' or '>'.
1086-
_scanner.readChar();
1086+
_scanner.readCodePoint();
10871087

10881088
// Check for a chomping indicator.
10891089
var chomping = _Chomping.clip;
10901090
var increment = 0;
10911091
var char = _scanner.peekChar();
10921092
if (char == PLUS || char == HYPHEN) {
10931093
chomping = char == PLUS ? _Chomping.keep : _Chomping.strip;
1094-
_scanner.readChar();
1094+
_scanner.readCodePoint();
10951095

10961096
// Check for an indentation indicator.
10971097
if (_isDigit) {
@@ -1101,7 +1101,7 @@ class Scanner {
11011101
_scanner.spanFrom(start));
11021102
}
11031103

1104-
increment = _scanner.readChar() - NUMBER_0;
1104+
increment = _scanner.readCodePoint() - NUMBER_0;
11051105
}
11061106
} else if (_isDigit) {
11071107
// Do the same as above, but in the opposite order.
@@ -1110,12 +1110,12 @@ class Scanner {
11101110
_scanner.spanFrom(start));
11111111
}
11121112

1113-
increment = _scanner.readChar() - NUMBER_0;
1113+
increment = _scanner.readCodePoint() - NUMBER_0;
11141114

11151115
char = _scanner.peekChar();
11161116
if (char == PLUS || char == HYPHEN) {
11171117
chomping = char == PLUS ? _Chomping.keep : _Chomping.strip;
1118-
_scanner.readChar();
1118+
_scanner.readCodePoint();
11191119
}
11201120
}
11211121

@@ -1182,7 +1182,7 @@ class Scanner {
11821182

11831183
var startPosition = _scanner.position;
11841184
while (!_isBreakOrEnd) {
1185-
_scanner.readChar();
1185+
_scanner.readCodePoint();
11861186
}
11871187
buffer.write(_scanner.substring(startPosition));
11881188
end = _scanner.state;
@@ -1373,7 +1373,7 @@ class Scanner {
13731373
buffer.writeCharCode(value);
13741374
}
13751375
} else {
1376-
buffer.writeCharCode(_scanner.readChar());
1376+
buffer.writeCharCode(_scanner.readCodePoint());
13771377
}
13781378
}
13791379

@@ -1462,7 +1462,7 @@ class Scanner {
14621462
// 1.2's. We use [_isPlainChar] instead of libyaml's character here.
14631463
var startPosition = _scanner.position;
14641464
while (_isPlainChar) {
1465-
_scanner.readChar();
1465+
_scanner.readCodePoint();
14661466
}
14671467
buffer.write(_scanner.substring(startPosition));
14681468
end = _scanner.state;
@@ -1587,15 +1587,28 @@ class Scanner {
15871587
_inBlockContext,
15881588
SP || TAB || LF || CR || BOM => false,
15891589
NEL => true,
1590-
_ => _isStandardCharacter(char)
1590+
_ => _isStandardCharacterAt(offset)
15911591
};
15921592
}
15931593

1594+
/// Whether the code unit returned by `_scanner.peekChar(offset)` starts a
/// standard character.
///
/// Returns `false` when `peekChar` yields `null` for [offset]. A high
/// surrogate is accepted only when the code unit at `offset + 1` is a low
/// surrogate; every other code unit is checked with [_isStandardCharacter].
bool _isStandardCharacterAt(int offset) {
1595+
var first = _scanner.peekChar(offset);
1596+
if (first == null) return false;
1597+
1598+
if (isHighSurrogate(first)) {
1599+
var next = _scanner.peekChar(offset + 1);
1600+
// A surrogate pair encodes code points from U+010000 to U+10FFFF, so it
1601+
// must be a standard character.
1602+
return next != null && isLowSurrogate(next);
1603+
}
1604+
1605+
// Not a high surrogate: classify the single code unit directly.
return _isStandardCharacter(first);
1606+
}
1607+
15941608
bool _isStandardCharacter(int char) =>
1595-
(char >= 0x00020 && char <= 0x00007E) ||
1596-
(char >= 0x000A0 && char <= 0x00D7FF) ||
1597-
(char >= 0x0E000 && char <= 0x00FFFD) ||
1598-
(char >= 0x10000 && char <= 0x10FFFF);
1609+
(char >= 0x0020 && char <= 0x007E) ||
1610+
(char >= 0x00A0 && char <= 0xD7FF) ||
1611+
(char >= 0xE000 && char <= 0xFFFD);
15991612

16001613
/// Returns the hexadecimal value of [char].
16011614
int _asHex(int char) {

pkgs/yaml/lib/src/utils.dart

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,9 @@ YamlWarningCallback yamlWarningCallback = (message, [SourceSpan? span]) {
4343
if (span != null) message = span.message(message);
4444
print(message);
4545
};
46+
47+
/// Whether [codeUnit] is a UTF-16 high surrogate.
48+
bool isHighSurrogate(int codeUnit) {
  // High (leading) surrogates occupy the contiguous range 0xD800..0xDBFF,
  // which is exactly the set of values whose bits above the low ten are 0x36.
  return codeUnit >= 0xD800 && codeUnit <= 0xDBFF;
}
49+
50+
/// Whether [codeUnit] is a UTF-16 low surrogate.
51+
bool isLowSurrogate(int codeUnit) {
  // Low (trailing) surrogates occupy the contiguous range 0xDC00..0xDFFF,
  // which is exactly the set of values whose bits above the low ten are 0x37.
  return codeUnit >= 0xDC00 && codeUnit <= 0xDFFF;
}

pkgs/yaml/pubspec.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ environment:
1212
dependencies:
1313
collection: ^1.15.0
1414
source_span: ^1.8.0
15-
string_scanner: ^1.1.0
15+
string_scanner: ^1.2.0
1616

1717
dev_dependencies:
1818
dart_flutter_team_lints: ^3.0.0

pkgs/yaml/test/yaml_test.dart

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -420,20 +420,25 @@ void main() {
420420

421421
test('[Example 2.17]', () {
422422
expectYamlLoads({
423-
'unicode': 'Sosa did fine.\u263A',
423+
'unicode': 'Sosa did fine.\u263A \u{1F680}',
424424
'control': '\b1998\t1999\t2000\n',
425425
'hex esc': '\r\n is \r\n',
426426
'single': '"Howdy!" he cried.',
427427
'quoted': " # Not a 'comment'.",
428-
'tie-fighter': '|\\-*-/|'
428+
'tie-fighter': '|\\-*-/|',
429+
'surrogate-pair': 'I \u{D83D}\u{DE03} ️Dart!',
430+
'key-\u{D83D}\u{DD11}': 'Look\u{D83D}\u{DE03}\u{D83C}\u{DF89}surprise!',
429431
}, """
430-
unicode: "Sosa did fine.\\u263A"
432+
unicode: "Sosa did fine.\\u263A \\U0001F680"
431433
control: "\\b1998\\t1999\\t2000\\n"
432434
hex esc: "\\x0d\\x0a is \\r\\n"
433435
434436
single: '"Howdy!" he cried.'
435437
quoted: ' # Not a ''comment''.'
436-
tie-fighter: '|\\-*-/|'""");
438+
tie-fighter: '|\\-*-/|'
439+
440+
surrogate-pair: I \u{D83D}\u{DE03} ️Dart!
441+
key-\u{D83D}\u{DD11}: Look\u{D83D}\u{DE03}\u{D83C}\u{DF89}surprise!""");
437442
});
438443

439444
test('[Example 2.18]', () {

0 commit comments

Comments
 (0)