Skip to content

Commit 8f82fac

Browse files
committed
ICU-22898 MF2 parser bug fixes
ICU4C: Escape curly braces when serializing and normalizing ICU4C: Escape '|' in patterns ICU4C: When normalizing input, escape optionally-escaped characters in patterns ICU4C/ICU4J: Allow trailing whitespace after a match ICU4C: Fix parser to iterate over code points, not code units Add tests with old reserved syntax as syntax-error tests
1 parent 9adcf8b commit 8f82fac

File tree

9 files changed

+408
-322
lines changed

9 files changed

+408
-322
lines changed

icu4c/source/i18n/messageformat2_parser.cpp

Lines changed: 321 additions & 299 deletions
Large diffs are not rendered by default.

icu4c/source/i18n/messageformat2_parser.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,10 +140,21 @@ namespace message2 {
140140
SelectorKeys parseNonEmptyKeys(UErrorCode&);
141141
void errorPattern(UErrorCode& status);
142142
Pattern parseQuotedPattern(UErrorCode&);
143+
bool isDeclarationStart();
144+
145+
UChar32 peek() const { return source.char32At(index) ; }
146+
UChar32 peek(uint32_t i) const {
147+
return source.char32At(source.moveIndex32(index, i));
148+
}
149+
void next() { index = source.moveIndex32(index, 1); }
150+
151+
bool inBounds() const { return (int32_t) index < source.length(); }
152+
bool inBounds(uint32_t i) const { return source.moveIndex32(index, i) < source.length(); }
153+
bool allConsumed() const { return (int32_t) index == source.length(); }
143154

144155
// The input string
145156
const UnicodeString &source;
146-
// The current position within the input string
157+
// The current position within the input string -- counting in UChar32
147158
uint32_t index;
148159
// Represents the current line (and when an error is indicated),
149160
// character offset within the line of the parse error

icu4c/source/i18n/messageformat2_serializer.cpp

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -42,24 +42,26 @@ void Serializer::emit(const std::u16string_view& token) {
4242
void Serializer::emit(const Literal& l) {
4343
if (l.isQuoted()) {
4444
emit(PIPE);
45-
const UnicodeString& contents = l.unquoted();
46-
for (int32_t i = 0; i < contents.length(); i++) {
47-
// Re-escape any PIPE or BACKSLASH characters
45+
}
46+
const UnicodeString& contents = l.unquoted();
47+
for (int32_t i = 0; ((int32_t) i) < contents.length(); i++) {
48+
// Re-escape any escaped-char characters
4849
switch(contents[i]) {
4950
case BACKSLASH:
50-
case PIPE: {
51-
emit(BACKSLASH);
52-
break;
51+
case PIPE:
52+
case LEFT_CURLY_BRACE:
53+
case RIGHT_CURLY_BRACE: {
54+
emit(BACKSLASH);
55+
break;
5356
}
5457
default: {
55-
break;
58+
break;
5659
}
5760
}
5861
emit(contents[i]);
59-
}
60-
emit(PIPE);
61-
} else {
62-
emit(l.unquoted());
62+
}
63+
if (l.isQuoted()) {
64+
emit(PIPE);
6365
}
6466
}
6567

@@ -194,9 +196,10 @@ void Serializer::emit(const PatternPart& part) {
194196
if (part.isText()) {
195197
// Raw text
196198
const UnicodeString& text = part.asText();
197-
// Re-escape '{'/'}'/'\'
198-
for (int32_t i = 0; i < text.length(); i++) {
199+
// Re-escape '{'/'}'/'\''|'
200+
for (int32_t i = 0; ((int32_t) i) < text.length(); i++) {
199201
switch(text[i]) {
202+
case PIPE:
200203
case BACKSLASH:
201204
case LEFT_CURLY_BRACE:
202205
case RIGHT_CURLY_BRACE: {

icu4c/source/test/intltest/messageformat2test_read_json.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,8 @@ void TestMessageFormat2::jsonTestsFromFiles(IcuTestErrorCode& errorCode) {
303303
// Do tests for reserved statements/expressions
304304
runTestsFromJsonFile(*this, "spec/unsupported-expressions.json", errorCode);
305305
runTestsFromJsonFile(*this, "spec/unsupported-statements.json", errorCode);
306+
// Uncomment when reserved syntax is removed
307+
// runTestsFromJsonFile(*this, "syntax-errors-reserved.json", errorCode);
306308

307309
// Do valid spec tests
308310
runTestsFromJsonFile(*this, "spec/syntax.json", errorCode);

icu4j/main/core/src/main/java/com/ibm/icu/message2/MFParser.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -613,10 +613,8 @@ private MFDataModel.Variant getVariant() throws MFParseException {
613613
}
614614
keys.add(key);
615615
}
616-
// Only want to skip whitespace if we parsed at least one key
617-
if (!keys.isEmpty()) {
618-
skipOptionalWhitespaces();
619-
}
616+
// Trailing whitespace is allowed after the message
617+
skipOptionalWhitespaces();
620618
if (input.atEnd()) {
621619
checkCondition(
622620
keys.isEmpty(), "After selector keys it is mandatory to have a pattern.");

icu4j/main/core/src/test/java/com/ibm/icu/dev/test/message2/CoreTest.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ public class CoreTest extends CoreTestFmwk {
4040
"syntax-errors-diagnostics.json",
4141
"syntax-errors-diagnostics-multiline.json",
4242
"syntax-errors-end-of-input.json",
43+
"syntax-errors-reserved.json",
4344
"tricky-declarations.json",
4445
"valid-tests.json"};
4546

testdata/message2/syntax-errors-diagnostics.json

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -136,11 +136,6 @@
136136
"char": 28,
137137
"comment": "Trailing characters that are not whitespace"
138138
},
139-
{
140-
"src": ".match {$foo :string} {$bar :string} one * {{one}} * * {{other}} ",
141-
"char": 66,
142-
"comment": "Trailing whitespace at end of message should not be accepted either"
143-
},
144139
{
145140
"src": "empty { }",
146141
"char": 8,
@@ -343,6 +338,10 @@
343338
"src": ".match {1} {{_}}",
344339
"char": 12,
345340
"comment": "Disambiguating a wrong .match from an unsupported statement"
341+
},
342+
{
343+
"src": "{{{/p o4.􍅲 = 1}}}",
344+
"char": 9
346345
}
347346
]
348347
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{
2+
"scenario": "Reserved and private annotations",
3+
"description": "Tests for old unsupported expression (reserved/private) syntax -- these are now syntax errors",
4+
"defaultTestProperties": {
5+
"locale": "en-US",
6+
"expErrors": [
7+
{
8+
"type": "syntax-error"
9+
}
10+
]
11+
},
12+
"tests": [
13+
{ "src" : "\\\\{ ? 󗟋 𫓜|@} |}" },
14+
{ "src" : "\\\\{ ? 󗟋 𫓜|@} | \\} @𠟅򑚎𥪙𽧫=|| @򒘒򳷦㥞򉊷=$򸚶񽱆񅗽񤕞 @𰺱:񎫛񢶛򶈎񄮒}" },
15+
{ "src" : "{ $iFN ^ @USh =$u @l}" },
16+
{ "src" : ".local $dS ={ $p4 ^ |.| \\\\ @g:FV = $kd}{{@}}" },
17+
{ "src" : ".D. \\\\ ||{1}{{}} " },
18+
{ "src" : ".cIT ||{|@| % \\} } { *󔜫񥘃󸇀 }{{}}" },
19+
{ "src" : ".򱋿󠆿 . |@| {$󛒁񓝖 & \\{ . @𯼼򋼡= $򵀀񓞈}{{\\\\}}" }
20+
]
21+
}
22+

testdata/message2/valid-tests.json

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,34 @@
123123
"comment": "message -> complex-message -> *(declaration [s]) complex-body -> declaration complex-body -> input-declaration complex-body -> input variable-expression complex-body",
124124
"src": ".input{$x}{{}}",
125125
"exp": ""
126+
},
127+
{
128+
"comment": "Test that escaped characters are re-escaped properly when serializing literals",
129+
"src": "{|\\}|}",
130+
"exp": "}"
131+
},
132+
{
133+
"comment": "Test that escaped characters are re-escaped properly when serializing patterns",
134+
"src": "\\|",
135+
"exp": "|"
136+
},
137+
{
138+
"comment": "Test that parser and serializer treat optionally-escaped characters consistently",
139+
"src" : "{{{1}|}}",
140+
"exp": "1|"
141+
},
142+
{
143+
"comment": "Trailing whitespace after match is valid",
144+
"src": ".match {1 :string} * {{}} ",
145+
"exp": ""
146+
},
147+
{
148+
"src" : "𴢸",
149+
"exp" : "𴢸"
150+
},
151+
{
152+
"src" : "{{􍅲}}",
153+
"exp" : "􍅲"
126154
}
127155
]
128156
}

0 commit comments

Comments
 (0)