Skip to content

Commit 53dfec5

Browse files
committed
ICU-23297 Do not allow lookupMatcher to remap UnicodeSet syntax characters
1 parent 282b2f1 commit 53dfec5

File tree

2 files changed

+40
-97
lines changed

2 files changed

+40
-97
lines changed

icu4c/source/common/uniset_props.cpp

Lines changed: 16 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -314,9 +314,8 @@ class UnicodeSet::Lexer {
314314
BRACKETED_ELEMENT,
315315
STRING_LITERAL,
316316
PROPERTY_QUERY,
317-
// ICU extension: A literal-element, escaped-element, or set-operator or (but not
318-
// bracketed-element) which is mapped to a set. This may also be an unescaped '{', in which
319-
// case bracketed-element and string-literal are inaccessible.
317+
// ICU extension: A literal-element or escaped-element (but not
318+
// bracketed-element) which is mapped to a set.
320319
STAND_IN,
321320
END_OF_TEXT,
322321
};
@@ -358,14 +357,6 @@ class UnicodeSet::Lexer {
358357
return false;
359358
}
360359

361-
bool acceptStandInWithSymbol(char16_t op) {
362-
if (lookahead().standIn() != nullptr && lookahead().sourceText_ == std::u16string_view(&op, 1)) {
363-
advance();
364-
return true;
365-
}
366-
return false;
367-
}
368-
369360
const LexicalElement &lookahead() {
370361
if (!ahead_.has_value()) {
371362
const RuleCharacterIterator::Pos before = getPos();
@@ -441,7 +432,6 @@ class UnicodeSet::Lexer {
441432
UBool unusedEscaped;
442433
const UChar32 first =
443434
chars_.next(charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES, unusedEscaped, errorCode);
444-
// '[', named-element, and property-query cannot be disabled by stand-in.
445435
if (first == u'[' || first == u'\\') {
446436
const RuleCharacterIterator::Pos afterFirst = getPos();
447437
// This could be a property-query or named-element.
@@ -467,14 +457,13 @@ class UnicodeSet::Lexer {
467457
// Not a property-query.
468458
chars_.setPos(afterFirst);
469459
}
470-
if (first == u'[') {
460+
switch (first) {
461+
case u'[':
471462
return LexicalElement(
472463
LexicalElement::SET_OPERATOR, UnicodeString(u'['), getPos(), errorCode,
473464
/*standIn=*/nullptr,
474465
std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
475-
}
476-
477-
if (first == u'\\') {
466+
case u'\\': {
478467
// Now try to parse the escape.
479468
chars_.setPos(before);
480469
UChar32 codePoint = chars_.next(charsOptions_, unusedEscaped, errorCode);
@@ -487,17 +476,6 @@ class UnicodeSet::Lexer {
487476
standIn == nullptr ? UnicodeString(codePoint) : UnicodeString(), getPos(), errorCode,
488477
standIn, std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
489478
}
490-
if (symbols_ != nullptr) {
491-
const UnicodeSet *const standIn =
492-
dynamic_cast<const UnicodeSet *>(symbols_->lookupMatcher(first));
493-
if (standIn != nullptr) {
494-
return LexicalElement(
495-
LexicalElement::STAND_IN, {}, getPos(), errorCode, standIn,
496-
std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
497-
}
498-
}
499-
500-
switch (first) {
501479
case u'&':
502480
case u'-':
503481
case u']':
@@ -532,6 +510,15 @@ class UnicodeSet::Lexer {
532510
std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
533511
}
534512
default:
513+
if (symbols_ != nullptr) {
514+
const UnicodeSet *const standIn =
515+
dynamic_cast<const UnicodeSet *>(symbols_->lookupMatcher(first));
516+
if (standIn != nullptr) {
517+
return LexicalElement(
518+
LexicalElement::STAND_IN, {}, getPos(), errorCode, standIn,
519+
std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
520+
}
521+
}
535522
return LexicalElement(
536523
LexicalElement::LITERAL_ELEMENT, UnicodeString(first), getPos(), errorCode, nullptr,
537524
std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
@@ -750,13 +737,9 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer,
750737
// Extension:
751738
// | stand-in
752739
// Where a stand-in may be a character or an escape.
753-
// Strings that would match stand-in effectively get removed from
754-
// all other terminals of the grammar, except [.
755-
// When mapped by the symbol table, whether ^ and - are treated as set operators depends on where
756-
// in the grammar we are, hence `acceptStandInWithSymbol`.
757740
if (lexer.acceptSetOperator(u'[')) {
758741
prettyPrintedPattern.append(u'[');
759-
if (lexer.acceptSetOperator(u'^') || lexer.acceptStandInWithSymbol(u'^')) {
742+
if (lexer.acceptSetOperator(u'^')) {
760743
prettyPrintedPattern.append(u'^');
761744
isComplement = true;
762745
}
@@ -812,7 +795,7 @@ void UnicodeSet::parseUnion(Lexer &lexer,
812795
// | UnescapedHyphenMinus Terms UnescapedHyphenMinus
813796
// Terms ::= ""
814797
// | Terms Term
815-
if (lexer.acceptSetOperator(u'-') || lexer.acceptStandInWithSymbol(u'-')) {
798+
if (lexer.acceptSetOperator(u'-')) {
816799
add(u'-');
817800
// When we otherwise preserve the syntax, we escape an initial UnescapedHyphenMinus, but not a
818801
// final one, for consistency with older ICU behaviour.

icu4c/source/test/intltest/usettest.cpp

Lines changed: 24 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1962,42 +1962,42 @@ void UnicodeSetTest::TestLookupSymbolTable() {
19621962
for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
19631963
expectedLookups, variables] : std::vector<TestCase>{
19641964
{u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]", {u'0'}},
1965-
{u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]", {u'0', u'-', u'1', u']'}},
1966-
{u"[!-0]", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'-', u'0'}},
1965+
{u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]", {u'0', u'1'}},
1966+
{u"[!-0]", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'0'}},
19671967
// A call to lookupMatcher with the first character of the content of a variable happens
19681968
// immediately after a corresponding call to lookup, although we may lookup the variable
19691969
// several times before we call lookupMatcher.
19701970
{u"[0-$one]",
19711971
U_ZERO_ERROR,
19721972
u"[[a-z]-[bc]]",
19731973
u"[ad-z]",
1974-
{u'0', u'-', u"one", u'1', u']'},
1974+
{u'0', u"one", u'1'},
19751975
{{u"zero", u"0"}, {u"one", u"1"}}},
19761976
{u"[$zero-$one]",
19771977
U_ZERO_ERROR,
19781978
u"[[a-z]-[bc]]",
19791979
u"[ad-z]",
1980-
{u"zero", u"zero", u'0', u'-', u"one", u'1', u']'},
1980+
{u"zero", u"zero", u'0', u"one", u'1'},
19811981
{{u"zero", u"0"}, {u"one", u"1"}}},
19821982
// If the variable expands to multiple symbols, only the first one is sequenced right after
19831983
// the variable lookup.
19841984
{u"[$ten]",
19851985
U_ZERO_ERROR,
19861986
u"[[bc][a-z]]",
19871987
u"[a-z]",
1988-
{u"ten", u"ten", u'1', u'0', u']'},
1988+
{u"ten", u"ten", u'1', u'0'},
19891989
{{u"ten", u"10"}}},
19901990
// Substitution of lookupMatcher symbols takes place after unescaping.
1991-
{uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'-', u'0'}},
1991+
{uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'0'}},
19921992
// It does not take place in string literals.
1993-
{uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]", {u'!', u'-', u'/', u'{', u']'}},
1994-
{uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]", {u'2', u'&', u'1', u']'}},
1993+
{uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]", {u'!', u'/'}},
1994+
{uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]", {u'2', u'1'}},
19951995
{uR"([ 21 ])",
19961996
U_ZERO_ERROR,
19971997
u"[[: Co :][bc]]",
19981998
u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]",
1999-
{u'2', u'1', u']'}},
2000-
{u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]", {u'a', u'-', u'b', u'1', u']'}},
1999+
{u'2', u'1'}},
2000+
{u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]", {u'a', u'b', u'1'}},
20012001
}) {
20022002
symbols.setVariables(variables);
20032003
symbols.clearLookupTrace();
@@ -2033,13 +2033,12 @@ void UnicodeSetTest::TestLookupSymbolTable() {
20332033
errln(u"Unexpected sequence of lookups:\nExpected : " + expected + "\nActual : " + actual);
20342034
}
20352035
}
2036-
// Test what happens when we define syntax characters as symbols. It is an extraordinarily bad idea
2037-
// to rely on this behaviour, but since it has been around since ICU 2.8, we probably should not
2038-
// change it unknowingly.
2036+
// Defining syntax characters as symbols has no effect on syntax.
20392037
symbols.add(u'-', UnicodeSet(u"[{hyphenMinus}]", errorCode));
20402038
symbols.add(u'&', UnicodeSet(u"[{ampersand}]", errorCode));
20412039
// This one is never used, except if escaped.
20422040
symbols.add(u'[', UnicodeSet(u"[{leftSquareBracket}]", errorCode));
2041+
symbols.add(u']', UnicodeSet(u"[{rightSquareBracket}]", errorCode));
20432042
symbols.add(u'^', UnicodeSet(u"[{circumflexAccent}]", errorCode));
20442043
symbols.add(u'{', UnicodeSet(u"[{leftCurlyBracket}]", errorCode));
20452044
symbols.add(u'}', UnicodeSet(u"[{rightCurlyBracket}]", errorCode));
@@ -2049,35 +2048,22 @@ void UnicodeSetTest::TestLookupSymbolTable() {
20492048
symbols.add(u'p', UnicodeSet(u"[{latinSmallLetterP}]", errorCode));
20502049
for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
20512050
expectedLookups, variables] : std::vector<TestCase>{
2052-
{u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"},
2051+
{u"-", U_MALFORMED_SET, u"[]", u"[]"},
20532052
{u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"},
2054-
// The hyphen no longer works as set difference.
2055-
{u"[0-1]", U_ZERO_ERROR, u"[[a-z][{hyphenMinus}][bc]]", u"[a-z{hyphenMinus}]"},
2056-
{u"[!-0]", U_ZERO_ERROR, u"[![{hyphenMinus}][a-z]]", u"[!a-z{hyphenMinus}]"},
2057-
// An initial HYPHEN-MINUS is still treated as a literal '-', but a final one is treated
2058-
// as a set.
2053+
{u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]"},
2054+
{u"[!-0]", U_MALFORMED_SET, u"[]", u"[]"},
20592055
{u"[-1]", U_ZERO_ERROR, uR"([\-[bc]])", uR"([\-bc])"},
2060-
{u"[1-]", U_ZERO_ERROR, u"[[bc][{hyphenMinus}]]", u"[bc{hyphenMinus}]"},
2061-
// String literals no longer work.
2062-
{uR"([!-/{0}])", U_ZERO_ERROR,
2063-
u"[![{hyphenMinus}]/[{leftCurlyBracket}][a-z][{rightCurlyBracket}]]",
2064-
u"[!/a-z{hyphenMinus}{leftCurlyBracket}{rightCurlyBracket}]"},
2065-
// The ampersand no longer works as set intersection.
2066-
{uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :][{ampersand}][bc]]",
2067-
u"[bc-󰀀-󿿽􀀀-􏿽{ampersand}]"},
2068-
// Complementing still works.
2056+
{u"[1-]", U_ZERO_ERROR, u"[[bc]-]", uR"([\-bc])"},
2057+
{uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]"},
2058+
{uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]"},
20692059
{uR"([^ \u0000 ])", U_ZERO_ERROR, uR"([\u0001-\U0010FFFF])",
2070-
uR"([\u0001-\U0010FFFF])"},
2071-
// ^ elsewhere becomes a symbol rather than a syntax error.
2072-
{uR"([\u0000 ^ -])", U_ZERO_ERROR, uR"([\u0000[{circumflexAccent}][{hyphenMinus}]])",
2073-
uR"([\u0000{circumflexAccent}{hyphenMinus}])"},
2074-
// Opening brackets still work.
2075-
{uR"([^ [ [^] ] ])", U_ZERO_ERROR, uR"([^[[\u0000-\U0010FFFF]]])", uR"([])"},
2076-
// The only way to access the [ symbol is via escaping.
2060+
uR"([\u0001-\U0010FFFF])"},
2061+
{uR"([\u0000 ^ -])", U_MALFORMED_SET, uR"([\u0000])", uR"([\u0000])"},
2062+
{uR"([^ [ [^] ] ])", U_ZERO_ERROR, uR"([^[[\u0000-\U0010FFFF]]])", u"[]"},
2063+
// An escape can access any mapped character, even if the unescaped
2064+
// character would be syntax.
20772065
{uR"([ \[ ])", U_ZERO_ERROR, uR"([[{leftSquareBracket}]])", uR"([{leftSquareBracket}])"},
2078-
// Anchors are gone.
2079-
{uR"([$])", U_ZERO_ERROR, uR"([[{dollarSign}]])", uR"([{dollarSign}])"},
2080-
// Property queries are unaffected.
2066+
{uR"([$])", U_ZERO_ERROR, uR"([$])", uR"([\uFFFF])"},
20812067
{u"[:Co:]", U_ZERO_ERROR, u"[:Co:]", u"[\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"},
20822068
{uR"(\p{Co})", U_ZERO_ERROR, uR"(\p{Co})", u"[\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"},
20832069
}) {
@@ -2098,32 +2084,6 @@ void UnicodeSetTest::TestLookupSymbolTable() {
20982084
", got " + actual);
20992085
}
21002086
}
2101-
// If ] is defined as a symbol, everything breaks except a lone symbol or property-query, and the
2102-
// constructor returns an error but not an empty set. Don’t do that.
2103-
symbols.add(u']', UnicodeSet(u"[{rightSquareBracket}]", errorCode));
2104-
for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
2105-
expectedLookups, variables] : std::vector<TestCase>{
2106-
{u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
2107-
{u"[:Co:]", U_ZERO_ERROR, u"[:Co:]", u"[\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"},
2108-
{u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
2109-
}) {
2110-
UnicodeString actual;
2111-
UErrorCode errorCode = U_ZERO_ERROR;
2112-
const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
2113-
if (errorCode != expectedErrorCode) {
2114-
errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " +
2115-
u_errorName(errorCode));
2116-
}
2117-
if (set.toPattern(actual) != expectedPattern) {
2118-
errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern +
2119-
", got " + actual);
2120-
}
2121-
if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) {
2122-
errln(u"UnicodeSet(R\"(" + expression +
2123-
u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern +
2124-
", got " + actual);
2125-
}
2126-
}
21272087
#pragma GCC diagnostic pop
21282088
}
21292089

0 commit comments

Comments
 (0)