ICU-23297 Do not allow lookupMatcher to remap UnicodeSet syntax characters

eggrobin · eggrobin · commit 53dfec534d23 · 2025-12-19T18:25:46.000+01:00
diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
@@ -314,9 +314,8 @@ class UnicodeSet::Lexer {
             BRACKETED_ELEMENT,
             STRING_LITERAL,
             PROPERTY_QUERY,
-            // ICU extension: A literal-element, escaped-element, or set-operator or (but not
-            // bracketed-element) which is mapped to a set.  This may also be an unescaped '{', in which
-            // case bracketed-element and string-literal are inaccessible.
+            // ICU extension: A literal-element or escaped-element (but not
+            // bracketed-element) which the symbol table's lookupMatcher maps to a set.
             STAND_IN,
             END_OF_TEXT,
         };
@@ -358,14 +357,6 @@ class UnicodeSet::Lexer {
         return false;
     }
 
-    bool acceptStandInWithSymbol(char16_t op) {
-        if (lookahead().standIn() != nullptr && lookahead().sourceText_ == std::u16string_view(&op, 1)) {
-            advance();
-            return true;
-        }
-        return false;
-    }
-
     const LexicalElement &lookahead() {
         if (!ahead_.has_value()) {
             const RuleCharacterIterator::Pos before = getPos();
@@ -441,7 +432,6 @@ class UnicodeSet::Lexer {
         UBool unusedEscaped;
         const UChar32 first =
             chars_.next(charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES, unusedEscaped, errorCode);
-        // '[', named-element, and property-query cannot be disabled by stand-in.
         if (first == u'[' || first == u'\\') {
             const RuleCharacterIterator::Pos afterFirst = getPos();
             // This could be a property-query or named-element.
@@ -467,14 +457,13 @@ class UnicodeSet::Lexer {
             // Not a property-query.
             chars_.setPos(afterFirst);
         }
-        if (first == u'[') {
+        switch (first) {
+        case u'[':
             return LexicalElement(
                 LexicalElement::SET_OPERATOR, UnicodeString(u'['), getPos(), errorCode,
                 /*standIn=*/nullptr,
                 std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
-        }
-
-        if (first == u'\\') {
+        case u'\\': {
             // Now try to parse the escape.
             chars_.setPos(before);
             UChar32 codePoint = chars_.next(charsOptions_, unusedEscaped, errorCode);
@@ -487,17 +476,6 @@ class UnicodeSet::Lexer {
                 standIn == nullptr ? UnicodeString(codePoint) : UnicodeString(), getPos(), errorCode,
                 standIn, std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
         }
-        if (symbols_ != nullptr) {
-            const UnicodeSet *const standIn =
-                dynamic_cast<const UnicodeSet *>(symbols_->lookupMatcher(first));
-            if (standIn != nullptr) {
-                return LexicalElement(
-                    LexicalElement::STAND_IN, {}, getPos(), errorCode, standIn,
-                    std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
-            }
-        }
-
-        switch (first) {
         case u'&':
         case u'-':
         case u']':
@@ -532,6 +510,15 @@ class UnicodeSet::Lexer {
                 std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
         }
         default:
+            if (symbols_ != nullptr) {
+                const UnicodeSet *const standIn =
+                    dynamic_cast<const UnicodeSet *>(symbols_->lookupMatcher(first));
+                if (standIn != nullptr) {
+                    return LexicalElement(
+                        LexicalElement::STAND_IN, {}, getPos(), errorCode, standIn,
+                        std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
+                }
+            }
             return LexicalElement(
                 LexicalElement::LITERAL_ELEMENT, UnicodeString(first), getPos(), errorCode, nullptr,
                 std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
@@ -750,13 +737,9 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer,
         // Extension:
         //              | stand-in
         // Where a stand-in may be a character or an escape.
-        // Strings that would match stand-in effectively get removed from
-        // all other terminals of the grammar, except [.
-        // When mapped by the symbol table, whether ^ and - are treated as set operators depends on where
-        // in the grammar we are, hence `acceptStandInWithSymbol`.
         if (lexer.acceptSetOperator(u'[')) {
             prettyPrintedPattern.append(u'[');
-            if (lexer.acceptSetOperator(u'^') || lexer.acceptStandInWithSymbol(u'^')) {
+            if (lexer.acceptSetOperator(u'^')) {
                 prettyPrintedPattern.append(u'^');
                 isComplement = true;
             }
@@ -812,7 +795,7 @@ void UnicodeSet::parseUnion(Lexer &lexer,
     //         | UnescapedHyphenMinus Terms UnescapedHyphenMinus
     // Terms ::= ""
     //         | Terms Term
-    if (lexer.acceptSetOperator(u'-') || lexer.acceptStandInWithSymbol(u'-')) {
+    if (lexer.acceptSetOperator(u'-')) {
         add(u'-');
         // When we otherwise preserve the syntax, we escape an initial UnescapedHyphenMinus, but not a
         // final one, for consistency with older ICU behaviour.
diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
@@ -1962,42 +1962,42 @@ void UnicodeSetTest::TestLookupSymbolTable() {
     for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
                       expectedLookups, variables] : std::vector<TestCase>{
             {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]", {u'0'}},
-            {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]", {u'0', u'-', u'1', u']'}},
-            {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'-', u'0'}},
+            {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]", {u'0', u'1'}},
+            {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'0'}},
             // A call to lookupMatcher with the first character of the content of a variable happens
             // immediately after a corresponding call to lookup, although we may lookup the variable
             // several times before we call lookupMatcher.
             {u"[0-$one]",
             U_ZERO_ERROR,
             u"[[a-z]-[bc]]",
             u"[ad-z]",
-            {u'0', u'-', u"one", u'1', u']'},
+            {u'0', u"one", u'1'},
             {{u"zero", u"0"}, {u"one", u"1"}}},
             {u"[$zero-$one]",
             U_ZERO_ERROR,
             u"[[a-z]-[bc]]",
             u"[ad-z]",
-            {u"zero", u"zero", u'0', u'-', u"one", u'1', u']'},
+            {u"zero", u"zero", u'0', u"one", u'1'},
             {{u"zero", u"0"}, {u"one", u"1"}}},
             // If the variable expands to multiple symbols, only the first one is sequenced right after
             // the variable lookup.
             {u"[$ten]",
             U_ZERO_ERROR,
             u"[[bc][a-z]]",
             u"[a-z]",
-            {u"ten", u"ten", u'1', u'0', u']'},
+            {u"ten", u"ten", u'1', u'0'},
             {{u"ten", u"10"}}},
             // Substitution of lookupMatcher symbols takes place after unescaping.
-            {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'-', u'0'}},
+            {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'0'}},
             // It does not take place in string literals.
-            {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]", {u'!', u'-', u'/', u'{', u']'}},
-            {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]", {u'2', u'&', u'1', u']'}},
+            {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]", {u'!', u'/'}},
+            {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]", {u'2', u'1'}},
             {uR"([ 21 ])",
             U_ZERO_ERROR,
             u"[[: Co :][bc]]",
             u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]",
-            {u'2', u'1', u']'}},
-            {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]", {u'a', u'-', u'b', u'1', u']'}},
+            {u'2', u'1'}},
+            {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]", {u'a', u'b', u'1'}},
         }) {
         symbols.setVariables(variables);
         symbols.clearLookupTrace();
@@ -2033,13 +2033,12 @@ void UnicodeSetTest::TestLookupSymbolTable() {
             errln(u"Unexpected sequence of lookups:\nExpected : " + expected + "\nActual   : " + actual);
         }
     }
-    // Test what happens when we define syntax characters as symbols.  It is an extraordinarily bad idea
-    // to rely on this behaviour, but since it has been around since ICU 2.8, we probably should not
-    // change it unknowingly.
+    // Defining syntax characters as symbols has no effect on syntax.
     symbols.add(u'-', UnicodeSet(u"[{hyphenMinus}]", errorCode));
     symbols.add(u'&', UnicodeSet(u"[{ampersand}]", errorCode));
     // This one is never used, except if escaped.
     symbols.add(u'[', UnicodeSet(u"[{leftSquareBracket}]", errorCode));
+    symbols.add(u']', UnicodeSet(u"[{rightSquareBracket}]", errorCode));
     symbols.add(u'^', UnicodeSet(u"[{circumflexAccent}]", errorCode));
     symbols.add(u'{', UnicodeSet(u"[{leftCurlyBracket}]", errorCode));
     symbols.add(u'}', UnicodeSet(u"[{rightCurlyBracket}]", errorCode));
@@ -2049,35 +2048,22 @@ void UnicodeSetTest::TestLookupSymbolTable() {
     symbols.add(u'p', UnicodeSet(u"[{latinSmallLetterP}]", errorCode));
     for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
                       expectedLookups, variables] : std::vector<TestCase>{
-            {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"},
+            {u"-", U_MALFORMED_SET, u"[]", u"[]"},
             {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"},
-            // The hyphen no longer works as set difference.
-            {u"[0-1]", U_ZERO_ERROR, u"[[a-z][{hyphenMinus}][bc]]", u"[a-z{hyphenMinus}]"},
-            {u"[!-0]", U_ZERO_ERROR, u"[![{hyphenMinus}][a-z]]", u"[!a-z{hyphenMinus}]"},
-            // An initial HYPHEN-MINUS is still treated as a literal '-', but a final one is treated
-            // as a set.
+            {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]"},
+            {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]"},
             {u"[-1]", U_ZERO_ERROR, uR"([\-[bc]])", uR"([\-bc])"},
-            {u"[1-]", U_ZERO_ERROR, u"[[bc][{hyphenMinus}]]", u"[bc{hyphenMinus}]"},
-            // String literals no longer work.
-            {uR"([!-/{0}])", U_ZERO_ERROR,
-            u"[![{hyphenMinus}]/[{leftCurlyBracket}][a-z][{rightCurlyBracket}]]",
-            u"[!/a-z{hyphenMinus}{leftCurlyBracket}{rightCurlyBracket}]"},
-            // The ampersand no longer works as set intersection.
-            {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :][{ampersand}][bc]]",
-            u"[bc-󰀀-󿿽􀀀-􏿽{ampersand}]"},
-            // Complementing still works.
+            {u"[1-]", U_ZERO_ERROR, u"[[bc]-]", uR"([\-bc])"},
+            {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]"},
+            {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]"},
             {uR"([^ \u0000 ])", U_ZERO_ERROR, uR"([\u0001-\U0010FFFF])",
-            uR"([\u0001-\U0010FFFF])"},
-            // ^ elsewhere becomes a symbol rather than a syntax error.
-            {uR"([\u0000 ^ -])", U_ZERO_ERROR, uR"([\u0000[{circumflexAccent}][{hyphenMinus}]])",
-            uR"([\u0000{circumflexAccent}{hyphenMinus}])"},
-            // Opening brackets still work.
-            {uR"([^ [ [^] ] ])", U_ZERO_ERROR, uR"([^[[\u0000-\U0010FFFF]]])", uR"([])"},
-            // The only way to access the [ symbol is via escaping.
+             uR"([\u0001-\U0010FFFF])"},
+            {uR"([\u0000 ^ -])", U_MALFORMED_SET, uR"([\u0000])", uR"([\u0000])"},
+            {uR"([^ [ [^] ] ])", U_ZERO_ERROR, uR"([^[[\u0000-\U0010FFFF]]])", u"[]"},
+            // An escape can access any mapped character, even if the unescaped
+            // character would be syntax.
             {uR"([ \[ ])", U_ZERO_ERROR, uR"([[{leftSquareBracket}]])", uR"([{leftSquareBracket}])"},
-            // Anchors are gone.
-            {uR"([$])", U_ZERO_ERROR, uR"([[{dollarSign}]])", uR"([{dollarSign}])"},
-            // Property queries are unaffected.
+            {uR"([$])", U_ZERO_ERROR, uR"([$])", uR"([\uFFFF])"},
             {u"[:Co:]", U_ZERO_ERROR, u"[:Co:]", u"[\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"},
             {uR"(\p{Co})", U_ZERO_ERROR, uR"(\p{Co})", u"[\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"},
         }) {
@@ -2098,32 +2084,6 @@ void UnicodeSetTest::TestLookupSymbolTable() {
                   ", got " + actual);
         }
     }
-    // If ] is defined as a symbol, everything breaks except a lone symbol or property-query, and the
-    // constructor returns an error but not an empty set. Don’t do that.
-    symbols.add(u']', UnicodeSet(u"[{rightSquareBracket}]", errorCode));
-    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
-                      expectedLookups, variables] : std::vector<TestCase>{
-            {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
-            {u"[:Co:]", U_ZERO_ERROR, u"[:Co:]", u"[\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"},
-            {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
-        }) {
-        UnicodeString actual;
-        UErrorCode errorCode = U_ZERO_ERROR;
-        const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
-        if (errorCode != expectedErrorCode) {
-            errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " +
-                  u_errorName(errorCode));
-        }
-        if (set.toPattern(actual) != expectedPattern) {
-            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern +
-                  ", got " + actual);
-        }
-        if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) {
-            errln(u"UnicodeSet(R\"(" + expression +
-                  u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern +
-                  ", got " + actual);
-        }
-    }
 #pragma GCC diagnostic pop
 }