From 185a38fd7ebaf13192ff2e0334129558526df0e9 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Mon, 11 Aug 2025 16:24:10 +0200
Subject: [PATCH 01/56] ICU-22851 Test the error paths in UnicodeSet parsing

---
 icu4c/source/test/intltest/usettest.cpp | 61 +++++++++++++++++++++++++
 icu4c/source/test/intltest/usettest.h   |  2 +
 2 files changed, 63 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 016d3f85e63d..da32687987e8 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
     TESTCASE_AUTO(TestRangeIterator);
     TESTCASE_AUTO(TestStringIterator);
     TESTCASE_AUTO(TestElementIterator);
+    TESTCASE_AUTO(TestParseErrors);
     TESTCASE_AUTO_END;
 }
 
@@ -4334,3 +4335,63 @@ void UnicodeSetTest::TestElementIterator() {
     // begin() & end() return USetElementIterator for which explicit APIs are tested via USet
     // in a header-only unit test file.
 }
+
+void UnicodeSetTest::TestParseErrors() {
+    for (const auto expression : std::vector<std::u16string_view>{
+             // Expect U_MALFORMED_SET.  Java error message: "Char expected after operator".
+             u"[a-[b]]",
+             // "Missing '['".
+             u"a-z",
+             // "Trailing '&'".
+             u"[[a]&]",
+             // "'-' not after char or set".
+             u"[[a]&-[z]]",
+             u"[[a]--[z]]",
+             u"[{aa}-{zz}]",
+             // "'&' not after set".
+             u"[a&z]",
+             u"[{aa}&{zz}]",
+             // "'^' not after '['"
+             u"[a^z]",  // TODO(egg): Exclude from literal-element in PDUTS61.
+             // "Missing operand after operator".
+             u"[a-{zz}]",
+             u"[[a]-{zz}]",
+             u"[[a]&{zz}]",
+             // "Invalid multicharacter string".
+             u"[{aa]",
+             // "Unquoted '$'".
+             u"[a-$]",
+             // "Invalid range".
+             u"[a-a]",  // TODO(egg): Exclude in PDUTS61.
+             u"[z-a]",
+             // "Set expected after operator".
+             u"[[a]-z]",
+             u"[[a]&z]",
+             // "Missing ']'".
+             u"[a-z",
+         }) {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, errorCode);
+        if (errorCode != U_MALFORMED_SET) {
+            UnicodeString s;
+            errln(expression + u": Expected U_MALFORMED_SET, got " + u_errorName(errorCode) +
+                  ", set is " + UnicodeSet(set).complement().complement().toPattern(s));
+        }
+    }
+    for (const auto expression : std::vector<std::u16string_view>{
+             // Expect U_ILLEGAL_ARGUMENT_ERROR.  Java error message: "Invalid property pattern".
+             u"[:]",
+             uR"(\p)",
+             u"[:^]",
+             uR"(\P)",
+             uR"(\N)",
+         }) {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, errorCode);
+        if (errorCode != U_ILLEGAL_ARGUMENT_ERROR) {
+            UnicodeString s;
+            errln(expression + u": Expected U_ILLEGAL_ARGUMENT_ERROR, got " + u_errorName(errorCode) +
+                  ", set is " + UnicodeSet(set).complement().complement().toPattern(s));
+        }
+    }
+}
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h
index 2ac22ba72e62..4c5b55a329bb 100644
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@@ -110,6 +110,8 @@ class UnicodeSetTest: public IntlTest {
     void TestStringIterator();
     void TestElementIterator();
 
+    void TestParseErrors();
+
 private:
 
     UBool toPatternAux(UChar32 start, UChar32 end);

From 6a650e7065fc286bb136a4996abd665c7005832e Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Mon, 11 Aug 2025 20:28:53 +0200
Subject: [PATCH 02/56] Call it a day

---
 icu4c/source/common/unicode/uniset.h |  52 ++++++-
 icu4c/source/common/uniset_props.cpp | 216 ++++++++++++++++++++++++---
 2 files changed, 237 insertions(+), 31 deletions(-)

diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h
index 01ac901e3ba1..ddacaaa336ca 100644
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -1696,13 +1696,51 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter {
                                  const SymbolTable* symbols,
                                  UErrorCode& status);
 
-    void applyPattern(RuleCharacterIterator& chars,
-                      const SymbolTable* symbols,
-                      UnicodeString& rebuiltPat,
-                      uint32_t options,
-                      UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
-                      int32_t depth,
-                      UErrorCode& ec);
+    // Recursive descent parsing.  These functions parse the syntactic categories matching their name in
+    // the base grammar of PD UTR #56 (before the highlighted changes are applied).  They add to *this
+    // the elements of the set that the parsed construct represents.
+    // https://www.unicode.org/reports/tr61/tr61-1.html#Set-Operations.
+
+    void parseUnicodeSet(RuleCharacterIterator &chars,
+                         const SymbolTable *symbols,
+                         UnicodeString &rebuiltPat,
+                         uint32_t options,
+                         UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
+                         int32_t depth,
+                         UErrorCode &ec);
+
+    void parseUnion(RuleCharacterIterator &chars,
+                    const SymbolTable *symbols,
+                    UnicodeString &rebuiltPat,
+                    uint32_t options,
+                    UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
+                    int32_t depth,
+                    UErrorCode &ec);
+
+    void parseTerm(RuleCharacterIterator &chars,
+                   const SymbolTable *symbols,
+                   UnicodeString &rebuiltPat,
+                   uint32_t options,
+                   UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
+                   int32_t depth,
+                   UErrorCode &ec);
+
+    void parseRestriction(RuleCharacterIterator &chars,
+                          const SymbolTable *symbols,
+                          UnicodeString &rebuiltPat,
+                          uint32_t options,
+                          UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
+                          int32_t depth,
+                          UErrorCode &ec);
+
+    void parseElements(RuleCharacterIterator &chars,
+                       const SymbolTable *symbols,
+                       UnicodeString &rebuiltPat,
+                       uint32_t options,
+                       UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
+                       int32_t depth,
+                       UErrorCode &ec);
+
 
     void closeOverCaseInsensitive(bool simple);
     void closeOverAddCaseMappings();
diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 532b17f5063f..64e397ee27fe 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -196,7 +196,7 @@ UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
     // _applyPattern calls add() etc., which set pat to empty.
     UnicodeString rebuiltPat;
     RuleCharacterIterator chars(pattern, symbols, pos);
-    applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, 0, status);
+    parseUnicodeSet(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, 0, status);
     if (U_FAILURE(status)) return;
     if (chars.inVariable()) {
         // syntaxError(chars, "Extra chars in variable value");
@@ -242,6 +242,14 @@ class UnicodeSetPointer {
 
 constexpr int32_t MAX_DEPTH = 100;
 
+constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) {
+    int32_t opts = RuleCharacterIterator::PARSE_VARIABLES | RuleCharacterIterator::PARSE_ESCAPES;
+    if ((unicodeSetOptions & USET_IGNORE_SPACE) != 0) {
+        opts |= RuleCharacterIterator::SKIP_WHITESPACE;
+    }
+    return opts;
+}
+
 }  // namespace
 
 /**
@@ -258,13 +266,13 @@ constexpr int32_t MAX_DEPTH = 100;
  * @param options a bit mask of zero or more of the following:
  * IGNORE_SPACE, CASE.
  */
-void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
-                              const SymbolTable* symbols,
-                              UnicodeString& rebuiltPat,
-                              uint32_t options,
-                              UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
-                              int32_t depth,
-                              UErrorCode& ec) {
+void UnicodeSet::parseUnicodeSet(RuleCharacterIterator& chars,
+                                 const SymbolTable* symbols,
+                                 UnicodeString& rebuiltPat,
+                                 uint32_t options,
+                                 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
+                                 int32_t depth,
+                                 UErrorCode& ec) {
     if (U_FAILURE(ec)) return;
     if (depth > MAX_DEPTH) {
         ec = U_ILLEGAL_ARGUMENT_ERROR;
@@ -275,27 +283,187 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
 
     // Recognized special forms for chars, sets: c-c s-s s&s
 
-    int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
-                   RuleCharacterIterator::PARSE_ESCAPES;
-    if ((options & USET_IGNORE_SPACE) != 0) {
-        opts |= RuleCharacterIterator::SKIP_WHITESPACE;
+    clear();
+
+    bool isComplement = false;
+
+    if (resemblesPropertyPattern(chars, charsOptions(options))) {
+        // UnicodeSet ::= property-query | named-singleton
+        applyPropertyPattern(chars, rebuiltPat, ec);
+        if (U_FAILURE(ec)) return;
+    } else {
+        UBool escaped = false;
+        // TODO(egg): In PD UTS 61, add ^ to set-operator, remove [^.
+        // UnicodeSet ::=                [   Union ]
+        //              | Complement ::= [ ^ Union ]
+        char16_t c = chars.next(charsOptions(options), escaped, ec);
+        if (U_FAILURE(ec)) return;
+        if (escaped || c != u'[') {
+          ec = U_MALFORMED_SET;
+          return;
+        }
+        RuleCharacterIterator::Pos afterBracket;
+        chars.getPos(afterBracket);
+        c = chars.next(charsOptions(options), escaped, ec);
+        if (U_FAILURE(ec)) return;
+        if (!escaped && c == u'^') {
+            isComplement = true;
+            return;
+        } else {
+            chars.setPos(afterBracket);
+        }
+        parseUnion(chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
+        if (U_FAILURE(ec)) return;
+        c = chars.next(charsOptions(options), escaped, ec);
+        if (U_FAILURE(ec)) return;
+        if (escaped || c != u']') {
+            ec = U_MALFORMED_SET;
+            return;
+        }
     }
 
-    UnicodeString patLocal, buf;
-    UBool usePat = false;
-    UnicodeSetPointer scratch;
-    RuleCharacterIterator::Pos backup;
+    /**
+     * Handle global flags (isComplement, case insensitivity).  If this
+     * pattern should be compiled case-insensitive, then we need
+     * to close over case BEFORE COMPLEMENTING.  This makes
+     * patterns like /[^abc]/i work.
+     */
+    if ((options & USET_CASE_MASK) != 0) {
+        (this->*caseClosure)(options);
+    }
+    if (isComplement) {
+        complement().removeAllStrings();  // code point complement
+    }
+}
 
-    // mode: 0=before [, 1=between [...], 2=after ]
-    // lastItem: 0=none, 1=char, 2=set
-    int8_t lastItem = 0, mode = 0;
-    UChar32 lastChar = 0;
-    char16_t op = 0;
+void UnicodeSet::parseUnion(RuleCharacterIterator &chars,
+                            const SymbolTable *symbols,
+                            UnicodeString &rebuiltPat,
+                            uint32_t options,
+                            UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth,
+                            UErrorCode &ec) {
+    UBool escaped = false;
+    RuleCharacterIterator::Pos position;
+    chars.getPos(position);
+    // Union ::= Terms
+    //         | UnescapedHyphenMinus Terms
+    //         | Terms UnescapedHyphenMinus
+    //         | UnescapedHyphenMinus Terms UnescapedHyphenMinus
+    // Terms ::= ""
+    //         | Terms Term
+    char16_t c = chars.next(charsOptions(options), escaped, ec);
+    if (U_FAILURE(ec)) return;
+    if (!escaped && c == u'-') {
+        add(u'-');
+    } else {
+        chars.setPos(position);
+    }
+    for (;;) {
+        chars.getPos(position);
+        c = chars.next(charsOptions(options), escaped, ec);
+        if (U_FAILURE(ec)) return;
+        if (!escaped && c == u'-') {
+            // We can be here on the first iteration: [--] is allowed by the
+            // grammar and by the old parser.
+            add(u'-');
+            return;
+        }
+        chars.setPos(position);
+        if (!escaped && c == ']') {
+            return;
+        }
+        if (U_FAILURE(ec)) return;
+        parseTerm(chars, symbols, rebuiltPat, charsOptions(options), caseClosure, depth, ec);
+        if (U_FAILURE(ec)) return;
+    }
+}
 
-    UBool invert = false;
+void UnicodeSet::parseTerm(RuleCharacterIterator &chars,
+                           const SymbolTable *symbols,
+                           UnicodeString &rebuiltPat,
+                           uint32_t options,
+                           UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
+                           int32_t depth,
+                           UErrorCode &ec) {
+    UBool escaped = false;
+    RuleCharacterIterator::Pos termStart;
+    chars.getPos(termStart);
+    // Term ::= Elements
+    //        | Restriction
+    char16_t c = chars.next(charsOptions(options), escaped, ec);
+    if (!escaped && c == '[' || resemblesPropertyPattern(chars, charsOptions(options))) {
+        chars.setPos(termStart);
+        parseRestriction(chars, symbols, rebuiltPat, charsOptions(options), caseClosure, depth, ec);
+        if (U_FAILURE(ec)) return;
+    } else {
+    }
+}
 
-    clear();
+void UnicodeSet::parseRestriction(RuleCharacterIterator &chars,
+                                  const SymbolTable *symbols,
+                                  UnicodeString &rebuiltPat,
+                                  uint32_t options,
+                                  UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
+                                  int32_t depth, UErrorCode &ec) {
+    UBool escaped = false;
+    // Restriction ::= UnicodeSet
+    //               | Intersection ::= Restriction & UnicodeSet
+    //               | Difference   ::= Restriction - UnicodeSet
+    // Start by parsing the first UnicodeSet.
+    parseUnicodeSet(chars, symbols, rebuiltPat, charsOptions(options), caseClosure, depth + 1, ec);
+    if (U_FAILURE(ec)) return;
+    // Now keep looking for an operator that would continue the Restriction.
+    for (;;) {
+        RuleCharacterIterator::Pos beforeOperator;
+        chars.getPos(beforeOperator);
+        char16_t c = chars.next(charsOptions(options), escaped, ec);
+        if (U_FAILURE(ec)) return;
+        if (!escaped && c == u'&') {
+            // Intersection ::= Restriction & UnicodeSet
+            UnicodeSet rightHandSide;
+            rightHandSide.parseUnicodeSet(chars, symbols, rebuiltPat, charsOptions(options), caseClosure,
+                                          depth + 1, ec);
+            if (U_FAILURE(ec)) return;
+            retainAll(rightHandSide);
+        } else if (!escaped && c == u'-') {
+            // Here the grammar requires two tokens of lookahead to figure out whether the - the operator
+            // of a Difference or an UnescapedHyphenMinus in the enclosing Union.
+            RuleCharacterIterator::Pos afterOperator;
+            chars.getPos(afterOperator);
+            char16_t c = chars.next(charsOptions(options), escaped, ec);
+            if (U_FAILURE(ec)) return;
+            if (!escaped && c == u']') {
+                // The operator is actually an UnescapedHyphenMinus; terminate the Restriction before it.
+                chars.setPos(beforeOperator);
+                return;
+            }
+            chars.setPos(afterOperator);
+            // Difference ::= Restriction - UnicodeSet
+            UnicodeSet rightHandSide;
+            rightHandSide.parseUnicodeSet(chars, symbols, rebuiltPat, charsOptions(options), caseClosure,
+                                          depth + 1, ec);
+            if (U_FAILURE(ec)) return;
+            removeAll(rightHandSide);
+        } else {
+            // Not an operator.
+            chars.setPos(beforeOperator);
+            return;
+        }
+    }
+}
 
+void UnicodeSet::parseElements(RuleCharacterIterator &chars,
+                               const SymbolTable *symbols,
+                               UnicodeString &rebuiltPat,
+                               uint32_t options,
+                               UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
+                               int32_t depth,
+                               UErrorCode &ec) {
+    UBool escaped = false;
+    
+}
+
+    #if 0
     while (mode != 2 && !chars.atEnd()) {
         U_ASSERT((lastItem == 0 && op == 0) ||
                  (lastItem == 1 && (op == 0 || op == u'-')) ||
@@ -652,7 +820,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
         // We likely ran out of memory. AHHH!
         ec = U_MEMORY_ALLOCATION_ERROR;
     }
-}
+#endif
 
 //----------------------------------------------------------------
 // Property set implementation

From 85b8b50761ba5ab8000d09e4f07ccf26cfd70f4f Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 13 Aug 2025 20:13:54 +0200
Subject: [PATCH 03/56] Some progress, toPattern is wrong, but what is right?

---
 icu4c/source/common/unicode/uniset.h   |  13 +-
 icu4c/source/common/uniset_closure.cpp |   2 +-
 icu4c/source/common/uniset_props.cpp   | 268 ++++++++++++++++++++-----
 3 files changed, 226 insertions(+), 57 deletions(-)

diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h
index ddacaaa336ca..c4c96154fca2 100644
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -1696,9 +1696,16 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter {
                                  const SymbolTable* symbols,
                                  UErrorCode& status);
 
-    // Recursive descent parsing.  These functions parse the syntactic categories matching their name in
-    // the base grammar of PD UTR #56 (before the highlighted changes are applied).  They add to *this
-    // the elements of the set that the parsed construct represents.
+    void applyPattern(RuleCharacterIterator &chars,
+                      const SymbolTable *symbols,
+                      UnicodeString &rebuiltPat,
+                      uint32_t options,
+                      UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
+                      UErrorCode &ec);
+
+    // Recursive descent parsing with no backtracking.  These functions parse the syntactic categories
+    // matching their name in the base grammar of PD UTS #61 (before the highlighted changes are
+    // applied).  They add to *this the elements of the set that the parsed construct represents.
     // https://www.unicode.org/reports/tr61/tr61-1.html#Set-Operations.
 
     void parseUnicodeSet(RuleCharacterIterator &chars,
diff --git a/icu4c/source/common/uniset_closure.cpp b/icu4c/source/common/uniset_closure.cpp
index ae777c5facdf..251276adaddb 100644
--- a/icu4c/source/common/uniset_closure.cpp
+++ b/icu4c/source/common/uniset_closure.cpp
@@ -101,7 +101,7 @@ UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
     // _applyPattern calls add() etc., which set pat to empty.
     UnicodeString rebuiltPat;
     RuleCharacterIterator chars(pattern, symbols, pos);
-    applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, 0, status);
+    applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status);
     if (U_FAILURE(status)) return *this;
     if (chars.inVariable()) {
         // syntaxError(chars, "Extra chars in variable value");
diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 64e397ee27fe..054a1a932d03 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -196,7 +196,7 @@ UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
     // _applyPattern calls add() etc., which set pat to empty.
     UnicodeString rebuiltPat;
     RuleCharacterIterator chars(pattern, symbols, pos);
-    parseUnicodeSet(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, 0, status);
+    applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, status);
     if (U_FAILURE(status)) return;
     if (chars.inVariable()) {
         // syntaxError(chars, "Extra chars in variable value");
@@ -250,6 +250,74 @@ constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) {
     return opts;
 }
 
+#if 0
+#define U_UNICODESET_TRACE(...)                                                                         \
+    struct UnicodeSetParserTrace {                                                                      \
+        char const *const symbol_;                                                                      \
+        int const depth_;                                                                               \
+        const UnicodeSet *const that_;                                                                  \
+        UnicodeSetParserTrace(char const *symbol, int depth, const UnicodeSet *that)                    \
+            : symbol_(symbol), depth_(depth), that_(that) {}                                            \
+        ~UnicodeSetParserTrace() {                                                                      \
+            UnicodeString ahead;                                                                        \
+            std::string aheadUTF8;                                                                      \
+            printf("%s%s\n", std::string(depth_ * 4, ' ').c_str(), symbol_);                            \
+            printf("%s\n", (UnicodeSet(*that_)                                                           \
+                               .complement()                                                            \
+                               .complement()                                                            \
+                               .toPattern(ahead)                                                        \
+                               .toUTF8String(aheadUTF8)                                                 \
+                               .c_str(),""));                                                               \
+        }                                                                                               \
+    };                                                                                                  \
+    UnicodeSetParserTrace unicodeSetParserTrace(                                                        \
+        std::string_view("" __VA_ARGS__).empty() ? __func__ + 5 : ("" __VA_ARGS__), depth, this);       \
+    do {                                                                                                \
+        char const *symbol = ("" __VA_ARGS__);                                                          \
+        if (std::string_view(symbol).empty()) {                                                         \
+            symbol = __func__ + 5;                                                                      \
+        }                                                                                               \
+        UnicodeString ahead;                                                                            \
+        std::string aheadUTF8;                                                                          \
+        printf("%s%s  > %s\n", std::string(depth * 4, ' ').c_str(), symbol,                             \
+               (chars).lookahead(ahead, 60).toUTF8String(aheadUTF8).c_str());                           \
+        printf("%s\n", (UnicodeSet(*this)                                                                \
+                           .complement()                                                                \
+                           .complement()                                                                \
+                           .toPattern(ahead)                                                            \
+                           .toUTF8String(aheadUTF8)                                                     \
+                           .c_str(),""));                                                                   \
+    } while (false)
+#else
+#define U_UNICODESET_TRACE(...)                                                                         \
+    do {                                                                                                \
+    } while (false)
+#endif
+
+#define U_UNICODESET_RETURN_IF_ERROR(ec)                                                                \
+    do {                                                                                                \
+        if (U_FAILURE(ec)) {                                                                            \
+            if (depth < 5) {                                                                            \
+                printf("--- at %s l. %d\n", __func__, __LINE__);                                        \
+            } else if (depth == 5 && std::string_view(__func__) == "parseUnicodeSet") {                 \
+                printf("--- [...]\n");                                                                  \
+            }                                                                                           \
+            return;                                                                                     \
+        }                                                                                               \
+    } while (false)
+#define U_UNICODESET_RETURN_WITH_PARSE_ERROR(expected, actual, chars, ec)                               \
+    do {                                                                                                \
+        std::string actualUTF8;                                                                         \
+        UnicodeString ahead;                                                                            \
+        std::string aheadUTF8;                                                                          \
+        printf("*** Expected %s, got '%s' %s\n", (expected),                                            \
+               UnicodeString(actual).toUTF8String(actualUTF8).c_str(),                                  \
+               (chars).lookahead(ahead, 60).toUTF8String(aheadUTF8).c_str());                           \
+        printf("--- at %s l. %d\n", __func__, __LINE__);                                                \
+        (ec) = U_MALFORMED_SET;                                                                         \
+        return;                                                                                         \
+    } while (false)
+
 }  // namespace
 
 /**
@@ -266,59 +334,66 @@ constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) {
  * @param options a bit mask of zero or more of the following:
  * IGNORE_SPACE, CASE.
  */
+
+void UnicodeSet::applyPattern(RuleCharacterIterator &chars,
+                              const SymbolTable *symbols,
+                              UnicodeString &rebuiltPat,
+                              uint32_t options,
+                              UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
+                              UErrorCode &ec) {
+    if (U_FAILURE(ec)) return;
+    clear();
+    parseUnicodeSet(chars, symbols, rebuiltPat, options, caseClosure, /*depth=*/0, ec);
+    _generatePattern(rebuiltPat, false);
+}
+
 void UnicodeSet::parseUnicodeSet(RuleCharacterIterator& chars,
                                  const SymbolTable* symbols,
                                  UnicodeString& rebuiltPat,
                                  uint32_t options,
                                  UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
-                                 int32_t depth,
-                                 UErrorCode& ec) {
-    if (U_FAILURE(ec)) return;
+                                 int32_t depth, UErrorCode &ec) {
+    U_UNICODESET_TRACE();
+
     if (depth > MAX_DEPTH) {
-        ec = U_ILLEGAL_ARGUMENT_ERROR;
-        return;
+        U_UNICODESET_RETURN_WITH_PARSE_ERROR(("depth <= " + std::to_string(MAX_DEPTH)).c_str(),
+                                             ("depth = " + std::to_string(depth)).c_str(), chars, ec);
     }
 
-    // Syntax characters: [ ] ^ - & { }
-
-    // Recognized special forms for chars, sets: c-c s-s s&s
-
-    clear();
-
     bool isComplement = false;
-
     if (resemblesPropertyPattern(chars, charsOptions(options))) {
-        // UnicodeSet ::= property-query | named-singleton
-        applyPropertyPattern(chars, rebuiltPat, ec);
-        if (U_FAILURE(ec)) return;
+        // UnicodeSet ::= property-query | named-element
+        U_UNICODESET_TRACE("property-query | named-element");
+        chars.skipIgnored(charsOptions(options));
+        UnicodeSet propertyQuery;
+        propertyQuery.applyPropertyPattern(chars, rebuiltPat, ec);
+        U_UNICODESET_RETURN_IF_ERROR(ec);
+        addAll(propertyQuery);
     } else {
         UBool escaped = false;
         // TODO(egg): In PD UTS 61, add ^ to set-operator, remove [^.
         // UnicodeSet ::=                [   Union ]
         //              | Complement ::= [ ^ Union ]
-        char16_t c = chars.next(charsOptions(options), escaped, ec);
-        if (U_FAILURE(ec)) return;
+        UChar32 c = chars.next(charsOptions(options), escaped, ec);
+        U_UNICODESET_RETURN_IF_ERROR(ec);
         if (escaped || c != u'[') {
-          ec = U_MALFORMED_SET;
-          return;
+            U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)", c, chars, ec);
         }
         RuleCharacterIterator::Pos afterBracket;
         chars.getPos(afterBracket);
         c = chars.next(charsOptions(options), escaped, ec);
-        if (U_FAILURE(ec)) return;
+        U_UNICODESET_RETURN_IF_ERROR(ec);
         if (!escaped && c == u'^') {
             isComplement = true;
-            return;
         } else {
             chars.setPos(afterBracket);
         }
         parseUnion(chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
-        if (U_FAILURE(ec)) return;
+        U_UNICODESET_RETURN_IF_ERROR(ec);
         c = chars.next(charsOptions(options), escaped, ec);
-        if (U_FAILURE(ec)) return;
+        U_UNICODESET_RETURN_IF_ERROR(ec);
         if (escaped || c != u']') {
-            ec = U_MALFORMED_SET;
-            return;
+            U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", c, chars, ec);
         }
     }
 
@@ -342,6 +417,7 @@ void UnicodeSet::parseUnion(RuleCharacterIterator &chars,
                             uint32_t options,
                             UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth,
                             UErrorCode &ec) {
+    U_UNICODESET_TRACE();
     UBool escaped = false;
     RuleCharacterIterator::Pos position;
     chars.getPos(position);
@@ -351,17 +427,17 @@ void UnicodeSet::parseUnion(RuleCharacterIterator &chars,
     //         | UnescapedHyphenMinus Terms UnescapedHyphenMinus
     // Terms ::= ""
     //         | Terms Term
-    char16_t c = chars.next(charsOptions(options), escaped, ec);
-    if (U_FAILURE(ec)) return;
+    UChar32 c = chars.next(charsOptions(options), escaped, ec);
+    U_UNICODESET_RETURN_IF_ERROR(ec);
     if (!escaped && c == u'-') {
         add(u'-');
     } else {
         chars.setPos(position);
     }
-    for (;;) {
+    while (!chars.atEnd()) {
         chars.getPos(position);
         c = chars.next(charsOptions(options), escaped, ec);
-        if (U_FAILURE(ec)) return;
+        U_UNICODESET_RETURN_IF_ERROR(ec);
         if (!escaped && c == u'-') {
             // We can be here on the first iteration: [--] is allowed by the
             // grammar and by the old parser.
@@ -372,9 +448,8 @@ void UnicodeSet::parseUnion(RuleCharacterIterator &chars,
         if (!escaped && c == ']') {
             return;
         }
-        if (U_FAILURE(ec)) return;
-        parseTerm(chars, symbols, rebuiltPat, charsOptions(options), caseClosure, depth, ec);
-        if (U_FAILURE(ec)) return;
+        parseTerm(chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
+        U_UNICODESET_RETURN_IF_ERROR(ec);
     }
 }
 
@@ -385,17 +460,20 @@ void UnicodeSet::parseTerm(RuleCharacterIterator &chars,
                            UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                            int32_t depth,
                            UErrorCode &ec) {
+    U_UNICODESET_TRACE();
     UBool escaped = false;
     RuleCharacterIterator::Pos termStart;
     chars.getPos(termStart);
     // Term ::= Elements
     //        | Restriction
-    char16_t c = chars.next(charsOptions(options), escaped, ec);
-    if (!escaped && c == '[' || resemblesPropertyPattern(chars, charsOptions(options))) {
-        chars.setPos(termStart);
-        parseRestriction(chars, symbols, rebuiltPat, charsOptions(options), caseClosure, depth, ec);
-        if (U_FAILURE(ec)) return;
+    const UChar32 ahead = chars.next(charsOptions(options), escaped, ec);
+    chars.setPos(termStart);
+    if (!escaped && ahead == '[' || resemblesPropertyPattern(chars, charsOptions(options))) {
+        parseRestriction(chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
+        U_UNICODESET_RETURN_IF_ERROR(ec);
     } else {
+        parseElements(chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
+        U_UNICODESET_RETURN_IF_ERROR(ec);
     }
 }
 
@@ -405,34 +483,37 @@ void UnicodeSet::parseRestriction(RuleCharacterIterator &chars,
                                   uint32_t options,
                                   UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                                   int32_t depth, UErrorCode &ec) {
+    U_UNICODESET_TRACE();
     UBool escaped = false;
     // Restriction ::= UnicodeSet
     //               | Intersection ::= Restriction & UnicodeSet
     //               | Difference   ::= Restriction - UnicodeSet
     // Start by parsing the first UnicodeSet.
-    parseUnicodeSet(chars, symbols, rebuiltPat, charsOptions(options), caseClosure, depth + 1, ec);
-    if (U_FAILURE(ec)) return;
+    parseUnicodeSet(chars, symbols, rebuiltPat, options, caseClosure, depth + 1, ec);
+    U_UNICODESET_RETURN_IF_ERROR(ec);
     // Now keep looking for an operator that would continue the Restriction.
+    // The loop terminates because when chars.atEnd(), op == DONE, so we go into the else branch and
+    // return.
     for (;;) {
         RuleCharacterIterator::Pos beforeOperator;
         chars.getPos(beforeOperator);
-        char16_t c = chars.next(charsOptions(options), escaped, ec);
-        if (U_FAILURE(ec)) return;
-        if (!escaped && c == u'&') {
+        const UChar32 op = chars.next(charsOptions(options), escaped, ec);
+        U_UNICODESET_RETURN_IF_ERROR(ec);
+        if (!escaped && op == u'&') {
             // Intersection ::= Restriction & UnicodeSet
             UnicodeSet rightHandSide;
-            rightHandSide.parseUnicodeSet(chars, symbols, rebuiltPat, charsOptions(options), caseClosure,
+            rightHandSide.parseUnicodeSet(chars, symbols, rebuiltPat, options, caseClosure,
                                           depth + 1, ec);
-            if (U_FAILURE(ec)) return;
+            U_UNICODESET_RETURN_IF_ERROR(ec);
             retainAll(rightHandSide);
-        } else if (!escaped && c == u'-') {
+        } else if (!escaped && op == u'-') {
             // Here the grammar requires two tokens of lookahead to figure out whether the - the operator
             // of a Difference or an UnescapedHyphenMinus in the enclosing Union.
             RuleCharacterIterator::Pos afterOperator;
             chars.getPos(afterOperator);
-            char16_t c = chars.next(charsOptions(options), escaped, ec);
-            if (U_FAILURE(ec)) return;
-            if (!escaped && c == u']') {
+            const UChar32 ahead = chars.next(charsOptions(options), escaped, ec);
+            U_UNICODESET_RETURN_IF_ERROR(ec);
+            if (!escaped && ahead == u']') {
                 // The operator is actually an UnescapedHyphenMinus; terminate the Restriction before it.
                 chars.setPos(beforeOperator);
                 return;
@@ -440,12 +521,12 @@ void UnicodeSet::parseRestriction(RuleCharacterIterator &chars,
             chars.setPos(afterOperator);
             // Difference ::= Restriction - UnicodeSet
             UnicodeSet rightHandSide;
-            rightHandSide.parseUnicodeSet(chars, symbols, rebuiltPat, charsOptions(options), caseClosure,
+            rightHandSide.parseUnicodeSet(chars, symbols, rebuiltPat, options, caseClosure,
                                           depth + 1, ec);
-            if (U_FAILURE(ec)) return;
+            U_UNICODESET_RETURN_IF_ERROR(ec);
             removeAll(rightHandSide);
         } else {
-            // Not an operator.
+            // Not an operator, end of the Restriction.
             chars.setPos(beforeOperator);
             return;
         }
@@ -459,8 +540,89 @@ void UnicodeSet::parseElements(RuleCharacterIterator &chars,
                                UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                                int32_t depth,
                                UErrorCode &ec) {
+    U_UNICODESET_TRACE();
+    // Elements     ::= Element
+    //                | Range
+    // Range        ::= RangeElement - RangeElement
+    // RangeElement ::= literal-element
+    //                | escaped-element
+    // Element      ::= RangeElement
+    //                | string-literal
     UBool escaped = false;
-    
+    const UChar32 first = chars.next(charsOptions(options), escaped, ec);
+    U_UNICODESET_RETURN_IF_ERROR(ec);
+    if (!escaped) {
+        switch (first) {
+        case u'-':
+        case u'&':
+        case u'[':
+        case u']':
+        case u'^':
+            U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement | string-literal", first, chars, ec);
+        case u'{': {
+            UnicodeString string;
+            UChar32 c;
+            while (!chars.atEnd()) {
+                c = chars.next(charsOptions(options), escaped, ec);
+                U_UNICODESET_RETURN_IF_ERROR(ec);
+                if (!escaped && c == u'}') {
+                    add(string);
+                    return;
+                }
+                string.append(c);
+            }
+            U_UNICODESET_RETURN_WITH_PARSE_ERROR("}", RuleCharacterIterator::DONE, chars, ec);
+        }
+        case u'}':
+            // Disallowed by UTS #61, but historically accepted by ICU.  This is an extension.
+        default:
+            break;
+        }
+    }
+    RuleCharacterIterator::Pos beforeOperator;
+    chars.getPos(beforeOperator);
+    const UChar32 op = chars.next(charsOptions(options), escaped, ec);
+    U_UNICODESET_RETURN_IF_ERROR(ec);
+    if (escaped || op != u'-') {
+        // No operator,
+        // Elements ::= Element
+        chars.setPos(beforeOperator);
+        add(first);
+        return;
+    }
+    // Here the grammar requires two tokens of lookahead to figure out whether the - the operator
+    // of a Range or an UnescapedHyphenMinus in the enclosing Union.
+    const UChar32 ahead = chars.next(charsOptions(options), escaped, ec);
+    U_UNICODESET_RETURN_IF_ERROR(ec);
+    if (!escaped && ahead == u']') {
+        // The operator is actually an UnescapedHyphenMinus; terminate the Elements before it.
+        chars.setPos(beforeOperator);
+        add(first);
+        return;
+    }
+    const UChar32 last = ahead;
+    U_UNICODESET_RETURN_IF_ERROR(ec);
+    if (!escaped) {
+        switch (last) {
+        case u'-':
+        case u'&':
+        case u'[':
+        case u']':
+        case u'^':
+        case u'{':
+            U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, chars, ec);
+        case u'}':
+            // Disallowed by UTS #61, but historically accepted by ICU.  This is an extension.
+        default:
+            break;
+        }
+    }
+    if (last <= first) {
+        U_UNICODESET_RETURN_WITH_PARSE_ERROR("first < last in Range",
+                                 UnicodeString(last) + u"-" + UnicodeString(first), chars, ec);
+    }
+    add(first, last);
+    return;
 }
 
     #if 0

From 07ab1c1c4791655c46e02449fc3e44935822b081 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 13 Aug 2025 21:03:33 +0200
Subject: [PATCH 04/56] ICU-22851 Test the exact behaviour of
 UnicodeSet::toPattern

---
 icu4c/source/test/intltest/usettest.cpp | 44 +++++++++++++++++++++++++
 icu4c/source/test/intltest/usettest.h   |  1 +
 2 files changed, 45 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index da32687987e8..fa9c00897865 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
     TESTCASE_AUTO(TestRangeIterator);
     TESTCASE_AUTO(TestStringIterator);
     TESTCASE_AUTO(TestElementIterator);
+    TESTCASE_AUTO(TestToPatternOutput);
     TESTCASE_AUTO(TestParseErrors);
     TESTCASE_AUTO_END;
 }
@@ -4336,6 +4337,49 @@ void UnicodeSetTest::TestElementIterator() {
     // in a header-only unit test file.
 }
 
+void UnicodeSetTest::TestToPatternOutput() {
+    for (const auto [expression, expected] :
+         std::vector<std::pair<std::u16string_view, std::u16string_view>>{
+             // For a UnicodeSet which is not a property-query nor a named-element and without any
+             // Restriction among its Terms (that is, whose Union consists solely of a sequence of
+             // Elements and UnescapedHyphenMinus), toPattern merges and sorts ranges, and introduces a
+             // complement to minimize the result.
+             {u"[c-za-b]", u"[a-z]"},
+             {u"[  c - z  a - b  ]", u"[a-z]"},
+             {uR"([ ^ \u0000-b d-\U0010FFFF ])", u"[c]"},
+             {uR"([ \u0000-b d-\U0010FFFF ])", u"[^c]"},
+             {u"[ - - ]", uR"([\-])"},
+             {u"[ - _ - ]", uR"([\-_])"},
+             {u"[ - + - ]", uR"([+\-])"},
+             // A property-query or named-element is kept as-is:
+             {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"},
+             {uR"(\p{P})", uR"(\p{P})"},
+             {uR"(\p{gc=P})", uR"(\p{gc=P})"},
+             {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"},
+             {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"},
+             {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"},
+             // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are
+             // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped.
+             // This is applied recursively, so innermost ranges-only UnicodeSets get normalized.
+             {u"[ c-z a-b [c-f g-z] ]", u"[c-za-b[c-z]]"},
+             {u"[- + c-z a-b [c-f g-z] -]", uR"([\-+c-za-b[c-z]-])"},
+             {uR"([ c-z a-b \p{ General_Category = Punctuation } ])",
+              uR"([c-za-b\p{ General_Category = Punctuation }])"},
+             {u"[^[c]]", uR"([^[c]])"},
+             {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
+         }) {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, errorCode);
+        UnicodeString actual;
+        if (U_FAILURE(errorCode)) {
+            errln(u"Failed to parse " + expression + u": " + u_errorName(errorCode));
+        } else if (set.toPattern(actual) != expected) {
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expected + ", got " +
+                  actual);
+        }
+    }
+}
+
 void UnicodeSetTest::TestParseErrors() {
     for (const auto expression : std::vector<std::u16string_view>{
              // Java error message: "Char expected after operator".
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h
index 4c5b55a329bb..692aa8b9e84d 100644
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@@ -110,6 +110,7 @@ class UnicodeSetTest: public IntlTest {
     void TestStringIterator();
     void TestElementIterator();
 
+    void TestToPatternOutput();
     void TestParseErrors();
 
 private:

From b489fa0622a03a021877268197d95427bbe160a4 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 13 Aug 2025 23:10:20 +0200
Subject: [PATCH 05/56] call it a day

---
 icu4c/source/common/unicode/uniset.h   | 18 +++++---
 icu4c/source/common/uniset_closure.cpp |  2 +-
 icu4c/source/common/uniset_props.cpp   | 58 +++++++++++++++++---------
 3 files changed, 51 insertions(+), 27 deletions(-)

diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h
index c4c96154fca2..96a9f4f9f749 100644
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -1696,7 +1696,8 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter {
                                  const SymbolTable* symbols,
                                  UErrorCode& status);
 
-    void applyPattern(RuleCharacterIterator &chars,
+    void applyPattern(const UnicodeString &pattern,
+                      RuleCharacterIterator &chars,
                       const SymbolTable *symbols,
                       UnicodeString &rebuiltPat,
                       uint32_t options,
@@ -1708,7 +1709,8 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter {
     // applied).  They add to *this the elements of the set that the parsed construct represents.
     // https://www.unicode.org/reports/tr61/tr61-1.html#Set-Operations.
 
-    void parseUnicodeSet(RuleCharacterIterator &chars,
+    void parseUnicodeSet(const UnicodeString &pattern,
+                         RuleCharacterIterator &chars,
                          const SymbolTable *symbols,
                          UnicodeString &rebuiltPat,
                          uint32_t options,
@@ -1716,7 +1718,8 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter {
                          int32_t depth,
                          UErrorCode &ec);
 
-    void parseUnion(RuleCharacterIterator &chars,
+    void parseUnion(const UnicodeString &pattern,
+                    RuleCharacterIterator &chars,
                     const SymbolTable *symbols,
                     UnicodeString &rebuiltPat,
                     uint32_t options,
@@ -1724,7 +1727,8 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter {
                     int32_t depth,
                     UErrorCode &ec);
 
-    void parseTerm(RuleCharacterIterator &chars,
+    void parseTerm(const UnicodeString &pattern,
+                   RuleCharacterIterator &chars,
                    const SymbolTable *symbols,
                    UnicodeString &rebuiltPat,
                    uint32_t options,
@@ -1732,7 +1736,8 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter {
                    int32_t depth,
                    UErrorCode &ec);
 
-    void parseRestriction(RuleCharacterIterator &chars,
+    void parseRestriction(const UnicodeString &pattern,
+                          RuleCharacterIterator &chars,
                           const SymbolTable *symbols,
                           UnicodeString &rebuiltPat,
                           uint32_t options,
@@ -1740,7 +1745,8 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter {
                           int32_t depth,
                           UErrorCode &ec);
 
-    void parseElements(RuleCharacterIterator &chars,
+    void parseElements(const UnicodeString &pattern,
+                       RuleCharacterIterator &chars,
                        const SymbolTable *symbols,
                        UnicodeString &rebuiltPat,
                        uint32_t options,
diff --git a/icu4c/source/common/uniset_closure.cpp b/icu4c/source/common/uniset_closure.cpp
index 251276adaddb..05e9b0a37e04 100644
--- a/icu4c/source/common/uniset_closure.cpp
+++ b/icu4c/source/common/uniset_closure.cpp
@@ -101,7 +101,7 @@ UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
     // _applyPattern calls add() etc., which set pat to empty.
     UnicodeString rebuiltPat;
     RuleCharacterIterator chars(pattern, symbols, pos);
-    applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status);
+    applyPattern(pattern, chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status);
     if (U_FAILURE(status)) return *this;
     if (chars.inVariable()) {
         // syntaxError(chars, "Extra chars in variable value");
diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 054a1a932d03..fa3c9070831b 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -196,7 +196,7 @@ UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
     // _applyPattern calls add() etc., which set pat to empty.
     UnicodeString rebuiltPat;
     RuleCharacterIterator chars(pattern, symbols, pos);
-    applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, status);
+    applyPattern(pattern, chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, status);
     if (U_FAILURE(status)) return;
     if (chars.inVariable()) {
         // syntaxError(chars, "Extra chars in variable value");
@@ -296,10 +296,12 @@ constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) {
 
 #define U_UNICODESET_RETURN_IF_ERROR(ec)                                                                \
     do {                                                                                                \
+    constexpr std::string_view functionName = __func__;\
+    static_assert (functionName.substr(0, 5) == "parse");\
         if (U_FAILURE(ec)) {                                                                            \
             if (depth < 5) {                                                                            \
-                printf("--- at %s l. %d\n", __func__, __LINE__);                                        \
-            } else if (depth == 5 && std::string_view(__func__) == "parseUnicodeSet") {                 \
+                printf("--- in %s l. %d\n", __func__+5, __LINE__);                                        \
+            } else if (depth == 5 && std::string_view(__func__+5) == "UnicodeSet") {                 \
                 printf("--- [...]\n");                                                                  \
             }                                                                                           \
             return;                                                                                     \
@@ -307,13 +309,22 @@ constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) {
     } while (false)
 #define U_UNICODESET_RETURN_WITH_PARSE_ERROR(expected, actual, chars, ec)                               \
     do {                                                                                                \
+    constexpr std::string_view functionName = __func__;                                             \
+        static_assert(functionName.substr(0, 5) == "parse");\
         std::string actualUTF8;                                                                         \
         UnicodeString ahead;                                                                            \
         std::string aheadUTF8;                                                                          \
-        printf("*** Expected %s, got '%s' %s\n", (expected),                                            \
+        std::string behindUTF8;                                                                          \
+        (chars).lookahead(ahead); \
+        printf("*** Expected %s, got '%s' %s☜%s\n", (expected),                                            \
                UnicodeString(actual).toUTF8String(actualUTF8).c_str(),                                  \
-               (chars).lookahead(ahead, 60).toUTF8String(aheadUTF8).c_str());                           \
-        printf("--- at %s l. %d\n", __func__, __LINE__);                                                \
+               pattern.tempSubString(0, pattern.length() - ahead.length())                              \
+                   .toUTF8String(behindUTF8)                                                            \
+                   .c_str(),                                                                            \
+               pattern.tempSubString(pattern.length() - ahead.length(), 60)                              \
+                   .toUTF8String(aheadUTF8)                                                             \
+                   .c_str());                           \
+        printf("--- in %s l. %d\n", __func__ + 5, __LINE__);                                                \
         (ec) = U_MALFORMED_SET;                                                                         \
         return;                                                                                         \
     } while (false)
@@ -323,6 +334,7 @@ constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) {
 /**
  * Parse the pattern from the given RuleCharacterIterator.  The
  * iterator is advanced over the parsed pattern.
+ * @param pattern The pattern, only used by debug traces.
  * @param chars iterator over the pattern characters.  Upon return
  * it will be advanced to the first character after the parsed
  * pattern, or the end of the iteration if all characters are
@@ -335,7 +347,8 @@ constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) {
  * IGNORE_SPACE, CASE.
  */
 
-void UnicodeSet::applyPattern(RuleCharacterIterator &chars,
+void UnicodeSet::applyPattern(const UnicodeString &pattern,
+                              RuleCharacterIterator &chars,
                               const SymbolTable *symbols,
                               UnicodeString &rebuiltPat,
                               uint32_t options,
@@ -343,11 +356,12 @@ void UnicodeSet::applyPattern(RuleCharacterIterator &chars,
                               UErrorCode &ec) {
     if (U_FAILURE(ec)) return;
     clear();
-    parseUnicodeSet(chars, symbols, rebuiltPat, options, caseClosure, /*depth=*/0, ec);
+    parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, /*depth=*/0, ec);
     _generatePattern(rebuiltPat, false);
 }
 
-void UnicodeSet::parseUnicodeSet(RuleCharacterIterator& chars,
+void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern,
+                                 RuleCharacterIterator &chars,
                                  const SymbolTable* symbols,
                                  UnicodeString& rebuiltPat,
                                  uint32_t options,
@@ -388,7 +402,7 @@ void UnicodeSet::parseUnicodeSet(RuleCharacterIterator& chars,
         } else {
             chars.setPos(afterBracket);
         }
-        parseUnion(chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
+        parseUnion(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
         c = chars.next(charsOptions(options), escaped, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
@@ -411,7 +425,8 @@ void UnicodeSet::parseUnicodeSet(RuleCharacterIterator& chars,
     }
 }
 
-void UnicodeSet::parseUnion(RuleCharacterIterator &chars,
+void UnicodeSet::parseUnion(const UnicodeString &pattern,
+                            RuleCharacterIterator &chars,
                             const SymbolTable *symbols,
                             UnicodeString &rebuiltPat,
                             uint32_t options,
@@ -448,12 +463,13 @@ void UnicodeSet::parseUnion(RuleCharacterIterator &chars,
         if (!escaped && c == ']') {
             return;
         }
-        parseTerm(chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
+        parseTerm(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
     }
 }
 
-void UnicodeSet::parseTerm(RuleCharacterIterator &chars,
+void UnicodeSet::parseTerm(const UnicodeString &pattern,
+                           RuleCharacterIterator &chars,
                            const SymbolTable *symbols,
                            UnicodeString &rebuiltPat,
                            uint32_t options,
@@ -469,15 +485,16 @@ void UnicodeSet::parseTerm(RuleCharacterIterator &chars,
     const UChar32 ahead = chars.next(charsOptions(options), escaped, ec);
     chars.setPos(termStart);
     if (!escaped && ahead == '[' || resemblesPropertyPattern(chars, charsOptions(options))) {
-        parseRestriction(chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
+        parseRestriction(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
     } else {
-        parseElements(chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
+        parseElements(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
     }
 }
 
-void UnicodeSet::parseRestriction(RuleCharacterIterator &chars,
+void UnicodeSet::parseRestriction(const UnicodeString &pattern,
+                                  RuleCharacterIterator &chars,
                                   const SymbolTable *symbols,
                                   UnicodeString &rebuiltPat,
                                   uint32_t options,
@@ -489,7 +506,7 @@ void UnicodeSet::parseRestriction(RuleCharacterIterator &chars,
     //               | Intersection ::= Restriction & UnicodeSet
     //               | Difference   ::= Restriction - UnicodeSet
     // Start by parsing the first UnicodeSet.
-    parseUnicodeSet(chars, symbols, rebuiltPat, options, caseClosure, depth + 1, ec);
+    parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth + 1, ec);
     U_UNICODESET_RETURN_IF_ERROR(ec);
     // Now keep looking for an operator that would continue the Restriction.
     // The loop terminates because when chars.atEnd(), op == DONE, so we go into the else branch and
@@ -502,7 +519,7 @@ void UnicodeSet::parseRestriction(RuleCharacterIterator &chars,
         if (!escaped && op == u'&') {
             // Intersection ::= Restriction & UnicodeSet
             UnicodeSet rightHandSide;
-            rightHandSide.parseUnicodeSet(chars, symbols, rebuiltPat, options, caseClosure,
+            rightHandSide.parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure,
                                           depth + 1, ec);
             U_UNICODESET_RETURN_IF_ERROR(ec);
             retainAll(rightHandSide);
@@ -521,7 +538,7 @@ void UnicodeSet::parseRestriction(RuleCharacterIterator &chars,
             chars.setPos(afterOperator);
             // Difference ::= Restriction - UnicodeSet
             UnicodeSet rightHandSide;
-            rightHandSide.parseUnicodeSet(chars, symbols, rebuiltPat, options, caseClosure,
+            rightHandSide.parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure,
                                           depth + 1, ec);
             U_UNICODESET_RETURN_IF_ERROR(ec);
             removeAll(rightHandSide);
@@ -533,7 +550,8 @@ void UnicodeSet::parseRestriction(RuleCharacterIterator &chars,
     }
 }
 
-void UnicodeSet::parseElements(RuleCharacterIterator &chars,
+void UnicodeSet::parseElements(const UnicodeString &pattern,
+                               RuleCharacterIterator &chars,
                                const SymbolTable *symbols,
                                UnicodeString &rebuiltPat,
                                uint32_t options,

From e147b1564bc1ed06a1d48fcfa95d8a0fdda9d5ad Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 14 Aug 2025 14:39:35 +0200
Subject: [PATCH 06/56] Pattern-rebuilding logic

---
 icu4c/source/common/unicode/uniset.h |  2 ++
 icu4c/source/common/uniset_props.cpp | 48 +++++++++++++++++++++++-----
 2 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h
index 96a9f4f9f749..2d73df2fcdac 100644
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -1725,6 +1725,7 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter {
                     uint32_t options,
                     UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                     int32_t depth,
+                    bool &containsRestrictions,
                     UErrorCode &ec);
 
     void parseTerm(const UnicodeString &pattern,
@@ -1734,6 +1735,7 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter {
                    uint32_t options,
                    UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                    int32_t depth,
+                   bool &containsRestriction,
                    UErrorCode &ec);
 
     void parseRestriction(const UnicodeString &pattern,
diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index fa3c9070831b..450c93712520 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -355,9 +355,7 @@ void UnicodeSet::applyPattern(const UnicodeString &pattern,
                               UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                               UErrorCode &ec) {
     if (U_FAILURE(ec)) return;
-    clear();
     parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, /*depth=*/0, ec);
-    _generatePattern(rebuiltPat, false);
 }
 
 void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern,
@@ -367,6 +365,7 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern,
                                  uint32_t options,
                                  UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
                                  int32_t depth, UErrorCode &ec) {
+    clear();
     U_UNICODESET_TRACE();
 
     if (depth > MAX_DEPTH) {
@@ -375,14 +374,21 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern,
     }
 
     bool isComplement = false;
+    // Whether to keep the syntax of the pattern at this level, only doing basic pretty-printing, e.g.,
+    // turn [ c - z[a]a - b ] into [c-z[a]a-b], but not into [a-z].
+    // This is true for a property query, or when there is a nested set.  Note that since we recurse,
+    // innermost sets consisting only of ranges will get simplified.
+    bool preserveSyntaxInPattern = false;
+    UnicodeString syntacticallyFaithfulPattern;
     if (resemblesPropertyPattern(chars, charsOptions(options))) {
         // UnicodeSet ::= property-query | named-element
         U_UNICODESET_TRACE("property-query | named-element");
         chars.skipIgnored(charsOptions(options));
         UnicodeSet propertyQuery;
-        propertyQuery.applyPropertyPattern(chars, rebuiltPat, ec);
+        propertyQuery.applyPropertyPattern(chars, syntacticallyFaithfulPattern, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
         addAll(propertyQuery);
+        preserveSyntaxInPattern = true;
     } else {
         UBool escaped = false;
         // TODO(egg): In PD UTS 61, add ^ to set-operator, remove [^.
@@ -393,22 +399,26 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern,
         if (escaped || c != u'[') {
             U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)", c, chars, ec);
         }
+        syntacticallyFaithfulPattern.append(u'[');
         RuleCharacterIterator::Pos afterBracket;
         chars.getPos(afterBracket);
         c = chars.next(charsOptions(options), escaped, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
         if (!escaped && c == u'^') {
+            syntacticallyFaithfulPattern.append(u'^');
             isComplement = true;
         } else {
             chars.setPos(afterBracket);
         }
-        parseUnion(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
+        parseUnion(pattern, chars, symbols, syntacticallyFaithfulPattern, options, caseClosure, depth,
+                   /*containsRestrictions=*/preserveSyntaxInPattern, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
         c = chars.next(charsOptions(options), escaped, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
         if (escaped || c != u']') {
             U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", c, chars, ec);
         }
+        syntacticallyFaithfulPattern.append(u']');
     }
 
     /**
@@ -423,6 +433,11 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern,
     if (isComplement) {
         complement().removeAllStrings();  // code point complement
     }
+    if (preserveSyntaxInPattern) {
+        rebuiltPat.append(syntacticallyFaithfulPattern);
+    } else {
+        _generatePattern(rebuiltPat, /*escapeUnprintable=*/false);
+    }
 }
 
 void UnicodeSet::parseUnion(const UnicodeString &pattern,
@@ -430,7 +445,9 @@ void UnicodeSet::parseUnion(const UnicodeString &pattern,
                             const SymbolTable *symbols,
                             UnicodeString &rebuiltPat,
                             uint32_t options,
-                            UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth,
+                            UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
+                            int32_t depth,
+                            bool &containsRestrictions,
                             UErrorCode &ec) {
     U_UNICODESET_TRACE();
     UBool escaped = false;
@@ -446,6 +463,9 @@ void UnicodeSet::parseUnion(const UnicodeString &pattern,
     U_UNICODESET_RETURN_IF_ERROR(ec);
     if (!escaped && c == u'-') {
         add(u'-');
+        // When we otherwise preserve the syntax, we escape an initial UnescapedHyphenMinus, but not a
+        // final one, for consistency with older ICU behaviour.
+        rebuiltPat.append(u"\\-");
     } else {
         chars.setPos(position);
     }
@@ -456,6 +476,7 @@ void UnicodeSet::parseUnion(const UnicodeString &pattern,
         if (!escaped && c == u'-') {
             // We can be here on the first iteration: [--] is allowed by the
             // grammar and by the old parser.
+            rebuiltPat.append(u'-');
             add(u'-');
             return;
         }
@@ -463,7 +484,8 @@ void UnicodeSet::parseUnion(const UnicodeString &pattern,
         if (!escaped && c == ']') {
             return;
         }
-        parseTerm(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
+        parseTerm(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, containsRestrictions,
+                  ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
     }
 }
@@ -475,6 +497,7 @@ void UnicodeSet::parseTerm(const UnicodeString &pattern,
                            uint32_t options,
                            UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                            int32_t depth,
+                           bool &containsRestriction,
                            UErrorCode &ec) {
     U_UNICODESET_TRACE();
     UBool escaped = false;
@@ -485,6 +508,7 @@ void UnicodeSet::parseTerm(const UnicodeString &pattern,
     const UChar32 ahead = chars.next(charsOptions(options), escaped, ec);
     chars.setPos(termStart);
     if (!escaped && ahead == '[' || resemblesPropertyPattern(chars, charsOptions(options))) {
+        containsRestriction = true;
         parseRestriction(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
     } else {
@@ -506,7 +530,10 @@ void UnicodeSet::parseRestriction(const UnicodeString &pattern,
     //               | Intersection ::= Restriction & UnicodeSet
     //               | Difference   ::= Restriction - UnicodeSet
     // Start by parsing the first UnicodeSet.
-    parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth + 1, ec);
+    UnicodeSet leftHandSide;
+    leftHandSide.parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth + 1,
+                                 ec);
+    addAll(leftHandSide);
     U_UNICODESET_RETURN_IF_ERROR(ec);
     // Now keep looking for an operator that would continue the Restriction.
     // The loop terminates because when chars.atEnd(), op == DONE, so we go into the else branch and
@@ -518,6 +545,7 @@ void UnicodeSet::parseRestriction(const UnicodeString &pattern,
         U_UNICODESET_RETURN_IF_ERROR(ec);
         if (!escaped && op == u'&') {
             // Intersection ::= Restriction & UnicodeSet
+            rebuiltPat.append(u'&');
             UnicodeSet rightHandSide;
             rightHandSide.parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure,
                                           depth + 1, ec);
@@ -537,6 +565,7 @@ void UnicodeSet::parseRestriction(const UnicodeString &pattern,
             }
             chars.setPos(afterOperator);
             // Difference ::= Restriction - UnicodeSet
+            rebuiltPat.append(u'-');
             UnicodeSet rightHandSide;
             rightHandSide.parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure,
                                           depth + 1, ec);
@@ -597,6 +626,7 @@ void UnicodeSet::parseElements(const UnicodeString &pattern,
             break;
         }
     }
+    _appendToPat(rebuiltPat, first, /*escapeUnprintable=*/false);
     RuleCharacterIterator::Pos beforeOperator;
     chars.getPos(beforeOperator);
     const UChar32 op = chars.next(charsOptions(options), escaped, ec);
@@ -618,8 +648,9 @@ void UnicodeSet::parseElements(const UnicodeString &pattern,
         add(first);
         return;
     }
+    // Elements ::= Range ::= RangeElement - RangeElement
+    rebuiltPat.append(u'-');
     const UChar32 last = ahead;
-    U_UNICODESET_RETURN_IF_ERROR(ec);
     if (!escaped) {
         switch (last) {
         case u'-':
@@ -635,6 +666,7 @@ void UnicodeSet::parseElements(const UnicodeString &pattern,
             break;
         }
     }
+    _appendToPat(rebuiltPat, last, /*escapeUnprintable=*/false);
     if (last <= first) {
         U_UNICODESET_RETURN_WITH_PARSE_ERROR("first < last in Range",
                                  UnicodeString(last) + u"-" + UnicodeString(first), chars, ec);

From f47cac412d7ea174fc336e1ab28a8dd0c1623fa8 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 14 Aug 2025 14:48:52 +0200
Subject: [PATCH 07/56] More tests of toPattern

---
 icu4c/source/test/intltest/usettest.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index fa9c00897865..0262aca5b0ca 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -4351,11 +4351,14 @@ void UnicodeSetTest::TestToPatternOutput() {
              {u"[ - - ]", uR"([\-])"},
              {u"[ - _ - ]", uR"([\-_])"},
              {u"[ - + - ]", uR"([+\-])"},
+             {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"},
+             {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"},
              // A property-query or named-element is kept as-is:
              {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"},
              {uR"(\p{P})", uR"(\p{P})"},
              {uR"(\p{gc=P})", uR"(\p{gc=P})"},
              {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"},
+             {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"},
              {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"},
              {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"},
              // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are
@@ -4367,6 +4370,14 @@ void UnicodeSetTest::TestToPatternOutput() {
               uR"([c-za-b\p{ General_Category = Punctuation }])"},
              {u"[^[c]]", uR"([^[c]])"},
              {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
+             // Spaces are eliminated within a string-literal even when the syntax is preserved.
+             {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"},
+             // Escapes are removed even when the syntax is preserved.
+             {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
+              u"[{Zeichenkette}[]Zeichenmenge]"},
+             // A named-element is currently a nested set, so it is preserved and causes the syntax to be
+             // preserved.
+             {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"},
          }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);

From e3efa59d4940d63b3280cca3124de39aba4b9709 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 13 Aug 2025 21:03:33 +0200
Subject: [PATCH 08/56] ICU-22851 Test the exact behaviour of
 UnicodeSet::toPattern

---
 icu4c/source/test/intltest/usettest.cpp | 55 +++++++++++++++++++++++++
 icu4c/source/test/intltest/usettest.h   |  1 +
 2 files changed, 56 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index da32687987e8..0262aca5b0ca 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
     TESTCASE_AUTO(TestRangeIterator);
     TESTCASE_AUTO(TestStringIterator);
     TESTCASE_AUTO(TestElementIterator);
+    TESTCASE_AUTO(TestToPatternOutput);
     TESTCASE_AUTO(TestParseErrors);
     TESTCASE_AUTO_END;
 }
@@ -4336,6 +4337,60 @@ void UnicodeSetTest::TestElementIterator() {
     // in a header-only unit test file.
 }
 
+void UnicodeSetTest::TestToPatternOutput() {
+    for (const auto [expression, expected] :
+         std::vector<std::pair<std::u16string_view, std::u16string_view>>{
+             // For a UnicodeSet which is neither a property-query nor a named-element and has no
+             // Restriction among its Terms (that is, whose Union consists solely of a sequence of
+             // Elements and UnescapedHyphenMinus), toPattern merges and sorts ranges, and introduces a
+             // complement to minimize the result.
+             {u"[c-za-b]", u"[a-z]"},
+             {u"[  c - z  a - b  ]", u"[a-z]"},
+             {uR"([ ^ \u0000-b d-\U0010FFFF ])", u"[c]"},
+             {uR"([ \u0000-b d-\U0010FFFF ])", u"[^c]"},
+             {u"[ - - ]", uR"([\-])"},
+             {u"[ - _ - ]", uR"([\-_])"},
+             {u"[ - + - ]", uR"([+\-])"},
+             {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"},
+             {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"},
+             // A property-query or named-element is kept as-is:
+             {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"},
+             {uR"(\p{P})", uR"(\p{P})"},
+             {uR"(\p{gc=P})", uR"(\p{gc=P})"},
+             {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"},
+             {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"},
+             {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"},
+             {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"},
+             // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are
+             // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped.
+             // This is applied recursively, so innermost ranges-only UnicodeSets get normalized.
+             {u"[ c-z a-b [c-f g-z] ]", u"[c-za-b[c-z]]"},
+             {u"[- + c-z a-b [c-f g-z] -]", uR"([\-+c-za-b[c-z]-])"},
+             {uR"([ c-z a-b \p{ General_Category = Punctuation } ])",
+              uR"([c-za-b\p{ General_Category = Punctuation }])"},
+             {u"[^[c]]", uR"([^[c]])"},
+             {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
+             // Spaces are eliminated within a string-literal even when the syntax is preserved.
+             {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"},
+             // Escapes are removed even when the syntax is preserved.
+             {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
+              u"[{Zeichenkette}[]Zeichenmenge]"},
+             // A named-element is currently a nested set, so it is preserved and causes the syntax to be
+             // preserved.
+             {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"},
+         }) {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, errorCode);
+        UnicodeString actual;
+        if (U_FAILURE(errorCode)) {
+            errln(u"Failed to parse " + expression + u": " + u_errorName(errorCode));
+        } else if (set.toPattern(actual) != expected) {
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expected + ", got " +
+                  actual);
+        }
+    }
+}
+
 void UnicodeSetTest::TestParseErrors() {
     for (const auto expression : std::vector<std::u16string_view>{
              // Java error message: "Char expected after operator".
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h
index 4c5b55a329bb..692aa8b9e84d 100644
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@@ -110,6 +110,7 @@ class UnicodeSetTest: public IntlTest {
     void TestStringIterator();
     void TestElementIterator();
 
+    void TestToPatternOutput();
     void TestParseErrors();
 
 private:

From a7a403581e2056450fad2ed77bf00f75d14b29f8 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 14 Aug 2025 15:01:03 +0200
Subject: [PATCH 09/56] Print strings

---
 icu4c/source/common/uniset_props.cpp | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 450c93712520..562ce16db9f0 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -379,13 +379,14 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern,
     // This is true for a property query, or when there is a nested set.  Note that since we recurse,
     // innermost sets consisting only of ranges will get simplified.
     bool preserveSyntaxInPattern = false;
-    UnicodeString syntacticallyFaithfulPattern;
+    // A pattern that preserves the original syntax but strips spaces, normalizes escaping, etc.
+    UnicodeString prettyPrintedPattern;
     if (resemblesPropertyPattern(chars, charsOptions(options))) {
         // UnicodeSet ::= property-query | named-element
         U_UNICODESET_TRACE("property-query | named-element");
         chars.skipIgnored(charsOptions(options));
         UnicodeSet propertyQuery;
-        propertyQuery.applyPropertyPattern(chars, syntacticallyFaithfulPattern, ec);
+        propertyQuery.applyPropertyPattern(chars, prettyPrintedPattern, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
         addAll(propertyQuery);
         preserveSyntaxInPattern = true;
@@ -399,18 +400,18 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern,
         if (escaped || c != u'[') {
             U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)", c, chars, ec);
         }
-        syntacticallyFaithfulPattern.append(u'[');
+        prettyPrintedPattern.append(u'[');
         RuleCharacterIterator::Pos afterBracket;
         chars.getPos(afterBracket);
         c = chars.next(charsOptions(options), escaped, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
         if (!escaped && c == u'^') {
-            syntacticallyFaithfulPattern.append(u'^');
+            prettyPrintedPattern.append(u'^');
             isComplement = true;
         } else {
             chars.setPos(afterBracket);
         }
-        parseUnion(pattern, chars, symbols, syntacticallyFaithfulPattern, options, caseClosure, depth,
+        parseUnion(pattern, chars, symbols, prettyPrintedPattern, options, caseClosure, depth,
                    /*containsRestrictions=*/preserveSyntaxInPattern, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
         c = chars.next(charsOptions(options), escaped, ec);
@@ -418,7 +419,7 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern,
         if (escaped || c != u']') {
             U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", c, chars, ec);
         }
-        syntacticallyFaithfulPattern.append(u']');
+        prettyPrintedPattern.append(u']');
     }
 
     /**
@@ -434,7 +435,7 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern,
         complement().removeAllStrings();  // code point complement
     }
     if (preserveSyntaxInPattern) {
-        rebuiltPat.append(syntacticallyFaithfulPattern);
+        rebuiltPat.append(prettyPrintedPattern);
     } else {
         _generatePattern(rebuiltPat, /*escapeUnprintable=*/false);
     }
@@ -607,15 +608,18 @@ void UnicodeSet::parseElements(const UnicodeString &pattern,
         case u'^':
             U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement | string-literal", first, chars, ec);
         case u'{': {
+            rebuiltPat.append(u'{');
             UnicodeString string;
             UChar32 c;
             while (!chars.atEnd()) {
                 c = chars.next(charsOptions(options), escaped, ec);
                 U_UNICODESET_RETURN_IF_ERROR(ec);
                 if (!escaped && c == u'}') {
+                    rebuiltPat.append(u'}');
                     add(string);
                     return;
                 }
+                _appendToPat(rebuiltPat, c, /*escapeUnprintable=*/false);
                 string.append(c);
             }
             U_UNICODESET_RETURN_WITH_PARSE_ERROR("}", RuleCharacterIterator::DONE, chars, ec);

From cef298e85d270f946da33c7a64d787c64eb4a004 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 14 Aug 2025 15:03:59 +0200
Subject: [PATCH 10/56] Appease the warnings even though these are string_views

---
 icu4c/source/test/intltest/usettest.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 0262aca5b0ca..3b0e1dc32fe1 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -4338,7 +4338,7 @@ void UnicodeSetTest::TestElementIterator() {
 }
 
 void UnicodeSetTest::TestToPatternOutput() {
-    for (const auto [expression, expected] :
+    for (const auto &[expression, expected] :
          std::vector<std::pair<std::u16string_view, std::u16string_view>>{
              // For a UnicodeSet which is not a property-query nor a named-element and without any
              // Restriction among its Terms (that is, whose Union consists solely a sequence of Elements

From b4e365b91e47287f320e2eeb3fdd675cf62992e1 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 13 Aug 2025 21:03:33 +0200
Subject: [PATCH 11/56] ICU-22851 Test the exact behaviour of
 UnicodeSet::toPattern

---
 icu4c/source/test/intltest/usettest.cpp | 55 +++++++++++++++++++++++++
 icu4c/source/test/intltest/usettest.h   |  1 +
 2 files changed, 56 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index da32687987e8..3b0e1dc32fe1 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
     TESTCASE_AUTO(TestRangeIterator);
     TESTCASE_AUTO(TestStringIterator);
     TESTCASE_AUTO(TestElementIterator);
+    TESTCASE_AUTO(TestToPatternOutput);
     TESTCASE_AUTO(TestParseErrors);
     TESTCASE_AUTO_END;
 }
@@ -4336,6 +4337,60 @@ void UnicodeSetTest::TestElementIterator() {
     // in a header-only unit test file.
 }
 
+void UnicodeSetTest::TestToPatternOutput() {
+    for (const auto &[expression, expected] :
+         std::vector<std::pair<std::u16string_view, std::u16string_view>>{
+             // For a UnicodeSet which is neither a property-query nor a named-element and has no
+             // Restriction among its Terms (that is, whose Union consists solely of a sequence of
+             // Elements and UnescapedHyphenMinus), toPattern merges and sorts ranges, and introduces a
+             // complement to minimize the result.
+             {u"[c-za-b]", u"[a-z]"},
+             {u"[  c - z  a - b  ]", u"[a-z]"},
+             {uR"([ ^ \u0000-b d-\U0010FFFF ])", u"[c]"},
+             {uR"([ \u0000-b d-\U0010FFFF ])", u"[^c]"},
+             {u"[ - - ]", uR"([\-])"},
+             {u"[ - _ - ]", uR"([\-_])"},
+             {u"[ - + - ]", uR"([+\-])"},
+             {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"},
+             {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"},
+             // A property-query or named-element is kept as-is:
+             {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"},
+             {uR"(\p{P})", uR"(\p{P})"},
+             {uR"(\p{gc=P})", uR"(\p{gc=P})"},
+             {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"},
+             {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"},
+             {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"},
+             {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"},
+             // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are
+             // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped.
+             // This is applied recursively, so innermost ranges-only UnicodeSets get normalized.
+             {u"[ c-z a-b [c-f g-z] ]", u"[c-za-b[c-z]]"},
+             {u"[- + c-z a-b [c-f g-z] -]", uR"([\-+c-za-b[c-z]-])"},
+             {uR"([ c-z a-b \p{ General_Category = Punctuation } ])",
+              uR"([c-za-b\p{ General_Category = Punctuation }])"},
+             {u"[^[c]]", uR"([^[c]])"},
+             {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
+             // Spaces are eliminated within a string-literal even when the syntax is preserved.
+             {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"},
+             // Escapes are removed even when the syntax is preserved.
+             {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
+              u"[{Zeichenkette}[]Zeichenmenge]"},
+             // A named-element is currently a nested set, so it is preserved and causes the syntax to be
+             // preserved.
+             {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"},
+         }) {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, errorCode);
+        UnicodeString actual;
+        if (U_FAILURE(errorCode)) {
+            errln(u"Failed to parse " + expression + u": " + u_errorName(errorCode));
+        } else if (set.toPattern(actual) != expected) {
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expected + ", got " +
+                  actual);
+        }
+    }
+}
+
 void UnicodeSetTest::TestParseErrors() {
     for (const auto expression : std::vector<std::u16string_view>{
              // Java error message: "Char expected after operator".
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h
index 4c5b55a329bb..692aa8b9e84d 100644
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@@ -110,6 +110,7 @@ class UnicodeSetTest: public IntlTest {
     void TestStringIterator();
     void TestElementIterator();
 
+    void TestToPatternOutput();
     void TestParseErrors();
 
 private:

From cef20932f17429a5dab01a14db63266577b075b7 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 14 Aug 2025 15:51:47 +0200
Subject: [PATCH 12/56] ICU-22851 Test various edge cases with $ in the absence
 of variables

---
 icu4c/source/test/intltest/usettest.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 3b0e1dc32fe1..55a23782337a 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -4353,6 +4353,10 @@ void UnicodeSetTest::TestToPatternOutput() {
              {u"[ - + - ]", uR"([+\-])"},
              {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"},
              {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"},
+             {u"[$d-za-c]", uR"([\$a-z])"},
+             {u"[a-c$d-z]", uR"([\$a-z])"},
+             {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"},
+             {u"[!-$z]", uR"([!-\$z])"},
              // A property-query or named-element is kept as-is:
              {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"},
              {uR"(\p{P})", uR"(\p{P})"},
@@ -4378,6 +4382,8 @@ void UnicodeSetTest::TestToPatternOutput() {
              // A named-element is currently a nested set, so it is preserved and causes the syntax to be
              // preserved.
              {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"},
+             // An anchor also causes the syntax to be preserved.
+             {u"[ d-z a-c $ ]", u"[d-za-c$]"},
          }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);
@@ -4416,6 +4422,7 @@ void UnicodeSetTest::TestParseErrors() {
              u"[{aa]",
              // "Unquoted '$'".
              u"[a-$]",
+             u"[!-$]",
              // "Invalid range".
              u"[a-a]",  // TODO(egg): Exclude in PDUTS61.
              u"[z-a]",

From 9e126dda9735aef35de47882c4bbaca791eaed65 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 14 Aug 2025 16:13:23 +0200
Subject: [PATCH 13/56] $ handling

---
 icu4c/source/common/uniset_props.cpp | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 562ce16db9f0..6054c32c2283 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -480,6 +480,18 @@ void UnicodeSet::parseUnion(const UnicodeString &pattern,
             rebuiltPat.append(u'-');
             add(u'-');
             return;
+        } else if (!escaped && c == u'$') {
+            RuleCharacterIterator::Pos afterDollar;
+            chars.getPos(afterDollar);
+            c = chars.next(charsOptions(options), escaped, ec);
+            if (!escaped && c == u']') {
+                // An unescaped $ at the end of a Union is an anchor.
+                rebuiltPat.append(u'$');
+                chars.setPos(afterDollar);
+                add(U_ETHER);
+                containsRestrictions = true;
+                return;
+            }
         }
         chars.setPos(position);
         if (!escaped && c == ']') {
@@ -607,6 +619,7 @@ void UnicodeSet::parseElements(const UnicodeString &pattern,
         case u']':
         case u'^':
             U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement | string-literal", first, chars, ec);
+            // Note: an unescaped '$' is not rejected here; it is accepted via `case u'$'` below.
         case u'{': {
             rebuiltPat.append(u'{');
             UnicodeString string;
@@ -625,6 +638,7 @@ void UnicodeSet::parseElements(const UnicodeString &pattern,
             U_UNICODESET_RETURN_WITH_PARSE_ERROR("}", RuleCharacterIterator::DONE, chars, ec);
         }
         case u'}':
+        case u'$':
             // Disallowed by UTS #61, but historically accepted by ICU.  This is an extension.
         default:
             break;
@@ -664,6 +678,19 @@ void UnicodeSet::parseElements(const UnicodeString &pattern,
         case u'^':
         case u'{':
             U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, chars, ec);
+        case u'$': {
+            // Disallowed by UTS #61, but historically accepted by ICU except at the end of a Union.
+            // This is an extension.
+            RuleCharacterIterator::Pos afterDollar;
+            chars.getPos(afterDollar);
+            UChar32 c = chars.next(charsOptions(options), escaped, ec);
+            chars.setPos(afterDollar);
+            if (!escaped && c == u']') {
+                U_UNICODESET_RETURN_WITH_PARSE_ERROR("Term after Range ending in unescaped $", c, chars,
+                                                     ec);
+            }
+            break;
+        }
         case u'}':
             // Disallowed by UTS #61, but historically accepted by ICU.  This is an extension.
         default:

From c8d2b9eb852ceb1367d809b858e30b691ed81ab7 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 14 Aug 2025 16:15:01 +0200
Subject: [PATCH 14/56] comment

---
 icu4c/source/common/uniset_props.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 6054c32c2283..3984ee788ee8 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -485,7 +485,8 @@ void UnicodeSet::parseUnion(const UnicodeString &pattern,
             chars.getPos(afterDollar);
             c = chars.next(charsOptions(options), escaped, ec);
             if (!escaped && c == u']') {
-                // An unescaped $ at the end of a Union is an anchor.
+                // ICU extensions: A $ is allowed as a literal-element.
+                // A Term at the end of a Union consisting of a single $ is an anchor.
                 rebuiltPat.append(u'$');
                 chars.setPos(afterDollar);
                 add(U_ETHER);

From bbcc2316c31339d198fbc501823948c7b2302aa0 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 14 Aug 2025 17:07:35 +0200
Subject: [PATCH 15/56] ICU-22851 Even more $ edge cases

---
 icu4c/source/test/intltest/usettest.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 55a23782337a..5415940918ad 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -4357,6 +4357,8 @@ void UnicodeSetTest::TestToPatternOutput() {
              {u"[a-c$d-z]", uR"([\$a-z])"},
              {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"},
              {u"[!-$z]", uR"([!-\$z])"},
+             {u"[-a-cd-z$-]", uR"([\$\-a-z])"},
+             {u"[-$-]", uR"([\$\-])"},
              // A property-query or named-element is kept as-is:
              {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"},
              {uR"(\p{P})", uR"(\p{P})"},
@@ -4374,6 +4376,7 @@ void UnicodeSetTest::TestToPatternOutput() {
               uR"([c-za-b\p{ General_Category = Punctuation }])"},
              {u"[^[c]]", uR"([^[c]])"},
              {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
+             {u"[$[]]", uR"([\$[]])"},
              // Spaces are eliminated within a string-literal even when the syntax is preserved.
              {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"},
              // Escapes are removed even when the syntax is preserved.
@@ -4384,6 +4387,8 @@ void UnicodeSetTest::TestToPatternOutput() {
              {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"},
              // An anchor also causes the syntax to be preserved.
              {u"[ d-z a-c $ ]", u"[d-za-c$]"},
+             {u"[ - a-c d-z $ ]", uR"([\-a-cd-z$])"},
+             {u"[$$$]", uR"([\$\$$])"},
          }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);

From 876d338542643b2607d58922ae6f748331d544a3 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 14 Aug 2025 15:51:47 +0200
Subject: [PATCH 16/56] ICU-22851 Test various edge cases with $ in the absence
 of variables

---
 icu4c/source/test/intltest/usettest.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 3b0e1dc32fe1..5415940918ad 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -4353,6 +4353,12 @@ void UnicodeSetTest::TestToPatternOutput() {
              {u"[ - + - ]", uR"([+\-])"},
              {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"},
              {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"},
+             {u"[$d-za-c]", uR"([\$a-z])"},
+             {u"[a-c$d-z]", uR"([\$a-z])"},
+             {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"},
+             {u"[!-$z]", uR"([!-\$z])"},
+             {u"[-a-cd-z$-]", uR"([\$\-a-z])"},
+             {u"[-$-]", uR"([\$\-])"},
              // A property-query or named-element is kept as-is:
              {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"},
              {uR"(\p{P})", uR"(\p{P})"},
@@ -4370,6 +4376,7 @@ void UnicodeSetTest::TestToPatternOutput() {
               uR"([c-za-b\p{ General_Category = Punctuation }])"},
              {u"[^[c]]", uR"([^[c]])"},
              {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
+             {u"[$[]]", uR"([\$[]])"},
              // Spaces are eliminated within a string-literal even when the syntax is preserved.
              {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"},
              // Escapes are removed even when the syntax is preserved.
@@ -4378,6 +4385,10 @@ void UnicodeSetTest::TestToPatternOutput() {
              // A named-element is currently a nested set, so it is preserved and causes the syntax to be
              // preserved.
              {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"},
+             // An anchor also causes the syntax to be preserved.
+             {u"[ d-z a-c $ ]", u"[d-za-c$]"},
+             {u"[ - a-c d-z $ ]", uR"([\-a-cd-z$])"},
+             {u"[$$$]", uR"([\$\$$])"},
          }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);
@@ -4416,6 +4427,7 @@ void UnicodeSetTest::TestParseErrors() {
              u"[{aa]",
              // "Unquoted '$'".
              u"[a-$]",
+             u"[!-$]",
              // "Invalid range".
              u"[a-a]",  // TODO(egg): Exclude in PDUTS61.
              u"[z-a]",

From a6d9182ebab5f44249434128ef9c9627c817d2cc Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 14 Aug 2025 20:53:20 +0200
Subject: [PATCH 17/56] ICU-22851 Test UnicodeSet with lookupMatcher

---
 icu4c/source/test/intltest/usettest.cpp | 203 +++++++++++++++++++++++-
 icu4c/source/test/intltest/usettest.h   |   1 +
 2 files changed, 203 insertions(+), 1 deletion(-)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 016d3f85e63d..06de9e315aac 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -14,6 +14,7 @@
 #include <stdio.h>
 #include <string.h>
 
+#include <array>
 #include <string_view>
 #include <unordered_map>
 
@@ -93,6 +94,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
     TESTCASE_AUTO(TestEscapePattern);
     TESTCASE_AUTO(TestInvalidCodePoint);
     TESTCASE_AUTO(TestSymbolTable);
+    TESTCASE_AUTO(TestLookupSymbolTable);
     TESTCASE_AUTO(TestSurrogate);
     TESTCASE_AUTO(TestPosixClasses);
     TESTCASE_AUTO(TestIteration);
@@ -1753,10 +1755,20 @@ void UnicodeSetTest::TestSymbolTable() {
     // Multiple test cases can be set up here.  Each test case
     // is terminated by null:
     // var, value, var, value,..., input pat., exp. output pat., null
-    const char* DATA[] = {
+    const char *DATA[] = {
         "us", "a-z", "[0-1$us]", "[0-1a-z]", nullptr,
         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", nullptr,
         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", nullptr,
+        // Things that probably should not work, but currently do:
+        "open", "[", "$open a-z]", "[a-z]", nullptr,
+        "open", "[", "close", "]", "hyphenMinus", "-",
+            "[ $open a $hyphenMinus z] $hyphenMinus [ c-z $close $hyphenMinus ]",
+            "[[a-z]-[c-z]-]", nullptr,
+        "string", "{", "end", "}", "[ $string Zeichenkette $end ]", "[{Zeichenkette}]", nullptr,
+        "privateUse", "[[:Co:]]", "$privateUse", "[[:Co:]]", nullptr,
+        "smiling", ":-]", "laughing", ":-D",
+            "[ {$smiling} $laughing $smiling",
+            R"([\-\:-D{\:\-\]}])", nullptr,
         nullptr
     };
 
@@ -1811,6 +1823,195 @@ void UnicodeSetTest::TestSymbolTable() {
             logln(UnicodeString("Ok, got ") + us.toPattern(a, true));
         }
     }
+    for (const auto &[variables, expression, expectedErrorCode, expectedPattern] :
+         std::vector<std::tuple<std::vector<std::pair<std::u16string_view, std::u16string_view>>,
+                                std::u16string_view, UErrorCode, std::u16string_view>>{
+             // You should not do this, but it works.
+             {{{u"privateUseOrUnassigned", u"[[:Co:][:Cn:]"}, {u"close", u"]"}},
+              u"$privateUseOrUnassigned$close",
+              U_ZERO_ERROR,
+              u"[[:Co:][:Cn:]]"},
+             // This works and it is fine.
+             {{{u"privateUse", u"[[:Co:]]"}}, u"$privateUse", U_ZERO_ERROR, u"[[:Co:]]"},
+             // This should work! But it does not. Note the doubled brackets on the one that works above.
+             // We are not yet inside the variable when we call lookahead(), so we try to parse
+             // $privateUse rather than [:Co:].
+             {{{u"privateUse", u"[:Co:]"}}, u"[$privateUse]", U_ILLEGAL_ARGUMENT_ERROR, u"[]"},
+             // This should not work, and it does not (we try to parse [$sad$surprised] as a
+             // property-query).
+             {{{u"sad", u":C"}, {u"surprised", u"o:"}},
+              u"[$sad$surprised]",
+              U_ILLEGAL_ARGUMENT_ERROR,
+              u"[]"},
+         }) {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        TokenSymbolTable symbols(errorCode);
+        if (U_FAILURE(errorCode)) {
+            errln("FAIL: Couldn’t construct symbol table");
+            continue;
+        }
+        for (const auto &[name, value] : variables) {
+            symbols.add(name, value, errorCode);
+            if (U_FAILURE(errorCode)) {
+                errln("FAIL: Couldn’t add variable " + name);
+                continue;
+            }
+        }
+        const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
+        if (errorCode != expectedErrorCode) {
+            errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " +
+                  u_errorName(errorCode));
+        }
+        UnicodeString actual;
+        if (set.toPattern(actual) != expectedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern +
+                  ", got " + actual);
+        }
+    }
+}
+
+void UnicodeSetTest::TestLookupSymbolTable() {
+    UErrorCode errorCode = U_ZERO_ERROR;
+    class TestSymbolTable : public SymbolTable {  // Binds single code points to UnicodeSets.
+      public:
+        const UnicodeString *lookup(const UnicodeString &) const override {
+            return nullptr;  // No variable is ever resolved, whatever name is passed.
+        }
+
+        const UnicodeFunctor *lookupMatcher(UChar32 c) const override {
+            return symbols_.find(c) != symbols_.end() ? &symbols_.at(c)
+                                                                    : nullptr;  // No set bound to c.
+        }
+
+        virtual UnicodeString parseReference(const UnicodeString &, ParsePosition &,
+                                             int32_t) const override {
+            return u"";  // Always the empty string; the arguments are ignored.
+        }
+
+        void add(UChar32 c, UnicodeSet set) {
+            symbols_[c] = set;  // Inserts, or overwrites an earlier binding for c.
+        }
+
+      private:
+        std::unordered_map<UChar32, UnicodeSet> symbols_;  // Sets returned by lookupMatcher().
+    };
+    TestSymbolTable symbols;
+    symbols.add(u'0', UnicodeSet(u"[ a-z ]", errorCode));
+    symbols.add(u'1', UnicodeSet(u"[ b-c ]", errorCode));
+    symbols.add(u'2', UnicodeSet(u"[: Co :]", errorCode));
+    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
+         std::vector<std::tuple<std::u16string_view, UErrorCode, std::u16string_view, std::u16string_view>>{
+             {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"},
+             {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]"},
+             {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]"},
+             // Substitution of lookupMatcher symbols takes place after de-escaping.
+             {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]"},
+             // It does not take place in string literals.
+             {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]"},
+             {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]"},
+             {uR"([ 21 ])", U_ZERO_ERROR, u"[[: Co :][bc]]",
+              u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"},
+             {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]"},
+         }) {
+        UnicodeString actual;
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
+        if (errorCode != expectedErrorCode) {
+            errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " +
+                  u_errorName(errorCode));
+        }
+        if (set.toPattern(actual) != expectedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern +
+                  ", got " + actual);
+        }
+        if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression +
+                  u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern +
+                  ", got " + actual);
+        }
+    }
+    // Test what happens when we define syntax characters as symbols.  It is an extraordinarily bad idea
+    // to rely on this behaviour, but since it has been around since ICU 2.8, we probably should not
+    // change it unknowingly.
+    symbols.add(u'-', UnicodeSet(u"[{hyphenMinus}]", errorCode));
+    symbols.add(u'&', UnicodeSet(u"[{ampersand}]", errorCode));
+    // This one is never used, except if escaped.
+    symbols.add(u'[', UnicodeSet(u"[{leftSquareBracket}]", errorCode));
+    symbols.add(u'^', UnicodeSet(u"[{circumflexAccent}]", errorCode));
+    symbols.add(u'{', UnicodeSet(u"[{leftCurlyBracket}]", errorCode));
+    symbols.add(u'}', UnicodeSet(u"[{rightCurlyBracket}]", errorCode));
+    symbols.add(u'$', UnicodeSet(u"[{dollarSign}]", errorCode));
+    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
+         std::vector<
+             std::tuple<std::u16string_view, UErrorCode, std::u16string_view, std::u16string_view>>{
+             {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"},
+             {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"},
+             // The hyphen no longer works as set difference.
+             {u"[0-1]", U_ZERO_ERROR, u"[[a-z][{hyphenMinus}][bc]]", u"[a-z{hyphenMinus}]"},
+             {u"[!-0]", U_ZERO_ERROR, u"[![{hyphenMinus}][a-z]]", u"[!a-z{hyphenMinus}]"},
+             // String literals no longer work.
+             {uR"([!-/{0}])", U_ZERO_ERROR,
+              u"[![{hyphenMinus}]/[{leftCurlyBracket}][a-z][{rightCurlyBracket}]]",
+              u"[!/a-z{hyphenMinus}{leftCurlyBracket}{rightCurlyBracket}]"},
+             // The ampersand no longer works as set intersection.
+             {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :][{ampersand}][bc]]",
+              u"[bc-󰀀-󿿽􀀀-􏿽{ampersand}]"},
+              // Complementing still works.
+             {uR"([^ \u0000 ])", U_ZERO_ERROR, uR"([\u0001-\U0010FFFF])",
+              uR"([\u0001-\U0010FFFF])"},
+              // ^ elsewhere becomes a symbol rather than a syntax error.
+             {uR"([\u0000 ^ -])", U_ZERO_ERROR, uR"([\u0000[{circumflexAccent}][{hyphenMinus}]])",
+              uR"([\u0000{circumflexAccent}{hyphenMinus}])"},
+             // Opening brackets still work.
+             {uR"([^ [ [^] ] ])", U_ZERO_ERROR, uR"([^[[\u0000-\U0010FFFF]]])", uR"([])"},
+             // The only way to access the [ symbol is via escaping.
+             {uR"([ \[ ])", U_ZERO_ERROR, uR"([[{leftSquareBracket}]])", uR"([{leftSquareBracket}])"},
+             // Anchors are gone.
+             {uR"([$])", U_ZERO_ERROR, uR"([[{dollarSign}]])", uR"([{dollarSign}])"},
+         }) {
+        UnicodeString actual;
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
+        if (errorCode != expectedErrorCode) {
+            errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " +
+                  u_errorName(errorCode));
+        }
+        if (set.toPattern(actual) != expectedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern +
+                  ", got " + actual);
+        }
+        if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression +
+                  u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern +
+                  ", got " + actual);
+        }
+    }
+    // If ] is defined as a symbol, everything breaks except a lone symbol or property-query, and the
+    // constructor returns an error but not an empty set. Don’t do that.
+    symbols.add(u']', UnicodeSet(u"[{rightSquareBracket}]", errorCode));
+    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
+         std::vector<
+             std::tuple<std::u16string_view, UErrorCode, std::u16string_view, std::u16string_view>>{
+             {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
+             {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
+         }) {
+        UnicodeString actual;
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
+        if (errorCode != expectedErrorCode) {
+            errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " +
+                  u_errorName(errorCode));
+        }
+        if (set.toPattern(actual) != expectedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern +
+                  ", got " + actual);
+        }
+        if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression +
+                  u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern +
+                  ", got " + actual);
+        }
+    }
 }
 
 void UnicodeSetTest::TestSurrogate() {
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h
index 2ac22ba72e62..32abf828a30a 100644
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@@ -84,6 +84,7 @@ class UnicodeSetTest: public IntlTest {
     void TestInvalidCodePoint();
 
     void TestSymbolTable();
+    void TestLookupSymbolTable();
 
     void TestSurrogate();
 

From e81735cc26740ebf9e2620354de2b24c91f76767 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Fri, 15 Aug 2025 16:57:09 +0200
Subject: [PATCH 18/56] Something that works in the same silly way as it used
 to.

---
 icu4c/source/common/uniset_props.cpp | 114 +++++++++++++++++----------
 1 file changed, 72 insertions(+), 42 deletions(-)

diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 3984ee788ee8..8c4b13f18e71 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -250,6 +250,13 @@ constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) {
     return opts;
 }
 
+// Returns the UnicodeSet that `symbols` binds to `c` through lookupMatcher(); yields nullptr when
+// there is no symbol table or when the lookup result is not a UnicodeSet.
+const UnicodeSet *getMatcherSymbol(const SymbolTable *const symbols, const UChar32 c) {
+    const auto *const matcher = symbols == nullptr ? nullptr : symbols->lookupMatcher(c);
+    return dynamic_cast<const UnicodeSet *>(matcher);
+}
+
 #if 0
 #define U_UNICODESET_TRACE(...)                                                                         \
     struct UnicodeSetParserTrace {                                                                      \
@@ -395,31 +402,43 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern,
         // TODO(egg): In PD UTS 61, add ^ to set-operator, remove [^.
         // UnicodeSet ::=                [   Union ]
         //              | Complement ::= [ ^ Union ]
+        // Extension:
+        //              | MatcherSymbol
+        // Where a MatcherSymbol may be a character or an escape.
+        // Strings that would match MatcherSymbol effectively get removed from
+        // all other terminals of the grammar, except [.
         UChar32 c = chars.next(charsOptions(options), escaped, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
-        if (escaped || c != u'[') {
-            U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)", c, chars, ec);
-        }
-        prettyPrintedPattern.append(u'[');
-        RuleCharacterIterator::Pos afterBracket;
-        chars.getPos(afterBracket);
-        c = chars.next(charsOptions(options), escaped, ec);
-        U_UNICODESET_RETURN_IF_ERROR(ec);
-        if (!escaped && c == u'^') {
-            prettyPrintedPattern.append(u'^');
-            isComplement = true;
+        if (!escaped && c == u'[') {
+            prettyPrintedPattern.append(u'[');
+            RuleCharacterIterator::Pos afterBracket;
+            chars.getPos(afterBracket);
+            c = chars.next(charsOptions(options), escaped, ec);
+            U_UNICODESET_RETURN_IF_ERROR(ec);
+            if (!escaped && c == u'^') {
+                prettyPrintedPattern.append(u'^');
+                isComplement = true;
+            } else {
+                chars.setPos(afterBracket);
+            }
+            parseUnion(pattern, chars, symbols, prettyPrintedPattern, options, caseClosure, depth,
+                       /*containsRestrictions=*/preserveSyntaxInPattern, ec);
+            U_UNICODESET_RETURN_IF_ERROR(ec);
+            c = chars.next(charsOptions(options), escaped, ec);
+            U_UNICODESET_RETURN_IF_ERROR(ec);
+            if (escaped || c != u']') {
+                U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", c, chars, ec);
+            }
+            prettyPrintedPattern.append(u']');
         } else {
-            chars.setPos(afterBracket);
-        }
-        parseUnion(pattern, chars, symbols, prettyPrintedPattern, options, caseClosure, depth,
-                   /*containsRestrictions=*/preserveSyntaxInPattern, ec);
-        U_UNICODESET_RETURN_IF_ERROR(ec);
-        c = chars.next(charsOptions(options), escaped, ec);
-        U_UNICODESET_RETURN_IF_ERROR(ec);
-        if (escaped || c != u']') {
-            U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", c, chars, ec);
+            const UnicodeSet *set = getMatcherSymbol(symbols, c);
+            if (set != nullptr) {
+                *this = *set;
+                this->_toPattern(rebuiltPat, /*escapeUnprintable=*/false);
+                return;
+            }
+            U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)", c, chars, ec);
         }
-        prettyPrintedPattern.append(u']');
     }
 
     /**
@@ -462,7 +481,7 @@ void UnicodeSet::parseUnion(const UnicodeString &pattern,
     //         | Terms Term
     UChar32 c = chars.next(charsOptions(options), escaped, ec);
     U_UNICODESET_RETURN_IF_ERROR(ec);
-    if (!escaped && c == u'-') {
+    if (!escaped && c == u'-' && getMatcherSymbol(symbols, c) == nullptr) {
         add(u'-');
         // When we otherwise preserve the syntax, we escape an initial UnescapedHyphenMinus, but not a
         // final one, for consistency with older ICU behaviour.
@@ -474,28 +493,30 @@ void UnicodeSet::parseUnion(const UnicodeString &pattern,
         chars.getPos(position);
         c = chars.next(charsOptions(options), escaped, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
-        if (!escaped && c == u'-') {
-            // We can be here on the first iteration: [--] is allowed by the
-            // grammar and by the old parser.
-            rebuiltPat.append(u'-');
-            add(u'-');
-            return;
-        } else if (!escaped && c == u'$') {
-            RuleCharacterIterator::Pos afterDollar;
-            chars.getPos(afterDollar);
-            c = chars.next(charsOptions(options), escaped, ec);
-            if (!escaped && c == u']') {
-                // ICU extensions: A $ is allowed as a literal-element.
-                // A Term at the end of a Union consisting of a single $ is an anchor.
-                rebuiltPat.append(u'$');
-                chars.setPos(afterDollar);
-                add(U_ETHER);
-                containsRestrictions = true;
+        if (getMatcherSymbol(symbols, c) == nullptr) {
+            if (!escaped && c == u'-') {
+                // We can be here on the first iteration: [--] is allowed by the
+                // grammar and by the old parser.
+                rebuiltPat.append(u'-');
+                add(u'-');
                 return;
+            } else if (!escaped && c == u'$') {
+                RuleCharacterIterator::Pos afterDollar;
+                chars.getPos(afterDollar);
+                c = chars.next(charsOptions(options), escaped, ec);
+                if (!escaped && c == u']') {
+                    // ICU extensions: A $ is allowed as a literal-element.
+                    // A Term at the end of a Union consisting of a single $ is an anchor.
+                    rebuiltPat.append(u'$');
+                    chars.setPos(afterDollar);
+                    add(U_ETHER);
+                    containsRestrictions = true;
+                    return;
+                }
             }
         }
         chars.setPos(position);
-        if (!escaped && c == ']') {
+        if (!escaped && c == ']' && getMatcherSymbol(symbols, c) == nullptr) {
             return;
         }
         parseTerm(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, containsRestrictions,
@@ -521,7 +542,8 @@ void UnicodeSet::parseTerm(const UnicodeString &pattern,
     //        | Restriction
     const UChar32 ahead = chars.next(charsOptions(options), escaped, ec);
     chars.setPos(termStart);
-    if (!escaped && ahead == '[' || resemblesPropertyPattern(chars, charsOptions(options))) {
+    if (getMatcherSymbol(symbols, ahead) != nullptr || !escaped && ahead == '[' ||
+        resemblesPropertyPattern(chars, charsOptions(options))) {
         containsRestriction = true;
         parseRestriction(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
@@ -557,6 +579,11 @@ void UnicodeSet::parseRestriction(const UnicodeString &pattern,
         chars.getPos(beforeOperator);
         const UChar32 op = chars.next(charsOptions(options), escaped, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
+        if (getMatcherSymbol(symbols, op)) {
+            // Not an operator, end of the Restriction.
+            chars.setPos(beforeOperator);
+            return;
+        }
         if (!escaped && op == u'&') {
             // Intersection ::= Restriction & UnicodeSet
             rebuiltPat.append(u'&');
@@ -650,7 +677,7 @@ void UnicodeSet::parseElements(const UnicodeString &pattern,
     chars.getPos(beforeOperator);
     const UChar32 op = chars.next(charsOptions(options), escaped, ec);
     U_UNICODESET_RETURN_IF_ERROR(ec);
-    if (escaped || op != u'-') {
+    if (escaped || op != u'-' || getMatcherSymbol(symbols, op) != nullptr) {
         // No operator,
         // Elements ::= Element
         chars.setPos(beforeOperator);
@@ -670,6 +697,9 @@ void UnicodeSet::parseElements(const UnicodeString &pattern,
     // Elements ::= Range ::= RangeElement - RangeElement
     rebuiltPat.append(u'-');
     const UChar32 last = ahead;
+    if (getMatcherSymbol(symbols, last) != nullptr) {
+        U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, chars, ec);
+    }
     if (!escaped) {
         switch (last) {
         case u'-':

From 4beef14bc60d21091c3c7e3d8be6efcc46535ce9 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Mon, 18 Aug 2025 13:59:00 +0200
Subject: [PATCH 19/56] indentation on the parse error tests

---
 icu4c/source/test/intltest/usettest.cpp | 80 ++++++++++++-------------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 5415940918ad..c5de484f9100 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -4404,39 +4404,39 @@ void UnicodeSetTest::TestToPatternOutput() {
 
 void UnicodeSetTest::TestParseErrors() {
     for (const auto expression : std::vector<std::u16string_view>{
-             // Java error message: "Char expected after operator".
-             u"[a-[b]]",
-             // "Missing '['".
-             u"a-z",
-             // "Trailing '&'".
-             u"[[a]&]",
-             // "'-' not after char or set".
-             u"[[a]&-[z]]",
-             u"[[a]--[z]]",
-             u"[{aa}-{zz}]",
-             // "'&' not after set".
-             u"[a&z]",
-             u"[{aa}&{zz}]",
-             // "'^' not after '['"
-             u"[a^z]",  // TODO(egg): Exclude from literal-element in PDUTS61.
-             // "Missing operand after operator".
-             u"[a-{zz}]",
-             u"[[a]-{zz}]",
-             u"[[a]&{zz}]",
-             // "Invalid multicharacter string".
-             u"[{aa]",
-             // "Unquoted '$'".
-             u"[a-$]",
-             u"[!-$]",
-             // "Invalid range".
-             u"[a-a]",  // TODO(egg): Exclude in PDUTS61.
-             u"[z-a]",
-             // "Set expected after operator".
-             u"[[a]-z]",
-             u"[[a]&z]",
-             // "Missing ']'".
-             u"[a-z",
-         }) {
+            // Java error message: "Char expected after operator".
+            u"[a-[b]]",
+            // "Missing '['".
+            u"a-z",
+            // "Trailing '&'".
+            u"[[a]&]",
+            // "'-' not after char or set".
+            u"[[a]&-[z]]",
+            u"[[a]--[z]]",
+            u"[{aa}-{zz}]",
+            // "'&' not after set".
+            u"[a&z]",
+            u"[{aa}&{zz}]",
+            // "'^' not after '['"
+            u"[a^z]",  // TODO(egg): Exclude from literal-element in PDUTS61.
+            // "Missing operand after operator".
+            u"[a-{zz}]",
+            u"[[a]-{zz}]",
+            u"[[a]&{zz}]",
+            // "Invalid multicharacter string".
+            u"[{aa]",
+            // "Unquoted '$'".
+            u"[a-$]",
+            u"[!-$]",
+            // "Invalid range".
+            u"[a-a]",  // TODO(egg): Exclude in PDUTS61.
+            u"[z-a]",
+            // "Set expected after operator".
+            u"[[a]-z]",
+            u"[[a]&z]",
+            // "Missing ']'".
+            u"[a-z",
+        }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);
         if (errorCode != U_MALFORMED_SET) {
@@ -4446,13 +4446,13 @@ void UnicodeSetTest::TestParseErrors() {
         }
     }
     for (const auto expression : std::vector<std::u16string_view>{
-             // Java error message: "Invalid property pattern".
-             u"[:]",
-             uR"(\p)"
-             u"[:^]",
-             uR"(\P)",
-             uR"(\N)",
-         }) {
+            // Java error message: "Invalid property pattern".
+            u"[:]",
+            uR"(\p)",
+            u"[:^]",
+            uR"(\P)",
+            uR"(\N)",
+        }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);
         if (errorCode != U_ILLEGAL_ARGUMENT_ERROR) {

From 18f2b7b7abdfc9c2f5f129ba0a0508298b03b4b5 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Mon, 11 Aug 2025 16:24:10 +0200
Subject: [PATCH 20/56] ICU-22851 Test the error paths in UnicodeSet parsing

---
 icu4c/source/test/intltest/usettest.cpp | 61 +++++++++++++++++++++++++
 icu4c/source/test/intltest/usettest.h   |  2 +
 2 files changed, 63 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 016d3f85e63d..76ab11424110 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
     TESTCASE_AUTO(TestRangeIterator);
     TESTCASE_AUTO(TestStringIterator);
     TESTCASE_AUTO(TestElementIterator);
+    TESTCASE_AUTO(TestParseErrors);
     TESTCASE_AUTO_END;
 }
 
@@ -4334,3 +4335,63 @@ void UnicodeSetTest::TestElementIterator() {
     // begin() & end() return USetElementIterator for which explicit APIs are tested via USet
     // in a header-only unit test file.
 }
+
+void UnicodeSetTest::TestParseErrors() {
+    for (const auto expression : std::vector<std::u16string_view>{
+            // Java error message: "Char expected after operator".
+            u"[a-[b]]",
+            // "Missing '['".
+            u"a-z",
+            // "Trailing '&'".
+            u"[[a]&]",
+            // "'-' not after char or set".
+            u"[[a]&-[z]]",
+            u"[[a]--[z]]",
+            u"[{aa}-{zz}]",
+            // "'&' not after set".
+            u"[a&z]",
+            u"[{aa}&{zz}]",
+            // "'^' not after '['"
+            u"[a^z]",  // TODO(egg): Exclude from literal-element in PDUTS61.
+            // "Missing operand after operator".
+            u"[a-{zz}]",
+            u"[[a]-{zz}]",
+            u"[[a]&{zz}]",
+            // "Invalid multicharacter string".
+            u"[{aa]",
+            // "Unquoted '$'".
+            u"[a-$]",
+            // "Invalid range".
+            u"[a-a]",  // TODO(egg): Exclude in PDUTS61.
+            u"[z-a]",
+            // "Set expected after operator".
+            u"[[a]-z]",
+            u"[[a]&z]",
+            // "Missing ']'".
+            u"[a-z",
+        }) {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, errorCode);
+        if (errorCode != U_MALFORMED_SET) {
+            UnicodeString s;
+            errln(expression + u": Expected U_MALFORMED_SET, got " + u_errorName(errorCode) +
+                  ", set is " + UnicodeSet(set).complement().complement().toPattern(s));
+        }
+    }
+    for (const auto expression : std::vector<std::u16string_view>{
+            // Java error message: "Invalid property pattern".
+            u"[:]",
+            uR"(\p)",
+            u"[:^]",
+            uR"(\P)",
+            uR"(\N)",
+        }) {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, errorCode);
+        if (errorCode != U_ILLEGAL_ARGUMENT_ERROR) {
+            UnicodeString s;
+            errln(expression + u": Expected U_ILLEGAL_ARGUMENT_ERROR, got " + u_errorName(errorCode) +
+                  ", set is " + UnicodeSet(set).complement().complement().toPattern(s));
+        }
+    }
+}
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h
index 2ac22ba72e62..4c5b55a329bb 100644
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@@ -110,6 +110,8 @@ class UnicodeSetTest: public IntlTest {
     void TestStringIterator();
     void TestElementIterator();
 
+    void TestParseErrors();
+
 private:
 
     UBool toPatternAux(UChar32 start, UChar32 end);

From 03f792b2c5a619d6e76b3975ee6551c475392ba7 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 13 Aug 2025 21:03:33 +0200
Subject: [PATCH 21/56] ICU-22851 Test the exact behaviour of
 UnicodeSet::toPattern

---
 icu4c/source/test/intltest/usettest.cpp | 55 +++++++++++++++++++++++++
 icu4c/source/test/intltest/usettest.h   |  1 +
 2 files changed, 56 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 76ab11424110..3b1a6012915e 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
     TESTCASE_AUTO(TestRangeIterator);
     TESTCASE_AUTO(TestStringIterator);
     TESTCASE_AUTO(TestElementIterator);
+    TESTCASE_AUTO(TestToPatternOutput);
     TESTCASE_AUTO(TestParseErrors);
     TESTCASE_AUTO_END;
 }
@@ -4336,6 +4337,60 @@ void UnicodeSetTest::TestElementIterator() {
     // in a header-only unit test file.
 }
 
+void UnicodeSetTest::TestToPatternOutput() {
+    for (const auto &[expression, expected] :
+         std::vector<std::pair<std::u16string_view, std::u16string_view>>{
+             // For a UnicodeSet which is not a property-query nor a named-element and without any
+             // Restriction among its Terms (that is, whose Union consists solely a sequence of Elements
+             // UnescapedHyphenMinus), toPattern merges and sorts ranges, and introduces a complement to
+             // minimize the result.
+             {u"[c-za-b]", u"[a-z]"},
+             {u"[  c - z  a - b  ]", u"[a-z]"},
+             {uR"([ ^ \u0000-b d-\U0010FFFF ])", u"[c]"},
+             {uR"([ \u0000-b d-\U0010FFFF ])", u"[^c]"},
+             {u"[ - - ]", uR"([\-])"},
+             {u"[ - _ - ]", uR"([\-_])"},
+             {u"[ - + - ]", uR"([+\-])"},
+             {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"},
+             {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"},
+             // A property-query or named-element is kept as-is:
+             {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"},
+             {uR"(\p{P})", uR"(\p{P})"},
+             {uR"(\p{gc=P})", uR"(\p{gc=P})"},
+             {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"},
+             {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"},
+             {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"},
+             {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"},
+             // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are
+             // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped.
+             // This is applied recursively, so innermost ranges-only UnicodeSets get normalized.
+             {u"[ c-z a-b [c-f g-z] ]", u"[c-za-b[c-z]]"},
+             {u"[- + c-z a-b [c-f g-z] -]", uR"([\-+c-za-b[c-z]-])"},
+             {uR"([ c-z a-b \p{ General_Category = Punctuation } ])",
+              uR"([c-za-b\p{ General_Category = Punctuation }])"},
+             {u"[^[c]]", uR"([^[c]])"},
+             {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
+             // Spaces are eliminated within a string-literal even when the syntax is preserved.
+             {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"},
+             // Escapes are removed even when the syntax is preserved.
+             {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
+              u"[{Zeichenkette}[]Zeichenmenge]"},
+             // A named-element is currently a nested set, so it is preserved and causes the syntax to be
+             // preserved.
+             {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"},
+         }) {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, errorCode);
+        UnicodeString actual;
+        if (U_FAILURE(errorCode)) {  // Every expression in the table is expected to parse.
+            errln(u"Failed to parse " + expression + u": " + u_errorName(errorCode));
+        } else if (set.toPattern(actual) != expected) {  // `actual` is reported in the message below.
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expected + ", got " +
+                  actual);
+        }
+    }
+}
+
 void UnicodeSetTest::TestParseErrors() {
     for (const auto expression : std::vector<std::u16string_view>{
             // Java error message: "Char expected after operator".
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h
index 4c5b55a329bb..692aa8b9e84d 100644
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@@ -110,6 +110,7 @@ class UnicodeSetTest: public IntlTest {
     void TestStringIterator();
     void TestElementIterator();
 
+    void TestToPatternOutput();
     void TestParseErrors();
 
 private:

From 8cc53b95074a4a806961c7b12849a30c7a97fac5 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 14 Aug 2025 15:51:47 +0200
Subject: [PATCH 22/56] ICU-22851 Test various edge cases with $ in the absence
 of variables

---
 icu4c/source/test/intltest/usettest.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 3b1a6012915e..c5de484f9100 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -4353,6 +4353,12 @@ void UnicodeSetTest::TestToPatternOutput() {
              {u"[ - + - ]", uR"([+\-])"},
              {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"},
              {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"},
+             {u"[$d-za-c]", uR"([\$a-z])"},
+             {u"[a-c$d-z]", uR"([\$a-z])"},
+             {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"},
+             {u"[!-$z]", uR"([!-\$z])"},
+             {u"[-a-cd-z$-]", uR"([\$\-a-z])"},
+             {u"[-$-]", uR"([\$\-])"},
              // A property-query or named-element is kept as-is:
              {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"},
              {uR"(\p{P})", uR"(\p{P})"},
@@ -4370,6 +4376,7 @@ void UnicodeSetTest::TestToPatternOutput() {
               uR"([c-za-b\p{ General_Category = Punctuation }])"},
              {u"[^[c]]", uR"([^[c]])"},
              {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
+             {u"[$[]]", uR"([\$[]])"},
              // Spaces are eliminated within a string-literal even when the syntax is preserved.
              {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"},
              // Escapes are removed even when the syntax is preserved.
@@ -4378,6 +4385,10 @@ void UnicodeSetTest::TestToPatternOutput() {
              // A named-element is currently a nested set, so it is preserved and causes the syntax to be
              // preserved.
              {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"},
+             // An anchor also causes the syntax to be preserved.
+             {u"[ d-z a-c $ ]", u"[d-za-c$]"},
+             {u"[ - a-c d-z $ ]", uR"([\-a-cd-z$])"},
+             {u"[$$$]", uR"([\$\$$])"},
          }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);
@@ -4416,6 +4427,7 @@ void UnicodeSetTest::TestParseErrors() {
             u"[{aa]",
             // "Unquoted '$'".
             u"[a-$]",
+            u"[!-$]",
             // "Invalid range".
             u"[a-a]",  // TODO(egg): Exclude in PDUTS61.
             u"[z-a]",

From 65fe08e56cb567da39ec2574f874001c1c80e644 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Mon, 18 Aug 2025 14:05:48 +0200
Subject: [PATCH 23/56] dedent the pattern output test

---
 icu4c/source/test/intltest/usettest.cpp | 102 ++++++++++++------------
 1 file changed, 51 insertions(+), 51 deletions(-)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index c5de484f9100..89a15dcc489d 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -4339,57 +4339,57 @@ void UnicodeSetTest::TestElementIterator() {
 
 void UnicodeSetTest::TestToPatternOutput() {
     for (const auto &[expression, expected] :
-         std::vector<std::pair<std::u16string_view, std::u16string_view>>{
-             // For a UnicodeSet which is not a property-query nor a named-element and without any
-             // Restriction among its Terms (that is, whose Union consists solely a sequence of Elements
-             // UnescapedHyphenMinus), toPattern merges and sorts ranges, and introduces a complement to
-             // minimize the result.
-             {u"[c-za-b]", u"[a-z]"},
-             {u"[  c - z  a - b  ]", u"[a-z]"},
-             {uR"([ ^ \u0000-b d-\U0010FFFF ])", u"[c]"},
-             {uR"([ \u0000-b d-\U0010FFFF ])", u"[^c]"},
-             {u"[ - - ]", uR"([\-])"},
-             {u"[ - _ - ]", uR"([\-_])"},
-             {u"[ - + - ]", uR"([+\-])"},
-             {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"},
-             {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"},
-             {u"[$d-za-c]", uR"([\$a-z])"},
-             {u"[a-c$d-z]", uR"([\$a-z])"},
-             {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"},
-             {u"[!-$z]", uR"([!-\$z])"},
-             {u"[-a-cd-z$-]", uR"([\$\-a-z])"},
-             {u"[-$-]", uR"([\$\-])"},
-             // A property-query or named-element is kept as-is:
-             {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"},
-             {uR"(\p{P})", uR"(\p{P})"},
-             {uR"(\p{gc=P})", uR"(\p{gc=P})"},
-             {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"},
-             {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"},
-             {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"},
-             {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"},
-             // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are
-             // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped.
-             // This is applied recursively, so innermost ranges-only UnicodeSets get normalized.
-             {u"[ c-z a-b [c-f g-z] ]", u"[c-za-b[c-z]]"},
-             {u"[- + c-z a-b [c-f g-z] -]", uR"([\-+c-za-b[c-z]-])"},
-             {uR"([ c-z a-b \p{ General_Category = Punctuation } ])",
-              uR"([c-za-b\p{ General_Category = Punctuation }])"},
-             {u"[^[c]]", uR"([^[c]])"},
-             {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
-             {u"[$[]]", uR"([\$[]])"},
-             // Spaces are eliminated within a string-literal even when the syntax is preserved.
-             {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"},
-             // Escapes are removed even when the syntax is preserved.
-             {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
-              u"[{Zeichenkette}[]Zeichenmenge]"},
-             // A named-element is currently a nested set, so it is preserved and causes the syntax to be
-             // preserved.
-             {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"},
-             // An anchor also causes the syntax to be preserved.
-             {u"[ d-z a-c $ ]", u"[d-za-c$]"},
-             {u"[ - a-c d-z $ ]", uR"([\-a-cd-z$])"},
-             {u"[$$$]", uR"([\$\$$])"},
-         }) {
+        std::vector<std::pair<std::u16string_view, std::u16string_view>>{
+            // For a UnicodeSet which is not a property-query nor a named-element and without any
+            // Restriction among its Terms (that is, whose Union consists solely a sequence of Elements
+            // UnescapedHyphenMinus), toPattern merges and sorts ranges, and introduces a complement to
+            // minimize the result.
+            {u"[c-za-b]", u"[a-z]"},
+            {u"[  c - z  a - b  ]", u"[a-z]"},
+            {uR"([ ^ \u0000-b d-\U0010FFFF ])", u"[c]"},
+            {uR"([ \u0000-b d-\U0010FFFF ])", u"[^c]"},
+            {u"[ - - ]", uR"([\-])"},
+            {u"[ - _ - ]", uR"([\-_])"},
+            {u"[ - + - ]", uR"([+\-])"},
+            {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"},
+            {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"},
+            {u"[$d-za-c]", uR"([\$a-z])"},
+            {u"[a-c$d-z]", uR"([\$a-z])"},
+            {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"},
+            {u"[!-$z]", uR"([!-\$z])"},
+            {u"[-a-cd-z$-]", uR"([\$\-a-z])"},
+            {u"[-$-]", uR"([\$\-])"},
+            // A property-query or named-element is kept as-is:
+            {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"},
+            {uR"(\p{P})", uR"(\p{P})"},
+            {uR"(\p{gc=P})", uR"(\p{gc=P})"},
+            {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"},
+            {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"},
+            {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"},
+            {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"},
+            // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are
+            // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped.
+            // This is applied recursively, so innermost ranges-only UnicodeSets get normalized.
+            {u"[ c-z a-b [c-f g-z] ]", u"[c-za-b[c-z]]"},
+            {u"[- + c-z a-b [c-f g-z] -]", uR"([\-+c-za-b[c-z]-])"},
+            {uR"([ c-z a-b \p{ General_Category = Punctuation } ])",
+            uR"([c-za-b\p{ General_Category = Punctuation }])"},
+            {u"[^[c]]", uR"([^[c]])"},
+            {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
+            {u"[$[]]", uR"([\$[]])"},
+            // Spaces are eliminated within a string-literal even when the syntax is preserved.
+            {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"},
+            // Escapes are removed even when the syntax is preserved.
+            {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
+            u"[{Zeichenkette}[]Zeichenmenge]"},
+            // A named-element is currently a nested set, so it is preserved and causes the syntax to be
+            // preserved.
+            {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"},
+            // An anchor also causes the syntax to be preserved.
+            {u"[ d-z a-c $ ]", u"[d-za-c$]"},
+            {u"[ - a-c d-z $ ]", uR"([\-a-cd-z$])"},
+            {u"[$$$]", uR"([\$\$$])"},
+        }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);
         UnicodeString actual;

From b478400a6d23cac79da5e39e5a20279cf797058e Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 13 Aug 2025 21:03:33 +0200
Subject: [PATCH 24/56] ICU-22851 Test the exact behaviour of
 UnicodeSet::toPattern

---
 icu4c/source/test/intltest/usettest.cpp | 55 +++++++++++++++++++++++++
 icu4c/source/test/intltest/usettest.h   |  1 +
 2 files changed, 56 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 76ab11424110..9e0e66fac3b1 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
     TESTCASE_AUTO(TestRangeIterator);
     TESTCASE_AUTO(TestStringIterator);
     TESTCASE_AUTO(TestElementIterator);
+    TESTCASE_AUTO(TestToPatternOutput);
     TESTCASE_AUTO(TestParseErrors);
     TESTCASE_AUTO_END;
 }
@@ -4336,6 +4337,60 @@ void UnicodeSetTest::TestElementIterator() {
     // in a header-only unit test file.
 }
 
+void UnicodeSetTest::TestToPatternOutput() {
+    for (const auto &[expression, expected] :
+        std::vector<std::pair<std::u16string_view, std::u16string_view>>{
+            // For a UnicodeSet which is neither a property-query nor a named-element and has no
+            // Restriction among its Terms (that is, whose Union consists solely of a sequence of
+            // Elements and UnescapedHyphenMinus), toPattern merges and sorts ranges, and introduces a
+            // complement to minimize the result.
+            {u"[c-za-b]", u"[a-z]"},
+            {u"[  c - z  a - b  ]", u"[a-z]"},
+            {uR"([ ^ \u0000-b d-\U0010FFFF ])", u"[c]"},
+            {uR"([ \u0000-b d-\U0010FFFF ])", u"[^c]"},
+            {u"[ - - ]", uR"([\-])"},
+            {u"[ - _ - ]", uR"([\-_])"},
+            {u"[ - + - ]", uR"([+\-])"},
+            {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"},
+            {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"},
+            // A property-query or named-element is kept as-is:
+            {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"},
+            {uR"(\p{P})", uR"(\p{P})"},
+            {uR"(\p{gc=P})", uR"(\p{gc=P})"},
+            {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"},
+            {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"},
+            {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"},
+            {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"},
+            // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are
+            // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped.
+            // This is applied recursively, so innermost ranges-only UnicodeSets get normalized.
+            {u"[ c-z a-b [c-f g-z] ]", u"[c-za-b[c-z]]"},
+            {u"[- + c-z a-b [c-f g-z] -]", uR"([\-+c-za-b[c-z]-])"},
+            {uR"([ c-z a-b \p{ General_Category = Punctuation } ])",
+            uR"([c-za-b\p{ General_Category = Punctuation }])"},
+            {u"[^[c]]", uR"([^[c]])"},
+            {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
+            // Spaces are eliminated within a string-literal even when the syntax is preserved.
+            {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"},
+            // Escapes are removed even when the syntax is preserved.
+            {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
+            u"[{Zeichenkette}[]Zeichenmenge]"},
+            // A named-element is currently a nested set, so it is preserved and causes the syntax to be
+            // preserved.
+            {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"},
+        }) {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, errorCode);
+        UnicodeString actual;
+        if (U_FAILURE(errorCode)) {  // Every expression in the table is expected to parse.
+            errln(u"Failed to parse " + expression + u": " + u_errorName(errorCode));
+        } else if (set.toPattern(actual) != expected) {  // `actual` is reported in the message below.
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expected + ", got " +
+                  actual);
+        }
+    }
+}
+
 void UnicodeSetTest::TestParseErrors() {
     for (const auto expression : std::vector<std::u16string_view>{
             // Java error message: "Char expected after operator".
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h
index 4c5b55a329bb..692aa8b9e84d 100644
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@@ -110,6 +110,7 @@ class UnicodeSetTest: public IntlTest {
     void TestStringIterator();
     void TestElementIterator();
 
+    void TestToPatternOutput();
     void TestParseErrors();
 
 private:

From ae81d41bf8d775a37fcb0390e22372ef76815829 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 14 Aug 2025 15:51:47 +0200
Subject: [PATCH 25/56] ICU-22851 Test various edge cases with $ in the absence
 of variables

---
 icu4c/source/test/intltest/usettest.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 9e0e66fac3b1..89a15dcc489d 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -4353,6 +4353,12 @@ void UnicodeSetTest::TestToPatternOutput() {
             {u"[ - + - ]", uR"([+\-])"},
             {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"},
             {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"},
+            {u"[$d-za-c]", uR"([\$a-z])"},
+            {u"[a-c$d-z]", uR"([\$a-z])"},
+            {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"},
+            {u"[!-$z]", uR"([!-\$z])"},
+            {u"[-a-cd-z$-]", uR"([\$\-a-z])"},
+            {u"[-$-]", uR"([\$\-])"},
             // A property-query or named-element is kept as-is:
             {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"},
             {uR"(\p{P})", uR"(\p{P})"},
@@ -4370,6 +4376,7 @@ void UnicodeSetTest::TestToPatternOutput() {
             uR"([c-za-b\p{ General_Category = Punctuation }])"},
             {u"[^[c]]", uR"([^[c]])"},
             {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
+            {u"[$[]]", uR"([\$[]])"},
             // Spaces are eliminated within a string-literal even when the syntax is preserved.
             {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"},
             // Escapes are removed even when the syntax is preserved.
@@ -4378,6 +4385,10 @@ void UnicodeSetTest::TestToPatternOutput() {
             // A named-element is currently a nested set, so it is preserved and causes the syntax to be
             // preserved.
             {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"},
+            // An anchor also causes the syntax to be preserved.
+            {u"[ d-z a-c $ ]", u"[d-za-c$]"},
+            {u"[ - a-c d-z $ ]", uR"([\-a-cd-z$])"},
+            {u"[$$$]", uR"([\$\$$])"},
         }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);
@@ -4416,6 +4427,7 @@ void UnicodeSetTest::TestParseErrors() {
             u"[{aa]",
             // "Unquoted '$'".
             u"[a-$]",
+            u"[!-$]",
             // "Invalid range".
             u"[a-a]",  // TODO(egg): Exclude in PDUTS61.
             u"[z-a]",

From 8eec9710279fdf276abe00ff20c23f758031e7a0 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Mon, 11 Aug 2025 16:24:10 +0200
Subject: [PATCH 26/56] ICU-23179 Test the error paths in UnicodeSet parsing

---
 icu4c/source/test/intltest/usettest.cpp | 61 +++++++++++++++++++++++++
 icu4c/source/test/intltest/usettest.h   |  2 +
 2 files changed, 63 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 016d3f85e63d..76ab11424110 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
     TESTCASE_AUTO(TestRangeIterator);
     TESTCASE_AUTO(TestStringIterator);
     TESTCASE_AUTO(TestElementIterator);
+    TESTCASE_AUTO(TestParseErrors);
     TESTCASE_AUTO_END;
 }
 
@@ -4334,3 +4335,63 @@ void UnicodeSetTest::TestElementIterator() {
     // begin() & end() return USetElementIterator for which explicit APIs are tested via USet
     // in a header-only unit test file.
 }
+
+void UnicodeSetTest::TestParseErrors() {
+    for (const auto expression : std::vector<std::u16string_view>{
+            // Java error message: "Char expected after operator".
+            u"[a-[b]]",
+            // "Missing '['".
+            u"a-z",
+            // "Trailing '&'".
+            u"[[a]&]",
+            // "'-' not after char or set".
+            u"[[a]&-[z]]",
+            u"[[a]--[z]]",
+            u"[{aa}-{zz}]",
+            // "'&' not after set".
+            u"[a&z]",
+            u"[{aa}&{zz}]",
+            // "'^' not after '['"
+            u"[a^z]",  // TODO(egg): Exclude from literal-element in PDUTS61.
+            // "Missing operand after operator".
+            u"[a-{zz}]",
+            u"[[a]-{zz}]",
+            u"[[a]&{zz}]",
+            // "Invalid multicharacter string".
+            u"[{aa]",
+            // "Unquoted '$'".
+            u"[a-$]",
+            // "Invalid range".
+            u"[a-a]",  // TODO(egg): Exclude in PDUTS61.
+            u"[z-a]",
+            // "Set expected after operator".
+            u"[[a]-z]",
+            u"[[a]&z]",
+            // "Missing ']'".
+            u"[a-z",
+        }) {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, errorCode);
+        if (errorCode != U_MALFORMED_SET) {
+            UnicodeString s;
+            errln(expression + u": Expected U_MALFORMED_SET, got " + u_errorName(errorCode) +
+                  ", set is " + UnicodeSet(set).complement().complement().toPattern(s));
+        }
+    }
+    for (const auto expression : std::vector<std::u16string_view>{
+            // Java error message: "Invalid property pattern".
+            u"[:]",
+            uR"(\p)",
+            u"[:^]",
+            uR"(\P)",
+            uR"(\N)",
+        }) {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, errorCode);
+        if (errorCode != U_ILLEGAL_ARGUMENT_ERROR) {
+            UnicodeString s;
+            errln(expression + u": Expected U_ILLEGAL_ARGUMENT_ERROR, got " + u_errorName(errorCode) +
+                  ", set is " + UnicodeSet(set).complement().complement().toPattern(s));
+        }
+    }
+}
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h
index 2ac22ba72e62..4c5b55a329bb 100644
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@@ -110,6 +110,8 @@ class UnicodeSetTest: public IntlTest {
     void TestStringIterator();
     void TestElementIterator();
 
+    void TestParseErrors();
+
 private:
 
     UBool toPatternAux(UChar32 start, UChar32 end);

From dabce0b5dc10dcb05a8850869ef06659e478e386 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 13 Aug 2025 21:03:33 +0200
Subject: [PATCH 27/56] ICU-23179 Test the exact behaviour of
 UnicodeSet::toPattern

---
 icu4c/source/test/intltest/usettest.cpp | 55 +++++++++++++++++++++++++
 icu4c/source/test/intltest/usettest.h   |  1 +
 2 files changed, 56 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 76ab11424110..9e0e66fac3b1 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
     TESTCASE_AUTO(TestRangeIterator);
     TESTCASE_AUTO(TestStringIterator);
     TESTCASE_AUTO(TestElementIterator);
+    TESTCASE_AUTO(TestToPatternOutput);
     TESTCASE_AUTO(TestParseErrors);
     TESTCASE_AUTO_END;
 }
@@ -4336,6 +4337,60 @@ void UnicodeSetTest::TestElementIterator() {
     // in a header-only unit test file.
 }
 
+void UnicodeSetTest::TestToPatternOutput() {
+    for (const auto &[expression, expected] :
+        std::vector<std::pair<std::u16string_view, std::u16string_view>>{
+            // For a UnicodeSet which is not a property-query nor a named-element and without any
+            // Restriction among its Terms (that is, whose Union consists solely of a sequence of Elements
+            // UnescapedHyphenMinus), toPattern merges and sorts ranges, and introduces a complement to
+            // minimize the result.
+            {u"[c-za-b]", u"[a-z]"},
+            {u"[  c - z  a - b  ]", u"[a-z]"},
+            {uR"([ ^ \u0000-b d-\U0010FFFF ])", u"[c]"},
+            {uR"([ \u0000-b d-\U0010FFFF ])", u"[^c]"},
+            {u"[ - - ]", uR"([\-])"},
+            {u"[ - _ - ]", uR"([\-_])"},
+            {u"[ - + - ]", uR"([+\-])"},
+            {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"},
+            {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"},
+            // A property-query or named-element is kept as-is:
+            {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"},
+            {uR"(\p{P})", uR"(\p{P})"},
+            {uR"(\p{gc=P})", uR"(\p{gc=P})"},
+            {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"},
+            {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"},
+            {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"},
+            {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"},
+            // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are
+            // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped.
+            // This is applied recursively, so innermost ranges-only UnicodeSets get normalized.
+            {u"[ c-z a-b [c-f g-z] ]", u"[c-za-b[c-z]]"},
+            {u"[- + c-z a-b [c-f g-z] -]", uR"([\-+c-za-b[c-z]-])"},
+            {uR"([ c-z a-b \p{ General_Category = Punctuation } ])",
+            uR"([c-za-b\p{ General_Category = Punctuation }])"},
+            {u"[^[c]]", uR"([^[c]])"},
+            {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
+            // Spaces are eliminated within a string-literal even when the syntax is preserved.
+            {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"},
+            // Escapes are removed even when the syntax is preserved.
+            {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
+            u"[{Zeichenkette}[]Zeichenmenge]"},
+            // A named-element is currently a nested set, so it is preserved and causes the syntax to be
+            // preserved.
+            {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"},
+        }) {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, errorCode);
+        UnicodeString actual;
+        if (U_FAILURE(errorCode)) {
+            errln(u"Failed to parse " + expression + u": " + u_errorName(errorCode));
+        } else if (set.toPattern(actual) != expected) {
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expected + ", got " +
+                  actual);
+        }
+    }
+}
+
 void UnicodeSetTest::TestParseErrors() {
     for (const auto expression : std::vector<std::u16string_view>{
             // Java error message: "Char expected after operator".
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h
index 4c5b55a329bb..692aa8b9e84d 100644
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@@ -110,6 +110,7 @@ class UnicodeSetTest: public IntlTest {
     void TestStringIterator();
     void TestElementIterator();
 
+    void TestToPatternOutput();
     void TestParseErrors();
 
 private:

From 6bd042524cf39ec264b6b1eca5e6e26ef73e0cba Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 14 Aug 2025 15:51:47 +0200
Subject: [PATCH 28/56] ICU-23179 Test various edge cases with $ in the absence
 of variables

---
 icu4c/source/test/intltest/usettest.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 9e0e66fac3b1..89a15dcc489d 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -4353,6 +4353,12 @@ void UnicodeSetTest::TestToPatternOutput() {
             {u"[ - + - ]", uR"([+\-])"},
             {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"},
             {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"},
+            {u"[$d-za-c]", uR"([\$a-z])"},
+            {u"[a-c$d-z]", uR"([\$a-z])"},
+            {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"},
+            {u"[!-$z]", uR"([!-\$z])"},
+            {u"[-a-cd-z$-]", uR"([\$\-a-z])"},
+            {u"[-$-]", uR"([\$\-])"},
             // A property-query or named-element is kept as-is:
             {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"},
             {uR"(\p{P})", uR"(\p{P})"},
@@ -4370,6 +4376,7 @@ void UnicodeSetTest::TestToPatternOutput() {
             uR"([c-za-b\p{ General_Category = Punctuation }])"},
             {u"[^[c]]", uR"([^[c]])"},
             {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
+            {u"[$[]]", uR"([\$[]])"},
             // Spaces are eliminated within a string-literal even when the syntax is preserved.
             {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"},
             // Escapes are removed even when the syntax is preserved.
@@ -4378,6 +4385,10 @@ void UnicodeSetTest::TestToPatternOutput() {
             // A named-element is currently a nested set, so it is preserved and causes the syntax to be
             // preserved.
             {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"},
+            // An anchor also causes the syntax to be preserved.
+            {u"[ d-z a-c $ ]", u"[d-za-c$]"},
+            {u"[ - a-c d-z $ ]", uR"([\-a-cd-z$])"},
+            {u"[$$$]", uR"([\$\$$])"},
         }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);
@@ -4416,6 +4427,7 @@ void UnicodeSetTest::TestParseErrors() {
             u"[{aa]",
             // "Unquoted '$'".
             u"[a-$]",
+            u"[!-$]",
             // "Invalid range".
             u"[a-a]",  // TODO(egg): Exclude in PDUTS61.
             u"[z-a]",

From d6fc731e0ef8bce057cf77e30011fa3640c69be9 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Mon, 18 Aug 2025 14:21:59 +0200
Subject: [PATCH 29/56] meow

---
 icu4c/source/test/intltest/usettest.cpp | 135 +++++++++++++-----------
 1 file changed, 74 insertions(+), 61 deletions(-)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 06de9e315aac..4d52c95d0e4a 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -1823,27 +1823,35 @@ void UnicodeSetTest::TestSymbolTable() {
             logln(UnicodeString("Ok, got ") + us.toPattern(a, true));
         }
     }
-    for (const auto &[variables, expression, expectedErrorCode, expectedPattern] :
-         std::vector<std::tuple<std::vector<std::pair<std::u16string_view, std::u16string_view>>,
-                                std::u16string_view, UErrorCode, std::u16string_view>>{
-             // You should not do this, but it works.
-             {{{u"privateUseOrUnassigned", u"[[:Co:][:Cn:]"}, {u"close", u"]"}},
-              u"$privateUseOrUnassigned$close",
-              U_ZERO_ERROR,
-              u"[[:Co:][:Cn:]]"},
-             // This works and it is fine.
-             {{{u"privateUse", u"[[:Co:]]"}}, u"$privateUse", U_ZERO_ERROR, u"[[:Co:]]"},
-             // This should work! But it does not. Note the doubled brackets on the one that works above.
-             // We are not yet inside the variable when we call lookahead(), so we try to parse
-             // $privateUse rather than [:Co:].
-             {{{u"privateUse", u"[:Co:]"}}, u"[$privateUse]", U_ILLEGAL_ARGUMENT_ERROR, u"[]"},
-             // This should not work, and it does not (we try to parse [$sad$surprised] as a
-             // property-query).
-             {{{u"sad", u":C"}, {u"surprised", u"o:"}},
-              u"[$sad$surprised]",
-              U_ILLEGAL_ARGUMENT_ERROR,
-              u"[]"},
-         }) {
+    struct TestCase {
+        struct Variable {
+            std::u16string_view name;
+            std::u16string_view value;
+        };
+        std::vector<Variable> variables;
+        std::u16string_view expression;
+        UErrorCode expectedErrorCode;
+        std::u16string_view expectedPattern;
+    };
+    for (const auto &[variables, expression, expectedErrorCode, expectedPattern] : std::vector<TestCase>{
+            // You should not do this, but it works.
+            {{{u"privateUseOrUnassigned", u"[[:Co:][:Cn:]"}, {u"close", u"]"}},
+            u"$privateUseOrUnassigned$close",
+            U_ZERO_ERROR,
+            u"[[:Co:][:Cn:]]"},
+            // This works and it is fine.
+            {{{u"privateUse", u"[[:Co:]]"}}, u"$privateUse", U_ZERO_ERROR, u"[[:Co:]]"},
+            // This should work! But it does not. Note the doubled brackets on the one that works above.
+            // We are not yet inside the variable when we call lookahead(), so we try to parse
+            // $privateUse rather than [:Co:].
+            {{{u"privateUse", u"[:Co:]"}}, u"[$privateUse]", U_ILLEGAL_ARGUMENT_ERROR, u"[]"},
+            // This should not work, and it does not (we try to parse [$sad$surprised] as a
+            // property-query).
+            {{{u"sad", u":C"}, {u"surprised", u"o:"}},
+            u"[$sad$surprised]",
+            U_ILLEGAL_ARGUMENT_ERROR,
+            u"[]"},
+        }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         TokenSymbolTable symbols(errorCode);
         if (U_FAILURE(errorCode)) {
@@ -1899,20 +1907,26 @@ void UnicodeSetTest::TestLookupSymbolTable() {
     symbols.add(u'0', UnicodeSet(u"[ a-z ]", errorCode));
     symbols.add(u'1', UnicodeSet(u"[ b-c ]", errorCode));
     symbols.add(u'2', UnicodeSet(u"[: Co :]", errorCode));
+    struct TestCase {
+        std::u16string_view expression;
+        UErrorCode expectedErrorCode;
+        std::u16string_view expectedPattern;
+        std::u16string_view expectedRegeneratedPattern;
+    };
     for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
-         std::vector<std::tuple<std::u16string_view, UErrorCode, std::u16string_view, std::u16string_view>>{
-             {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"},
-             {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]"},
-             {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]"},
-             // Substitution of lookupMatcher symbols takes place after de-escaping.
-             {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]"},
-             // It does not take place in string literals.
-             {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]"},
-             {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]"},
-             {uR"([ 21 ])", U_ZERO_ERROR, u"[[: Co :][bc]]",
-              u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"},
-             {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]"},
-         }) {
+        std::vector<TestCase>{
+            {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"},
+            {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]"},
+            {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]"},
+            // Substitution of lookupMatcher symbols takes place after unescaping.
+            {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]"},
+            // It does not take place in string literals.
+            {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]"},
+            {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]"},
+            {uR"([ 21 ])", U_ZERO_ERROR, u"[[: Co :][bc]]",
+            u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"},
+            {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]"},
+        }) {
         UnicodeString actual;
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
@@ -1942,33 +1956,32 @@ void UnicodeSetTest::TestLookupSymbolTable() {
     symbols.add(u'}', UnicodeSet(u"[{rightCurlyBracket}]", errorCode));
     symbols.add(u'$', UnicodeSet(u"[{dollarSign}]", errorCode));
     for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
-         std::vector<
-             std::tuple<std::u16string_view, UErrorCode, std::u16string_view, std::u16string_view>>{
-             {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"},
-             {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"},
-             // The hyphen no longer works as set difference.
-             {u"[0-1]", U_ZERO_ERROR, u"[[a-z][{hyphenMinus}][bc]]", u"[a-z{hyphenMinus}]"},
-             {u"[!-0]", U_ZERO_ERROR, u"[![{hyphenMinus}][a-z]]", u"[!a-z{hyphenMinus}]"},
-             // String literals no longer work.
-             {uR"([!-/{0}])", U_ZERO_ERROR,
-              u"[![{hyphenMinus}]/[{leftCurlyBracket}][a-z][{rightCurlyBracket}]]",
-              u"[!/a-z{hyphenMinus}{leftCurlyBracket}{rightCurlyBracket}]"},
-             // The ampersand no longer works as set difference.
-             {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :][{ampersand}][bc]]",
-              u"[bc-󰀀-󿿽􀀀-􏿽{ampersand}]"},
-              // Complementing still works.
-             {uR"([^ \u0000 ])", U_ZERO_ERROR, uR"([\u0001-\U0010FFFF])",
-              uR"([\u0001-\U0010FFFF])"},
-              // ^ elsewhere becomes a symbol rather than a syntax error.
-             {uR"([\u0000 ^ -])", U_ZERO_ERROR, uR"([\u0000[{circumflexAccent}][{hyphenMinus}]])",
-              uR"([\u0000{circumflexAccent}{hyphenMinus}])"},
-             // Opening brackets still work.
-             {uR"([^ [ [^] ] ])", U_ZERO_ERROR, uR"([^[[\u0000-\U0010FFFF]]])", uR"([])"},
-             // The only way to access the [ symbol is via escaping.
-             {uR"([ \[ ])", U_ZERO_ERROR, uR"([[{leftSquareBracket}]])", uR"([{leftSquareBracket}])"},
-             // Anchors are gone.
-             {uR"([$])", U_ZERO_ERROR, uR"([[{dollarSign}]])", uR"([{dollarSign}])"},
-         }) {
+        std::vector<TestCase>{
+            {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"},
+            {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"},
+            // The hyphen no longer works as set difference.
+            {u"[0-1]", U_ZERO_ERROR, u"[[a-z][{hyphenMinus}][bc]]", u"[a-z{hyphenMinus}]"},
+            {u"[!-0]", U_ZERO_ERROR, u"[![{hyphenMinus}][a-z]]", u"[!a-z{hyphenMinus}]"},
+            // String literals no longer work.
+            {uR"([!-/{0}])", U_ZERO_ERROR,
+            u"[![{hyphenMinus}]/[{leftCurlyBracket}][a-z][{rightCurlyBracket}]]",
+            u"[!/a-z{hyphenMinus}{leftCurlyBracket}{rightCurlyBracket}]"},
+            // The ampersand no longer works as set intersection.
+            {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :][{ampersand}][bc]]",
+            u"[bc-󰀀-󿿽􀀀-􏿽{ampersand}]"},
+            // Complementing still works.
+            {uR"([^ \u0000 ])", U_ZERO_ERROR, uR"([\u0001-\U0010FFFF])",
+            uR"([\u0001-\U0010FFFF])"},
+            // ^ elsewhere becomes a symbol rather than a syntax error.
+            {uR"([\u0000 ^ -])", U_ZERO_ERROR, uR"([\u0000[{circumflexAccent}][{hyphenMinus}]])",
+            uR"([\u0000{circumflexAccent}{hyphenMinus}])"},
+            // Opening brackets still work.
+            {uR"([^ [ [^] ] ])", U_ZERO_ERROR, uR"([^[[\u0000-\U0010FFFF]]])", uR"([])"},
+            // The only way to access the [ symbol is via escaping.
+            {uR"([ \[ ])", U_ZERO_ERROR, uR"([[{leftSquareBracket}]])", uR"([{leftSquareBracket}])"},
+            // Anchors are gone.
+            {uR"([$])", U_ZERO_ERROR, uR"([[{dollarSign}]])", uR"([{dollarSign}])"},
+        }) {
         UnicodeString actual;
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);

From 83bf69b486f0a629ea4d06845e3aab5555c400de Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Mon, 18 Aug 2025 14:23:32 +0200
Subject: [PATCH 30/56] ICU-23179 Test UnicodeSet with lookupMatcher

---
 icu4c/source/test/intltest/usettest.cpp | 216 +++++++++++++++++++++++-
 icu4c/source/test/intltest/usettest.h   |   1 +
 2 files changed, 216 insertions(+), 1 deletion(-)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 016d3f85e63d..4d52c95d0e4a 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -14,6 +14,7 @@
 #include <stdio.h>
 #include <string.h>
 
+#include <array>
 #include <string_view>
 #include <unordered_map>
 
@@ -93,6 +94,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
     TESTCASE_AUTO(TestEscapePattern);
     TESTCASE_AUTO(TestInvalidCodePoint);
     TESTCASE_AUTO(TestSymbolTable);
+    TESTCASE_AUTO(TestLookupSymbolTable);
     TESTCASE_AUTO(TestSurrogate);
     TESTCASE_AUTO(TestPosixClasses);
     TESTCASE_AUTO(TestIteration);
@@ -1753,10 +1755,20 @@ void UnicodeSetTest::TestSymbolTable() {
     // Multiple test cases can be set up here.  Each test case
     // is terminated by null:
     // var, value, var, value,..., input pat., exp. output pat., null
-    const char* DATA[] = {
+    const char *DATA[] = {
         "us", "a-z", "[0-1$us]", "[0-1a-z]", nullptr,
         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", nullptr,
         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", nullptr,
+        // Things that probably should not work, but currently do:
+        "open", "[", "$open a-z]", "[a-z]", nullptr,
+        "open", "[", "close", "]", "hyphenMinus", "-",
+            "[ $open a $hyphenMinus z] $hyphenMinus [ c-z $close $hyphenMinus ]",
+            "[[a-z]-[c-z]-]", nullptr,
+        "string", "{", "end", "}", "[ $string Zeichenkette $end ]", "[{Zeichenkette}]", nullptr,
+        "privateUse", "[[:Co:]]", "$privateUse", "[[:Co:]]", nullptr,
+        "smiling", ":-]", "laughing", ":-D",
+            "[ {$smiling} $laughing $smiling",
+            R"([\-\:-D{\:\-\]}])", nullptr,
         nullptr
     };
 
@@ -1811,6 +1823,208 @@ void UnicodeSetTest::TestSymbolTable() {
             logln(UnicodeString("Ok, got ") + us.toPattern(a, true));
         }
     }
+    struct TestCase {
+        struct Variable {
+            std::u16string_view name;
+            std::u16string_view value;
+        };
+        std::vector<Variable> variables;
+        std::u16string_view expression;
+        UErrorCode expectedErrorCode;
+        std::u16string_view expectedPattern;
+    };
+    for (const auto &[variables, expression, expectedErrorCode, expectedPattern] : std::vector<TestCase>{
+            // You should not do this, but it works.
+            {{{u"privateUseOrUnassigned", u"[[:Co:][:Cn:]"}, {u"close", u"]"}},
+            u"$privateUseOrUnassigned$close",
+            U_ZERO_ERROR,
+            u"[[:Co:][:Cn:]]"},
+            // This works and it is fine.
+            {{{u"privateUse", u"[[:Co:]]"}}, u"$privateUse", U_ZERO_ERROR, u"[[:Co:]]"},
+            // This should work! But it does not. Note the doubled brackets on the one that works above.
+            // We are not yet inside the variable when we call lookahead(), so we try to parse
+            // $privateUse rather than [:Co:].
+            {{{u"privateUse", u"[:Co:]"}}, u"[$privateUse]", U_ILLEGAL_ARGUMENT_ERROR, u"[]"},
+            // This should not work, and it does not (we try to parse [$sad$surprised] as a
+            // property-query).
+            {{{u"sad", u":C"}, {u"surprised", u"o:"}},
+            u"[$sad$surprised]",
+            U_ILLEGAL_ARGUMENT_ERROR,
+            u"[]"},
+        }) {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        TokenSymbolTable symbols(errorCode);
+        if (U_FAILURE(errorCode)) {
+            errln("FAIL: Couldn’t construct symbol table");
+            continue;
+        }
+        for (const auto &[name, value] : variables) {
+            symbols.add(name, value, errorCode);
+            if (U_FAILURE(errorCode)) {
+                errln("FAIL: Couldn’t add variable " + name);
+                continue;
+            }
+        }
+        const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
+        if (errorCode != expectedErrorCode) {
+            errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " +
+                  u_errorName(errorCode));
+        }
+        UnicodeString actual;
+        if (set.toPattern(actual) != expectedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern +
+                  ", got " + actual);
+        }
+    }
+}
+
+void UnicodeSetTest::TestLookupSymbolTable() {
+    UErrorCode errorCode = U_ZERO_ERROR;
+    class TestSymbolTable : public SymbolTable {
+      public:
+        const UnicodeString *lookup(const UnicodeString &) const override {
+            return nullptr;
+        }
+
+        const UnicodeFunctor *lookupMatcher(UChar32 c) const override {
+            return symbols_.find(c) != symbols_.end() ? &symbols_.at(c)
+                                                                    : nullptr;
+        }
+
+        virtual UnicodeString parseReference(const UnicodeString &, ParsePosition &,
+                                             int32_t) const override {
+            return u"";
+        }
+
+        void add(UChar32 c, UnicodeSet set) {
+            symbols_[c] = set;
+        }
+
+      private:
+        std::unordered_map<UChar32, UnicodeSet> symbols_;
+    };
+    TestSymbolTable symbols;
+    symbols.add(u'0', UnicodeSet(u"[ a-z ]", errorCode));
+    symbols.add(u'1', UnicodeSet(u"[ b-c ]", errorCode));
+    symbols.add(u'2', UnicodeSet(u"[: Co :]", errorCode));
+    struct TestCase {
+        std::u16string_view expression;
+        UErrorCode expectedErrorCode;
+        std::u16string_view expectedPattern;
+        std::u16string_view expectedRegeneratedPattern;
+    };
+    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
+        std::vector<TestCase>{
+            {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"},
+            {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]"},
+            {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]"},
+            // Substitution of lookupMatcher symbols takes place after unescaping.
+            {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]"},
+            // It does not take place in string literals.
+            {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]"},
+            {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]"},
+            {uR"([ 21 ])", U_ZERO_ERROR, u"[[: Co :][bc]]",
+            u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"},
+            {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]"},
+        }) {
+        UnicodeString actual;
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
+        if (errorCode != expectedErrorCode) {
+            errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " +
+                  u_errorName(errorCode));
+        }
+        if (set.toPattern(actual) != expectedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern +
+                  ", got " + actual);
+        }
+        if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression +
+                  u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern +
+                  ", got " + actual);
+        }
+    }
+    // Test what happens when we define syntax characters as symbols.  It is an extraordinarily bad idea
+    // to rely on this behaviour, but since it has been around since ICU 2.8, we probably should not
+    // change it unknowingly.
+    symbols.add(u'-', UnicodeSet(u"[{hyphenMinus}]", errorCode));
+    symbols.add(u'&', UnicodeSet(u"[{ampersand}]", errorCode));
+    // This one is never used, except if escaped.
+    symbols.add(u'[', UnicodeSet(u"[{leftSquareBracket}]", errorCode));
+    symbols.add(u'^', UnicodeSet(u"[{circumflexAccent}]", errorCode));
+    symbols.add(u'{', UnicodeSet(u"[{leftCurlyBracket}]", errorCode));
+    symbols.add(u'}', UnicodeSet(u"[{rightCurlyBracket}]", errorCode));
+    symbols.add(u'$', UnicodeSet(u"[{dollarSign}]", errorCode));
+    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
+        std::vector<TestCase>{
+            {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"},
+            {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"},
+            // The hyphen no longer works as set difference.
+            {u"[0-1]", U_ZERO_ERROR, u"[[a-z][{hyphenMinus}][bc]]", u"[a-z{hyphenMinus}]"},
+            {u"[!-0]", U_ZERO_ERROR, u"[![{hyphenMinus}][a-z]]", u"[!a-z{hyphenMinus}]"},
+            // String literals no longer work.
+            {uR"([!-/{0}])", U_ZERO_ERROR,
+            u"[![{hyphenMinus}]/[{leftCurlyBracket}][a-z][{rightCurlyBracket}]]",
+            u"[!/a-z{hyphenMinus}{leftCurlyBracket}{rightCurlyBracket}]"},
+            // The ampersand no longer works as set intersection.
+            {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :][{ampersand}][bc]]",
+            u"[bc-󰀀-󿿽􀀀-􏿽{ampersand}]"},
+            // Complementing still works.
+            {uR"([^ \u0000 ])", U_ZERO_ERROR, uR"([\u0001-\U0010FFFF])",
+            uR"([\u0001-\U0010FFFF])"},
+            // ^ elsewhere becomes a symbol rather than a syntax error.
+            {uR"([\u0000 ^ -])", U_ZERO_ERROR, uR"([\u0000[{circumflexAccent}][{hyphenMinus}]])",
+            uR"([\u0000{circumflexAccent}{hyphenMinus}])"},
+            // Opening brackets still work.
+            {uR"([^ [ [^] ] ])", U_ZERO_ERROR, uR"([^[[\u0000-\U0010FFFF]]])", uR"([])"},
+            // The only way to access the [ symbol is via escaping.
+            {uR"([ \[ ])", U_ZERO_ERROR, uR"([[{leftSquareBracket}]])", uR"([{leftSquareBracket}])"},
+            // Anchors are gone.
+            {uR"([$])", U_ZERO_ERROR, uR"([[{dollarSign}]])", uR"([{dollarSign}])"},
+        }) {
+        UnicodeString actual;
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
+        if (errorCode != expectedErrorCode) {
+            errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " +
+                  u_errorName(errorCode));
+        }
+        if (set.toPattern(actual) != expectedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern +
+                  ", got " + actual);
+        }
+        if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression +
+                  u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern +
+                  ", got " + actual);
+        }
+    }
+    // If ] is defined as a symbol, everything breaks except a lone symbol or property-query, and the
+    // constructor returns an error but not an empty set. Don’t do that.
+    symbols.add(u']', UnicodeSet(u"[{rightSquareBracket}]", errorCode));
+    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
+         std::vector<
+             std::tuple<std::u16string_view, UErrorCode, std::u16string_view, std::u16string_view>>{
+             {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
+             {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
+         }) {
+        UnicodeString actual;
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
+        if (errorCode != expectedErrorCode) {
+            errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " +
+                  u_errorName(errorCode));
+        }
+        if (set.toPattern(actual) != expectedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern +
+                  ", got " + actual);
+        }
+        if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression +
+                  u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern +
+                  ", got " + actual);
+        }
+    }
 }
 
 void UnicodeSetTest::TestSurrogate() {
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h
index 2ac22ba72e62..32abf828a30a 100644
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@@ -84,6 +84,7 @@ class UnicodeSetTest: public IntlTest {
     void TestInvalidCodePoint();
 
     void TestSymbolTable();
+    void TestLookupSymbolTable();
 
     void TestSurrogate();
 

From ed395a63ccd64c7cfe65f21da4495385a0ba8d02 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Mon, 18 Aug 2025 14:27:08 +0200
Subject: [PATCH 31/56] meow

---
 icu4c/source/test/intltest/usettest.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 4d52c95d0e4a..a5c249d4da37 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -2003,11 +2003,10 @@ void UnicodeSetTest::TestLookupSymbolTable() {
     // constructor returns an error but not an empty set. Don’t do that.
     symbols.add(u']', UnicodeSet(u"[{rightSquareBracket}]", errorCode));
     for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
-         std::vector<
-             std::tuple<std::u16string_view, UErrorCode, std::u16string_view, std::u16string_view>>{
-             {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
-             {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
-         }) {
+        std::vector<TestCase>{
+            {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
+            {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
+        }) {
         UnicodeString actual;
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);

From 3a4ab4575839f94200b9b2287a73d69065282933 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Mon, 18 Aug 2025 14:23:32 +0200
Subject: [PATCH 32/56] ICU-23179 Test UnicodeSet with lookupMatcher

---
 icu4c/source/test/intltest/usettest.cpp | 215 +++++++++++++++++++++++-
 icu4c/source/test/intltest/usettest.h   |   1 +
 2 files changed, 215 insertions(+), 1 deletion(-)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 016d3f85e63d..a5c249d4da37 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -14,6 +14,7 @@
 #include <stdio.h>
 #include <string.h>
 
+#include <array>
 #include <string_view>
 #include <unordered_map>
 
@@ -93,6 +94,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
     TESTCASE_AUTO(TestEscapePattern);
     TESTCASE_AUTO(TestInvalidCodePoint);
     TESTCASE_AUTO(TestSymbolTable);
+    TESTCASE_AUTO(TestLookupSymbolTable);
     TESTCASE_AUTO(TestSurrogate);
     TESTCASE_AUTO(TestPosixClasses);
     TESTCASE_AUTO(TestIteration);
@@ -1753,10 +1755,20 @@ void UnicodeSetTest::TestSymbolTable() {
     // Multiple test cases can be set up here.  Each test case
     // is terminated by null:
     // var, value, var, value,..., input pat., exp. output pat., null
-    const char* DATA[] = {
+    const char *DATA[] = {
         "us", "a-z", "[0-1$us]", "[0-1a-z]", nullptr,
         "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", nullptr,
         "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", nullptr,
+        // Things that probably should not work, but currently do:
+        "open", "[", "$open a-z]", "[a-z]", nullptr,
+        "open", "[", "close", "]", "hyphenMinus", "-",
+            "[ $open a $hyphenMinus z] $hyphenMinus [ c-z $close $hyphenMinus ]",
+            "[[a-z]-[c-z]-]", nullptr,
+        "string", "{", "end", "}", "[ $string Zeichenkette $end ]", "[{Zeichenkette}]", nullptr,
+        "privateUse", "[[:Co:]]", "$privateUse", "[[:Co:]]", nullptr,
+        "smiling", ":-]", "laughing", ":-D",
+            "[ {$smiling} $laughing $smiling",
+            R"([\-\:-D{\:\-\]}])", nullptr,
         nullptr
     };
 
@@ -1811,6 +1823,207 @@ void UnicodeSetTest::TestSymbolTable() {
             logln(UnicodeString("Ok, got ") + us.toPattern(a, true));
         }
     }
+    struct TestCase {
+        struct Variable {
+            std::u16string_view name;
+            std::u16string_view value;
+        };
+        std::vector<Variable> variables;
+        std::u16string_view expression;
+        UErrorCode expectedErrorCode;
+        std::u16string_view expectedPattern;
+    };
+    for (const auto &[variables, expression, expectedErrorCode, expectedPattern] : std::vector<TestCase>{
+            // You should not do this, but it works.
+            {{{u"privateUseOrUnassigned", u"[[:Co:][:Cn:]"}, {u"close", u"]"}},
+            u"$privateUseOrUnassigned$close",
+            U_ZERO_ERROR,
+            u"[[:Co:][:Cn:]]"},
+            // This works and it is fine.
+            {{{u"privateUse", u"[[:Co:]]"}}, u"$privateUse", U_ZERO_ERROR, u"[[:Co:]]"},
+            // This should work! But it does not. Note the doubled brackets on the one that works above.
+            // We are not yet inside the variable when we call lookahead(), so we try to parse
+            // $privateUse rather than [:Co:].
+            {{{u"privateUse", u"[:Co:]"}}, u"[$privateUse]", U_ILLEGAL_ARGUMENT_ERROR, u"[]"},
+            // This should not work, and it does not (we try to parse [$sad$surprised] as a
+            // property-query).
+            {{{u"sad", u":C"}, {u"surprised", u"o:"}},
+            u"[$sad$surprised]",
+            U_ILLEGAL_ARGUMENT_ERROR,
+            u"[]"},
+        }) {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        TokenSymbolTable symbols(errorCode);
+        if (U_FAILURE(errorCode)) {
+            errln("FAIL: Couldn’t construct symbol table");
+            continue;
+        }
+        for (const auto &[name, value] : variables) {
+            symbols.add(name, value, errorCode);
+            if (U_FAILURE(errorCode)) {
+                errln("FAIL: Couldn’t add variable " + name);
+                continue;
+            }
+        }
+        const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
+        if (errorCode != expectedErrorCode) {
+            errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " +
+                  u_errorName(errorCode));
+        }
+        UnicodeString actual;
+        if (set.toPattern(actual) != expectedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern +
+                  ", got " + actual);
+        }
+    }
+}
+
+void UnicodeSetTest::TestLookupSymbolTable() {
+    UErrorCode errorCode = U_ZERO_ERROR;
+    class TestSymbolTable : public SymbolTable {
+      public:
+        const UnicodeString *lookup(const UnicodeString &) const override {
+            return nullptr;
+        }
+
+        const UnicodeFunctor *lookupMatcher(UChar32 c) const override {
+            return symbols_.find(c) != symbols_.end() ? &symbols_.at(c)
+                                                                    : nullptr;
+        }
+
+        virtual UnicodeString parseReference(const UnicodeString &, ParsePosition &,
+                                             int32_t) const override {
+            return u"";
+        }
+
+        void add(UChar32 c, UnicodeSet set) {
+            symbols_[c] = set;
+        }
+
+      private:
+        std::unordered_map<UChar32, UnicodeSet> symbols_;
+    };
+    TestSymbolTable symbols;
+    symbols.add(u'0', UnicodeSet(u"[ a-z ]", errorCode));
+    symbols.add(u'1', UnicodeSet(u"[ b-c ]", errorCode));
+    symbols.add(u'2', UnicodeSet(u"[: Co :]", errorCode));
+    struct TestCase {
+        std::u16string_view expression;
+        UErrorCode expectedErrorCode;
+        std::u16string_view expectedPattern;
+        std::u16string_view expectedRegeneratedPattern;
+    };
+    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
+        std::vector<TestCase>{
+            {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"},
+            {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]"},
+            {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]"},
+            // Substitution of lookupMatcher symbols takes place after unescaping.
+            {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]"},
+            // It does not take place in string literals.
+            {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]"},
+            {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]"},
+            {uR"([ 21 ])", U_ZERO_ERROR, u"[[: Co :][bc]]",
+            u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"},
+            {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]"},
+        }) {
+        UnicodeString actual;
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
+        if (errorCode != expectedErrorCode) {
+            errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " +
+                  u_errorName(errorCode));
+        }
+        if (set.toPattern(actual) != expectedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern +
+                  ", got " + actual);
+        }
+        if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression +
+                  u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern +
+                  ", got " + actual);
+        }
+    }
+    // Test what happens when we define syntax characters as symbols.  It is an extraordinarily bad idea
+    // to rely on this behaviour, but since it has been around since ICU 2.8, we probably should not
+    // change it unknowingly.
+    symbols.add(u'-', UnicodeSet(u"[{hyphenMinus}]", errorCode));
+    symbols.add(u'&', UnicodeSet(u"[{ampersand}]", errorCode));
+    // This one is never used, except if escaped.
+    symbols.add(u'[', UnicodeSet(u"[{leftSquareBracket}]", errorCode));
+    symbols.add(u'^', UnicodeSet(u"[{circumflexAccent}]", errorCode));
+    symbols.add(u'{', UnicodeSet(u"[{leftCurlyBracket}]", errorCode));
+    symbols.add(u'}', UnicodeSet(u"[{rightCurlyBracket}]", errorCode));
+    symbols.add(u'$', UnicodeSet(u"[{dollarSign}]", errorCode));
+    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
+        std::vector<TestCase>{
+            {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"},
+            {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"},
+            // The hyphen no longer works as set difference.
+            {u"[0-1]", U_ZERO_ERROR, u"[[a-z][{hyphenMinus}][bc]]", u"[a-z{hyphenMinus}]"},
+            {u"[!-0]", U_ZERO_ERROR, u"[![{hyphenMinus}][a-z]]", u"[!a-z{hyphenMinus}]"},
+            // String literals no longer work.
+            {uR"([!-/{0}])", U_ZERO_ERROR,
+            u"[![{hyphenMinus}]/[{leftCurlyBracket}][a-z][{rightCurlyBracket}]]",
+            u"[!/a-z{hyphenMinus}{leftCurlyBracket}{rightCurlyBracket}]"},
+            // The ampersand no longer works as set intersection.
+            {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :][{ampersand}][bc]]",
+            u"[bc-󰀀-󿿽􀀀-􏿽{ampersand}]"},
+            // Complementing still works.
+            {uR"([^ \u0000 ])", U_ZERO_ERROR, uR"([\u0001-\U0010FFFF])",
+            uR"([\u0001-\U0010FFFF])"},
+            // ^ elsewhere becomes a symbol rather than a syntax error.
+            {uR"([\u0000 ^ -])", U_ZERO_ERROR, uR"([\u0000[{circumflexAccent}][{hyphenMinus}]])",
+            uR"([\u0000{circumflexAccent}{hyphenMinus}])"},
+            // Opening brackets still work.
+            {uR"([^ [ [^] ] ])", U_ZERO_ERROR, uR"([^[[\u0000-\U0010FFFF]]])", uR"([])"},
+            // The only way to access the [ symbol is via escaping.
+            {uR"([ \[ ])", U_ZERO_ERROR, uR"([[{leftSquareBracket}]])", uR"([{leftSquareBracket}])"},
+            // Anchors are gone.
+            {uR"([$])", U_ZERO_ERROR, uR"([[{dollarSign}]])", uR"([{dollarSign}])"},
+        }) {
+        UnicodeString actual;
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
+        if (errorCode != expectedErrorCode) {
+            errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " +
+                  u_errorName(errorCode));
+        }
+        if (set.toPattern(actual) != expectedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern +
+                  ", got " + actual);
+        }
+        if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression +
+                  u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern +
+                  ", got " + actual);
+        }
+    }
+    // If ] is defined as a symbol, everything breaks except a lone symbol or property-query, and the
+    // constructor returns an error but not an empty set. Don’t do that.
+    symbols.add(u']', UnicodeSet(u"[{rightSquareBracket}]", errorCode));
+    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
+        std::vector<TestCase>{
+            {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
+            {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
+        }) {
+        UnicodeString actual;
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
+        if (errorCode != expectedErrorCode) {
+            errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " +
+                  u_errorName(errorCode));
+        }
+        if (set.toPattern(actual) != expectedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern +
+                  ", got " + actual);
+        }
+        if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) {
+            errln(u"UnicodeSet(R\"(" + expression +
+                  u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern +
+                  ", got " + actual);
+        }
+    }
 }
 
 void UnicodeSetTest::TestSurrogate() {
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h
index 2ac22ba72e62..32abf828a30a 100644
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@@ -84,6 +84,7 @@ class UnicodeSetTest: public IntlTest {
     void TestInvalidCodePoint();
 
     void TestSymbolTable();
+    void TestLookupSymbolTable();
 
     void TestSurrogate();
 

From d5e73a8a62a7525777b60160e2982b7643936166 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Mon, 18 Aug 2025 18:11:33 +0200
Subject: [PATCH 33/56] ICU-23179 Test the exact sequence of lookups

---
 icu4c/source/test/intltest/usettest.cpp | 136 +++++++++++++++++++-----
 1 file changed, 110 insertions(+), 26 deletions(-)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index a5c249d4da37..5d1cf77247b7 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -15,6 +15,7 @@
 #include <string.h>
 
 #include <array>
+#include <map>
 #include <string_view>
 #include <unordered_map>
 
@@ -29,9 +30,11 @@
 #include "unicode/symtable.h"
 #include "unicode/utf8.h"
 #include "unicode/utf16.h"
+#include "unicode/utfiterator.h"
 #include "unicode/uversion.h"
 #include "cmemory.h"
 #include "hash.h"
+#include <variant>
 
 #define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
     if (U_FAILURE(status)) { \
@@ -1880,53 +1883,119 @@ void UnicodeSetTest::TestSymbolTable() {
 
 void UnicodeSetTest::TestLookupSymbolTable() {
     UErrorCode errorCode = U_ZERO_ERROR;
+    struct TestCase {
+        struct Variable {
+            std::u16string_view name;
+            std::u16string_view value;
+        };
+        std::u16string_view expression;
+        UErrorCode expectedErrorCode;
+        std::u16string_view expectedPattern;
+        std::u16string_view expectedRegeneratedPattern;
+        // Hyrum’s law at work: Some users (RBBI) depend on the sequencing of `lookup` and
+        // `lookupMatcher` calls, so we test that.
+        std::vector<std::variant<UnicodeString, UChar32>> expectedLookups;
+        // Variables for `lookup`.
+        std::vector<Variable> variables;
+    };
     class TestSymbolTable : public SymbolTable {
       public:
-        const UnicodeString *lookup(const UnicodeString &) const override {
-            return nullptr;
+        const UnicodeString *lookup(const UnicodeString &name) const override {
+            auto it = variables_.find(name);
+            lookupTrace_.push_back(name);
+            return it == variables_.end() ? nullptr : &it->second;
         }
 
         const UnicodeFunctor *lookupMatcher(UChar32 c) const override {
+            lookupTrace_.push_back(c);
             return symbols_.find(c) != symbols_.end() ? &symbols_.at(c)
                                                                     : nullptr;
         }
 
-        virtual UnicodeString parseReference(const UnicodeString &, ParsePosition &,
-                                             int32_t) const override {
-            return u"";
+        // Returns the u_isIDPart prefix of text[pos, limit). Note `limit` is an end index, not a length.
+        virtual UnicodeString parseReference(const UnicodeString &text, ParsePosition &pos,
+                                                 int32_t limit) const override {
+                const auto limitedText = std::u16string_view(text).substr(pos.getIndex(), limit - pos.getIndex());
+                for (auto codeUnits : header::utfStringCodePoints<UChar32, UTF_BEHAVIOR_FFFD>(limitedText)) {
+                    if (!u_isIDPart(codeUnits.codePoint())) {
+                        pos.setIndex(pos.getIndex() + (codeUnits.begin() - limitedText.begin()));
+                        // TODO(egg): In C++20, use the two-iterator constructor of std::u16string_view.
+                        return limitedText.substr(0, codeUnits.begin() - limitedText.begin());
+                    }
+                }
+                pos.setIndex(limit);
+                return limitedText;
         }
 
         void add(UChar32 c, UnicodeSet set) {
             symbols_[c] = set;
         }
 
+        void setVariables(const std::vector<TestCase::Variable>& variables) {
+            for (const auto &[name, value] : variables) {
+                variables_[name] = value;
+            }
+        }
+
+        const std::vector<std::variant<UnicodeString, UChar32>>& getLookupTrace() const {
+            return lookupTrace_;
+        }
+
+        void clearLookupTrace() {
+            lookupTrace_.clear();
+        }
+
       private:
         std::unordered_map<UChar32, UnicodeSet> symbols_;
+        std::map<UnicodeString, UnicodeString> variables_;
+        mutable std::vector<std::variant<UnicodeString, UChar32>> lookupTrace_;
     };
     TestSymbolTable symbols;
     symbols.add(u'0', UnicodeSet(u"[ a-z ]", errorCode));
     symbols.add(u'1', UnicodeSet(u"[ b-c ]", errorCode));
     symbols.add(u'2', UnicodeSet(u"[: Co :]", errorCode));
-    struct TestCase {
-        std::u16string_view expression;
-        UErrorCode expectedErrorCode;
-        std::u16string_view expectedPattern;
-        std::u16string_view expectedRegeneratedPattern;
-    };
-    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
-        std::vector<TestCase>{
-            {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"},
-            {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]"},
-            {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]"},
+    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
+                      expectedLookups, variables] : std::vector<TestCase>{
+            {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]", {u'0'}},
+            {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]", {u'0', u'-', u'1', u']'}},
+            {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'-', u'0'}},
+            // A call to lookupMatcher with the first character of the content of a variable happens
+            // immediately after a corresponding call to lookup, although we may lookup the variable
+            // several times before we call lookupMatcher.
+            {u"[0-$one]",
+            U_ZERO_ERROR,
+            u"[[a-z]-[bc]]",
+            u"[ad-z]",
+            {u'0', u'-', u"one", u"one", u'1', u']'},
+            {{u"zero", u"0"}, {u"one", u"1"}}},
+            {u"[$zero-$one]",
+            U_ZERO_ERROR,
+            u"[[a-z]-[bc]]",
+            u"[ad-z]",
+            {u"zero", u"zero", u"zero", u"zero", u'0', u'-', u"one", u"one", u'1', u']'},
+            {{u"zero", u"0"}, {u"one", u"1"}}},
+            // If the variable expands to multiple symbols, only the first one is sequenced right after
+            // the variable lookup.
+            {u"[$ten]",
+            U_ZERO_ERROR,
+            u"[[bc][a-z]]",
+            u"[a-z]",
+            {u"ten", u"ten", u"ten", u"ten", u'1', u'0', u']'},
+            {{u"ten", u"10"}}},
             // Substitution of lookupMatcher symbols takes place after unescaping.
-            {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]"},
+            {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'-', u'0'}},
             // It does not take place in string literals.
-            {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]"},
-            {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]"},
-            {uR"([ 21 ])", U_ZERO_ERROR, u"[[: Co :][bc]]",
-            u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"},
-            {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]"},
+            {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]", {u'!', u'-', u'/', u'{', u']'}},
+            {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]", {u'2', u'&', u'1', u']'}},
+            {uR"([ 21 ])",
+            U_ZERO_ERROR,
+            u"[[: Co :][bc]]",
+            u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]",
+            {u'2', u'1', u']'}},
+            {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]", {u'a', u'-', u'b', u'1', u']'}},
         }) {
+        symbols.setVariables(variables);
+        symbols.clearLookupTrace();
         UnicodeString actual;
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
@@ -1943,6 +2012,21 @@ void UnicodeSetTest::TestLookupSymbolTable() {
                   u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern +
                   ", got " + actual);
         }
+        if (symbols.getLookupTrace() != expectedLookups) {
+            UnicodeString expected;
+            UnicodeString actual;
+            for (const auto &l : expectedLookups) {
+                expected += std::holds_alternative<UChar32>(l)
+                                ? (u"u'" + UnicodeString(std::get<UChar32>(l)) + u"', ")
+                                : u"u\"" + std::get<UnicodeString>(l) + u"\", ";
+            }
+            for (const auto &l : symbols.getLookupTrace()) {
+                actual += std::holds_alternative<UChar32>(l)
+                              ? (u"u'" + UnicodeString(std::get<UChar32>(l)) + u"', ")
+                              : u"u\"" + std::get<UnicodeString>(l) + u"\", ";
+            }
+            errln(u"Unexpected sequence of lookups:\nExpected : " + expected + "\nActual   : " + actual);
+        }
     }
     // Test what happens when we define syntax characters as symbols.  It is an extraordinarily bad idea
     // to rely on this behaviour, but since it has been around since ICU 2.8, we probably should not
@@ -1955,8 +2039,8 @@ void UnicodeSetTest::TestLookupSymbolTable() {
     symbols.add(u'{', UnicodeSet(u"[{leftCurlyBracket}]", errorCode));
     symbols.add(u'}', UnicodeSet(u"[{rightCurlyBracket}]", errorCode));
     symbols.add(u'$', UnicodeSet(u"[{dollarSign}]", errorCode));
-    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
-        std::vector<TestCase>{
+    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
+                      expectedLookups, variables] : std::vector<TestCase>{
             {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"},
             {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"},
             // The hyphen no longer works as set difference.
@@ -2002,8 +2086,8 @@ void UnicodeSetTest::TestLookupSymbolTable() {
     // If ] is defined as a symbol, everything breaks except a lone symbol or property-query, and the
     // constructor returns an error but not an empty set. Don’t do that.
     symbols.add(u']', UnicodeSet(u"[{rightSquareBracket}]", errorCode));
-    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
-        std::vector<TestCase>{
+    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
+                      expectedLookups, variables] : std::vector<TestCase>{
             {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
             {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
         }) {

From ef59acb8ef05a30e4b4e3396b59d53f413ca58e0 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Mon, 18 Aug 2025 18:11:33 +0200
Subject: [PATCH 34/56] ICU-23179 Test the exact sequence of lookups

---
 icu4c/source/test/intltest/usettest.cpp | 140 +++++++++++++++++++-----
 1 file changed, 114 insertions(+), 26 deletions(-)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index a5c249d4da37..6b5fba510286 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -15,6 +15,7 @@
 #include <string.h>
 
 #include <array>
+#include <map>
 #include <string_view>
 #include <unordered_map>
 
@@ -29,9 +30,11 @@
 #include "unicode/symtable.h"
 #include "unicode/utf8.h"
 #include "unicode/utf16.h"
+#include "unicode/utfiterator.h"
 #include "unicode/uversion.h"
 #include "cmemory.h"
 #include "hash.h"
+#include <variant>
 
 #define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \
     if (U_FAILURE(status)) { \
@@ -1880,53 +1883,122 @@ void UnicodeSetTest::TestSymbolTable() {
 
 void UnicodeSetTest::TestLookupSymbolTable() {
     UErrorCode errorCode = U_ZERO_ERROR;
+    // We let `variables` be empty by default in the test cases below.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+    struct TestCase {
+        struct Variable {
+            std::u16string_view name;
+            std::u16string_view value;
+        };
+        std::u16string_view expression;
+        UErrorCode expectedErrorCode;
+        std::u16string_view expectedPattern;
+        std::u16string_view expectedRegeneratedPattern;
+        // Hyrum’s law at work: Some users (RBBI) depend on the sequencing of `lookup` and
+        // `lookupMatcher` calls, so we test that.
+        std::vector<std::variant<UnicodeString, UChar32>> expectedLookups;
+        // Variables for `lookup`.
+        std::vector<Variable> variables;
+    };
     class TestSymbolTable : public SymbolTable {
       public:
-        const UnicodeString *lookup(const UnicodeString &) const override {
-            return nullptr;
+        const UnicodeString *lookup(const UnicodeString &name) const override {
+            auto it = variables_.find(name);
+            lookupTrace_.push_back(name);
+            return it == variables_.end() ? nullptr : &it->second;
         }
 
         const UnicodeFunctor *lookupMatcher(UChar32 c) const override {
+            lookupTrace_.push_back(c);
             return symbols_.find(c) != symbols_.end() ? &symbols_.at(c)
                                                                     : nullptr;
         }
 
-        virtual UnicodeString parseReference(const UnicodeString &, ParsePosition &,
-                                             int32_t) const override {
-            return u"";
+        // Returns the u_isIDPart prefix of text[pos, limit). Note `limit` is an end index, not a length.
+        virtual UnicodeString parseReference(const UnicodeString &text, ParsePosition &pos,
+                                                 int32_t limit) const override {
+                const auto limitedText = std::u16string_view(text).substr(pos.getIndex(), limit - pos.getIndex());
+                for (auto codeUnits : header::utfStringCodePoints<UChar32, UTF_BEHAVIOR_FFFD>(limitedText)) {
+                    if (!u_isIDPart(codeUnits.codePoint())) {
+                        pos.setIndex(pos.getIndex() + (codeUnits.begin() - limitedText.begin()));
+                        // TODO(egg): In C++20, use the two-iterator constructor of std::u16string_view.
+                        return limitedText.substr(0, codeUnits.begin() - limitedText.begin());
+                    }
+                }
+                pos.setIndex(limit);
+                return limitedText;
         }
 
         void add(UChar32 c, UnicodeSet set) {
             symbols_[c] = set;
         }
 
+        void setVariables(const std::vector<TestCase::Variable>& variables) {
+            for (const auto &[name, value] : variables) {
+                variables_[name] = value;
+            }
+        }
+
+        const std::vector<std::variant<UnicodeString, UChar32>>& getLookupTrace() const {
+            return lookupTrace_;
+        }
+
+        void clearLookupTrace() {
+            lookupTrace_.clear();
+        }
+
       private:
         std::unordered_map<UChar32, UnicodeSet> symbols_;
+        std::map<UnicodeString, UnicodeString> variables_;
+        mutable std::vector<std::variant<UnicodeString, UChar32>> lookupTrace_;
     };
     TestSymbolTable symbols;
     symbols.add(u'0', UnicodeSet(u"[ a-z ]", errorCode));
     symbols.add(u'1', UnicodeSet(u"[ b-c ]", errorCode));
     symbols.add(u'2', UnicodeSet(u"[: Co :]", errorCode));
-    struct TestCase {
-        std::u16string_view expression;
-        UErrorCode expectedErrorCode;
-        std::u16string_view expectedPattern;
-        std::u16string_view expectedRegeneratedPattern;
-    };
-    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
-        std::vector<TestCase>{
-            {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"},
-            {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]"},
-            {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]"},
+    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
+                      expectedLookups, variables] : std::vector<TestCase>{
+            {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]", {u'0'}},
+            {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]", {u'0', u'-', u'1', u']'}},
+            {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'-', u'0'}},
+            // A call to lookupMatcher with the first character of the content of a variable happens
+            // immediately after a corresponding call to lookup, although we may lookup the variable
+            // several times before we call lookupMatcher.
+            {u"[0-$one]",
+            U_ZERO_ERROR,
+            u"[[a-z]-[bc]]",
+            u"[ad-z]",
+            {u'0', u'-', u"one", u"one", u'1', u']'},
+            {{u"zero", u"0"}, {u"one", u"1"}}},
+            {u"[$zero-$one]",
+            U_ZERO_ERROR,
+            u"[[a-z]-[bc]]",
+            u"[ad-z]",
+            {u"zero", u"zero", u"zero", u"zero", u'0', u'-', u"one", u"one", u'1', u']'},
+            {{u"zero", u"0"}, {u"one", u"1"}}},
+            // If the variable expands to multiple symbols, only the first one is sequenced right after
+            // the variable lookup.
+            {u"[$ten]",
+            U_ZERO_ERROR,
+            u"[[bc][a-z]]",
+            u"[a-z]",
+            {u"ten", u"ten", u"ten", u"ten", u'1', u'0', u']'},
+            {{u"ten", u"10"}}},
             // Substitution of lookupMatcher symbols takes place after unescaping.
-            {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]"},
+            {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'-', u'0'}},
             // It does not take place in string literals.
-            {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]"},
-            {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]"},
-            {uR"([ 21 ])", U_ZERO_ERROR, u"[[: Co :][bc]]",
-            u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"},
-            {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]"},
+            {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]", {u'!', u'-', u'/', u'{', u']'}},
+            {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]", {u'2', u'&', u'1', u']'}},
+            {uR"([ 21 ])",
+            U_ZERO_ERROR,
+            u"[[: Co :][bc]]",
+            u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]",
+            {u'2', u'1', u']'}},
+            {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]", {u'a', u'-', u'b', u'1', u']'}},
         }) {
+        symbols.setVariables(variables);
+        symbols.clearLookupTrace();
         UnicodeString actual;
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode);
@@ -1943,6 +2015,21 @@ void UnicodeSetTest::TestLookupSymbolTable() {
                   u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern +
                   ", got " + actual);
         }
+        if (symbols.getLookupTrace() != expectedLookups) {
+            UnicodeString expected;
+            UnicodeString actual;
+            for (const auto &l : expectedLookups) {
+                expected += std::holds_alternative<UChar32>(l)
+                                ? (u"u'" + UnicodeString(std::get<UChar32>(l)) + u"', ")
+                                : u"u\"" + std::get<UnicodeString>(l) + u"\", ";
+            }
+            for (const auto &l : symbols.getLookupTrace()) {
+                actual += std::holds_alternative<UChar32>(l)
+                              ? (u"u'" + UnicodeString(std::get<UChar32>(l)) + u"', ")
+                              : u"u\"" + std::get<UnicodeString>(l) + u"\", ";
+            }
+            errln(u"Unexpected sequence of lookups:\nExpected : " + expected + "\nActual   : " + actual);
+        }
     }
     // Test what happens when we define syntax characters as symbols.  It is an extraordinarily bad idea
     // to rely on this behaviour, but since it has been around since ICU 2.8, we probably should not
@@ -1955,8 +2042,8 @@ void UnicodeSetTest::TestLookupSymbolTable() {
     symbols.add(u'{', UnicodeSet(u"[{leftCurlyBracket}]", errorCode));
     symbols.add(u'}', UnicodeSet(u"[{rightCurlyBracket}]", errorCode));
     symbols.add(u'$', UnicodeSet(u"[{dollarSign}]", errorCode));
-    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
-        std::vector<TestCase>{
+    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
+                      expectedLookups, variables] : std::vector<TestCase>{
             {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"},
             {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"},
             // The hyphen no longer works as set difference.
@@ -2002,8 +2089,8 @@ void UnicodeSetTest::TestLookupSymbolTable() {
     // If ] is defined as a symbol, everything breaks except a lone symbol or property-query, and the
     // constructor returns an error but not an empty set. Don’t do that.
     symbols.add(u']', UnicodeSet(u"[{rightSquareBracket}]", errorCode));
-    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] :
-        std::vector<TestCase>{
+    for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
+                      expectedLookups, variables] : std::vector<TestCase>{
             {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
             {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
         }) {
@@ -2024,6 +2111,7 @@ void UnicodeSetTest::TestLookupSymbolTable() {
                   ", got " + actual);
         }
     }
+#pragma GCC diagnostic pop
 }
 
 void UnicodeSetTest::TestSurrogate() {

From 770c9aa9f40a25235b988180ae543e1d15123a53 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 20 Aug 2025 16:06:34 +0200
Subject: [PATCH 35/56] Ignore warnings

---
 icu4c/source/test/intltest/usettest.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 5d1cf77247b7..6b5fba510286 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -1883,6 +1883,9 @@ void UnicodeSetTest::TestSymbolTable() {
 
 void UnicodeSetTest::TestLookupSymbolTable() {
     UErrorCode errorCode = U_ZERO_ERROR;
+    // We let `variables` be empty by default in the test cases below.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
     struct TestCase {
         struct Variable {
             std::u16string_view name;
@@ -2108,6 +2111,7 @@ void UnicodeSetTest::TestLookupSymbolTable() {
                   ", got " + actual);
         }
     }
+#pragma GCC diagnostic pop
 }
 
 void UnicodeSetTest::TestSurrogate() {

From 110a54d78b0a0bd1d2528556a0606bf7aec6285f Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 21 Aug 2025 17:15:25 +0200
Subject: [PATCH 36/56] Abstract away the getPos/next/setPos/lookupMatcher
 dance

---
 icu4c/source/common/unicode/uniset.h   |  26 +-
 icu4c/source/common/uniset_closure.cpp |   2 +-
 icu4c/source/common/uniset_props.cpp   | 430 ++++++++++++++-----------
 3 files changed, 255 insertions(+), 203 deletions(-)

diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h
index 2d73df2fcdac..d805fd9e8156 100644
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -1697,6 +1697,7 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter {
                                  UErrorCode& status);
 
     void applyPattern(const UnicodeString &pattern,
+                      const ParsePosition& parsePosition,
                       RuleCharacterIterator &chars,
                       const SymbolTable *symbols,
                       UnicodeString &rebuiltPat,
@@ -1709,18 +1710,16 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter {
     // applied).  They add to *this the elements of the set that the parsed construct represents.
     // https://www.unicode.org/reports/tr61/tr61-1.html#Set-Operations.
 
-    void parseUnicodeSet(const UnicodeString &pattern,
-                         RuleCharacterIterator &chars,
-                         const SymbolTable *symbols,
+    class Lexer;
+
+    void parseUnicodeSet(Lexer &lexer,
                          UnicodeString &rebuiltPat,
                          uint32_t options,
                          UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                          int32_t depth,
                          UErrorCode &ec);
 
-    void parseUnion(const UnicodeString &pattern,
-                    RuleCharacterIterator &chars,
-                    const SymbolTable *symbols,
+    void parseUnion(Lexer &lexer,
                     UnicodeString &rebuiltPat,
                     uint32_t options,
                     UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
@@ -1728,30 +1727,23 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter {
                     bool &containsRestrictions,
                     UErrorCode &ec);
 
-    void parseTerm(const UnicodeString &pattern,
-                   RuleCharacterIterator &chars,
-                   const SymbolTable *symbols,
+    void parseTerm(Lexer &lexer,
                    UnicodeString &rebuiltPat,
                    uint32_t options,
                    UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                    int32_t depth,
-                   bool &containsRestriction,
+                   bool &containsRestrictions,
                    UErrorCode &ec);
 
-    void parseRestriction(const UnicodeString &pattern,
-                          RuleCharacterIterator &chars,
-                          const SymbolTable *symbols,
+    void parseRestriction(Lexer &lexer,
                           UnicodeString &rebuiltPat,
                           uint32_t options,
                           UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                           int32_t depth,
                           UErrorCode &ec);
 
-    void parseElements(const UnicodeString &pattern,
-                       RuleCharacterIterator &chars,
-                       const SymbolTable *symbols,
+    void parseElements(Lexer &lexer,
                        UnicodeString &rebuiltPat,
-                       uint32_t options,
                        UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                        int32_t depth,
                        UErrorCode &ec);
diff --git a/icu4c/source/common/uniset_closure.cpp b/icu4c/source/common/uniset_closure.cpp
index 05e9b0a37e04..2cd3e01ee324 100644
--- a/icu4c/source/common/uniset_closure.cpp
+++ b/icu4c/source/common/uniset_closure.cpp
@@ -101,7 +101,7 @@ UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
     // _applyPattern calls add() etc., which set pat to empty.
     UnicodeString rebuiltPat;
     RuleCharacterIterator chars(pattern, symbols, pos);
-    applyPattern(pattern, chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status);
+    applyPattern(pattern, pos, chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status);
     if (U_FAILURE(status)) return *this;
     if (chars.inVariable()) {
         // syntaxError(chars, "Extra chars in variable value");
diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 8c4b13f18e71..46401a273b4e 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -44,6 +44,7 @@
 #include "umutex.h"
 #include "uassert.h"
 #include "hash.h"
+#include <optional>
 
 U_NAMESPACE_USE
 
@@ -196,7 +197,7 @@ UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
     // _applyPattern calls add() etc., which set pat to empty.
     UnicodeString rebuiltPat;
     RuleCharacterIterator chars(pattern, symbols, pos);
-    applyPattern(pattern, chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, status);
+    applyPattern(pattern, pos, chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, status);
     if (U_FAILURE(status)) return;
     if (chars.inVariable()) {
         // syntaxError(chars, "Extra chars in variable value");
@@ -220,42 +221,164 @@ UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
 // Implementation: Pattern parsing
 //----------------------------------------------------------------
 
-namespace {
+class UnicodeSet::Lexer {
+  public:
+    Lexer(const UnicodeString &pattern,
+          const ParsePosition &parsePosition,
+          RuleCharacterIterator &chars,
+          uint32_t unicodeSetOptions,
+          const SymbolTable *const symbols)
+        : pattern_(pattern), parsePosition_(parsePosition), chars_(chars),
+          charsOptions_(RuleCharacterIterator::PARSE_VARIABLES | RuleCharacterIterator::PARSE_ESCAPES |
+                        ((unicodeSetOptions & USET_IGNORE_SPACE) != 0
+                             ? RuleCharacterIterator::SKIP_WHITESPACE
+                             : 0)),
+          symbols_(symbols) {}
+
+    class Lookahead {
+      public:
+        bool isUnescaped(UChar32 codePoint) const {
+            return !escaped_ && codePoint_ == codePoint;
+        }
 
-/**
- * A small all-inline class to manage a UnicodeSet pointer.  Add
- * operator->() etc. as needed.
- */
-class UnicodeSetPointer {
-    UnicodeSet* p;
-public:
-    inline UnicodeSetPointer() : p(nullptr) {}
-    inline ~UnicodeSetPointer() { delete p; }
-    inline UnicodeSet* pointer() { return p; }
-    inline UBool allocate() {
-        if (p == nullptr) {
-            p = new UnicodeSet();
+        bool isUnescapedNotStandIn(UChar32 codePoint) {
+            return isUnescaped(codePoint) && standIn() == nullptr;
+        }
+
+        void moveAfter() {
+            lexer_.chars_.setPos(after_);
+            lexer_.ahead_.reset();
+        }
+
+        bool acceptUnescapedNotStandIn(UChar32 codePoint) {
+            if (isUnescapedNotStandIn(codePoint)) {
+                moveAfter();
+                return true;
+            }
+            return false;
+        }
+
+        bool acceptUnescaped(UChar32 codePoint) {
+            if (isUnescaped(codePoint)) {
+                moveAfter();
+                return true;
+            }
+            return false;
+        }
+
+        UChar32 codePoint(UErrorCode &errorCode) const {
+            if (!U_FAILURE(errorCode)) {  // Keep any failure the caller already holds.
+                errorCode = errorCode_;  // Report the status recorded when this token was read.
+            }
+            return codePoint_;
+        }
+
+        bool escaped() const {
+            return escaped_;
         }
-        return p != nullptr;
+
+        const UnicodeSet *standIn() {
+            if (!standIn_.has_value()) {  // Look up at most once; the result is cached.
+                if (lexer_.symbols_ == nullptr) {
+                    standIn_ = nullptr;
+                } else {
+                    standIn_ =
+                        dynamic_cast<const UnicodeSet *>(lexer_.symbols_->lookupMatcher(codePoint_));
+                }
+            }
+            return *standIn_;
+        }
+
+        // Some parts of the grammar need two tokens of lookahead.  The second lookahead is not cached.
+        Lookahead oneMore() {
+            return oneMore(lexer_.charsOptions_);
+        }
+
+        Lookahead oneMore(int32_t charsOptions) {
+            RuleCharacterIterator::Pos before;
+            lexer_.chars_.getPos(before);
+            lexer_.chars_.setPos(after_);
+            auto const result = Lookahead(lexer_, lexer_.chars_, charsOptions);
+            lexer_.chars_.setPos(before);
+            return result;
+        }
+
+        Lookahead(Lexer &lexer, RuleCharacterIterator &chars, int32_t charsOptions)
+            : lexer_(lexer), errorCode_(U_ZERO_ERROR) {  // Never pass an indeterminate status.
+            RuleCharacterIterator::Pos before;
+            chars.getPos(before);
+            codePoint_ = chars.next(charsOptions, escaped_, errorCode_);
+            chars.getPos(after_);
+            chars.setPos(before);  // Peek only: put the iterator back where it was.
+        }
+
+      private:
+        Lexer &lexer_;
+        RuleCharacterIterator::Pos after_;
+        UErrorCode errorCode_;
+        UChar32 codePoint_;
+        UBool escaped_;
+        // `std::nullopt` if we have not yet called `lookupMatcher`, otherwise the result of
+        // `lookupMatcher` (which may be `nullptr`).
+        std::optional<const UnicodeSet *> standIn_;
+
+        friend class Lexer;
+    };
+
+    UnicodeString getPositionForDebugging() const {
+        return pattern_.tempSubString(0, parsePosition_.getIndex()) + u"☞" +
+               pattern_.tempSubString(parsePosition_.getIndex(), 60);
     }
-};
 
-constexpr int32_t MAX_DEPTH = 100;
+    Lookahead &lookahead() {
+        if (!ahead_.has_value()) {
+            ahead_.emplace(*this, chars_, charsOptions_);
+        }
+        return *ahead_;
+    }
 
-constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) {
-    int32_t opts = RuleCharacterIterator::PARSE_VARIABLES | RuleCharacterIterator::PARSE_ESCAPES;
-    if ((unicodeSetOptions & USET_IGNORE_SPACE) != 0) {
-        opts |= RuleCharacterIterator::SKIP_WHITESPACE;
+    bool resemblesPropertyPattern() {
+        Lookahead first =
+            Lookahead(*this, chars_, charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES);
+        if (first.codePoint_ != u'[' && first.codePoint_ != u'\\') {
+            return false;
+        }
+        Lookahead second = first.oneMore(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
+                                                           RuleCharacterIterator::SKIP_WHITESPACE));
+        return (first.codePoint_ == u'[' && second.codePoint_ == ':') ||
+               (first.codePoint_ == u'\\' &&
+                (second.codePoint_ == u'p' || second.codePoint_ == u'P' || second.codePoint_ == u'N'));
     }
-    return opts;
-}
 
-const UnicodeSet *getMatcherSymbol(const SymbolTable *const symbols, const UChar32 c) {
-    if (symbols == nullptr) {
-      return nullptr;
+    // For use in functions that take the `RuleCharacterIterator` directly; clears the lookahead cache so
+    // that any advancement of the `RuleCharacterIterator` is taken into account by subsequent calls to
+    // `lookahead`.  The resulting `RuleCharacterIterator` must not be used once `lookahead` has been
+    // called.
+    RuleCharacterIterator &getCharacterIterator() {
+        ahead_.reset();
+        return chars_;
     }
-    return dynamic_cast<const UnicodeSet *>(symbols->lookupMatcher(c));
-}
+
+    int32_t charsOptions() {
+        return charsOptions_;
+    }
+
+    bool atEnd() const {
+        return chars_.atEnd();
+    }
+
+  private:
+    const UnicodeString &pattern_;
+    const ParsePosition &parsePosition_;
+    RuleCharacterIterator &chars_;
+    const int32_t charsOptions_;
+    const SymbolTable *const symbols_;
+    std::optional<Lookahead> ahead_;
+};
+
+namespace {
+
+constexpr int32_t MAX_DEPTH = 100;
 
 #if 0
 #define U_UNICODESET_TRACE(...)                                                                         \
@@ -314,24 +437,16 @@ const UnicodeSet *getMatcherSymbol(const SymbolTable *const symbols, const UChar
             return;                                                                                     \
         }                                                                                               \
     } while (false)
-#define U_UNICODESET_RETURN_WITH_PARSE_ERROR(expected, actual, chars, ec)                               \
+#define U_UNICODESET_RETURN_WITH_PARSE_ERROR(expected, actual, lexer, ec)                               \
     do {                                                                                                \
-    constexpr std::string_view functionName = __func__;                                             \
-        static_assert(functionName.substr(0, 5) == "parse");\
+        constexpr std::string_view functionName = __func__;                                             \
+        static_assert(functionName.substr(0, 5) == "parse");                                            \
         std::string actualUTF8;                                                                         \
-        UnicodeString ahead;                                                                            \
-        std::string aheadUTF8;                                                                          \
-        std::string behindUTF8;                                                                          \
-        (chars).lookahead(ahead); \
-        printf("*** Expected %s, got '%s' %s☜%s\n", (expected),                                            \
+        std::string contextUTF8;                                                                        \
+        printf("*** Expected %s, got '%s' %s\n", (expected),                                            \
                UnicodeString(actual).toUTF8String(actualUTF8).c_str(),                                  \
-               pattern.tempSubString(0, pattern.length() - ahead.length())                              \
-                   .toUTF8String(behindUTF8)                                                            \
-                   .c_str(),                                                                            \
-               pattern.tempSubString(pattern.length() - ahead.length(), 60)                              \
-                   .toUTF8String(aheadUTF8)                                                             \
-                   .c_str());                           \
-        printf("--- in %s l. %d\n", __func__ + 5, __LINE__);                                                \
+               lexer.getPositionForDebugging().toUTF8String(contextUTF8).c_str());                      \
+        printf("--- in %s l. %d\n", __func__ + 5, __LINE__);                                            \
         (ec) = U_MALFORMED_SET;                                                                         \
         return;                                                                                         \
     } while (false)
@@ -342,6 +457,7 @@ const UnicodeSet *getMatcherSymbol(const SymbolTable *const symbols, const UChar
  * Parse the pattern from the given RuleCharacterIterator.  The
  * iterator is advanced over the parsed pattern.
  * @param pattern The pattern, only used by debug traces.
+ * @param parsePosition The ParsePosition underlying chars, only used by debug traces.
  * @param chars iterator over the pattern characters.  Upon return
  * it will be advanced to the first character after the parsed
  * pattern, or the end of the iteration if all characters are
@@ -355,6 +471,7 @@ const UnicodeSet *getMatcherSymbol(const SymbolTable *const symbols, const UChar
  */
 
 void UnicodeSet::applyPattern(const UnicodeString &pattern,
+                              const ParsePosition &parsePosition,
                               RuleCharacterIterator &chars,
                               const SymbolTable *symbols,
                               UnicodeString &rebuiltPat,
@@ -362,22 +479,22 @@ void UnicodeSet::applyPattern(const UnicodeString &pattern,
                               UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                               UErrorCode &ec) {
     if (U_FAILURE(ec)) return;
-    parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, /*depth=*/0, ec);
+    Lexer lexer(pattern, parsePosition, chars, options, symbols);
+    parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, /*depth=*/0, ec);
 }
 
-void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern,
-                                 RuleCharacterIterator &chars,
-                                 const SymbolTable* symbols,
+void UnicodeSet::parseUnicodeSet(Lexer &lexer,
                                  UnicodeString& rebuiltPat,
                                  uint32_t options,
                                  UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
-                                 int32_t depth, UErrorCode &ec) {
+                                 int32_t depth,
+                                 UErrorCode &ec) {
     clear();
     U_UNICODESET_TRACE();
 
     if (depth > MAX_DEPTH) {
         U_UNICODESET_RETURN_WITH_PARSE_ERROR(("depth <= " + std::to_string(MAX_DEPTH)).c_str(),
-                                             ("depth = " + std::to_string(depth)).c_str(), chars, ec);
+                                             ("depth = " + std::to_string(depth)).c_str(), lexer, ec);
     }
 
     bool isComplement = false;
@@ -388,17 +505,16 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern,
     bool preserveSyntaxInPattern = false;
     // A pattern that preserves the original syntax but strips spaces, normalizes escaping, etc.
     UnicodeString prettyPrintedPattern;
-    if (resemblesPropertyPattern(chars, charsOptions(options))) {
+    if (lexer.resemblesPropertyPattern()) {
         // UnicodeSet ::= property-query | named-element
         U_UNICODESET_TRACE("property-query | named-element");
-        chars.skipIgnored(charsOptions(options));
+        lexer.getCharacterIterator().skipIgnored(lexer.charsOptions());
         UnicodeSet propertyQuery;
-        propertyQuery.applyPropertyPattern(chars, prettyPrintedPattern, ec);
+        propertyQuery.applyPropertyPattern(lexer.getCharacterIterator(), prettyPrintedPattern, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
         addAll(propertyQuery);
         preserveSyntaxInPattern = true;
     } else {
-        UBool escaped = false;
         // TODO(egg): In PD UTS 61, add ^ to set-operator, remove [^.
         // UnicodeSet ::=                [   Union ]
         //              | Complement ::= [ ^ Union ]
@@ -407,37 +523,29 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern,
         // Where a MatcherSymbol may be a character or an escape.
         // Strings that would match MatcherSymbol effectively get removed from
         // all other terminals of the grammar, except [.
-        UChar32 c = chars.next(charsOptions(options), escaped, ec);
-        U_UNICODESET_RETURN_IF_ERROR(ec);
-        if (!escaped && c == u'[') {
+        if (lexer.lookahead().acceptUnescaped(u'[')) {
             prettyPrintedPattern.append(u'[');
-            RuleCharacterIterator::Pos afterBracket;
-            chars.getPos(afterBracket);
-            c = chars.next(charsOptions(options), escaped, ec);
-            U_UNICODESET_RETURN_IF_ERROR(ec);
-            if (!escaped && c == u'^') {
+            if (lexer.lookahead().acceptUnescaped(u'^')) {
                 prettyPrintedPattern.append(u'^');
                 isComplement = true;
-            } else {
-                chars.setPos(afterBracket);
             }
-            parseUnion(pattern, chars, symbols, prettyPrintedPattern, options, caseClosure, depth,
+            parseUnion(lexer, prettyPrintedPattern, options, caseClosure, depth,
                        /*containsRestrictions=*/preserveSyntaxInPattern, ec);
             U_UNICODESET_RETURN_IF_ERROR(ec);
-            c = chars.next(charsOptions(options), escaped, ec);
-            U_UNICODESET_RETURN_IF_ERROR(ec);
-            if (escaped || c != u']') {
-                U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", c, chars, ec);
+            if (!lexer.lookahead().acceptUnescaped(u']')) {
+                U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", lexer.lookahead().codePoint(ec), lexer, ec);
             }
             prettyPrintedPattern.append(u']');
         } else {
-            const UnicodeSet *set = getMatcherSymbol(symbols, c);
+            const UnicodeSet *set = lexer.lookahead().standIn();
             if (set != nullptr) {
                 *this = *set;
                 this->_toPattern(rebuiltPat, /*escapeUnprintable=*/false);
+                lexer.lookahead().moveAfter();
                 return;
             }
-            U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)", c, chars, ec);
+            U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)",
+                                                 lexer.lookahead().codePoint(ec), lexer, ec);
         }
     }
 
@@ -460,9 +568,7 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern,
     }
 }
 
-void UnicodeSet::parseUnion(const UnicodeString &pattern,
-                            RuleCharacterIterator &chars,
-                            const SymbolTable *symbols,
+void UnicodeSet::parseUnion(Lexer &lexer,
                             UnicodeString &rebuiltPat,
                             uint32_t options,
                             UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
@@ -470,64 +576,47 @@ void UnicodeSet::parseUnion(const UnicodeString &pattern,
                             bool &containsRestrictions,
                             UErrorCode &ec) {
     U_UNICODESET_TRACE();
-    UBool escaped = false;
-    RuleCharacterIterator::Pos position;
-    chars.getPos(position);
     // Union ::= Terms
     //         | UnescapedHyphenMinus Terms
     //         | Terms UnescapedHyphenMinus
     //         | UnescapedHyphenMinus Terms UnescapedHyphenMinus
     // Terms ::= ""
     //         | Terms Term
-    UChar32 c = chars.next(charsOptions(options), escaped, ec);
-    U_UNICODESET_RETURN_IF_ERROR(ec);
-    if (!escaped && c == u'-' && getMatcherSymbol(symbols, c)) {
+    if (lexer.lookahead().acceptUnescapedNotStandIn(u'-')) {
         add(u'-');
         // When we otherwise preserve the syntax, we escape an initial UnescapedHyphenMinus, but not a
         // final one, for consistency with older ICU behaviour.
         rebuiltPat.append(u"\\-");
-    } else {
-        chars.setPos(position);
     }
-    while (!chars.atEnd()) {
-        chars.getPos(position);
-        c = chars.next(charsOptions(options), escaped, ec);
-        U_UNICODESET_RETURN_IF_ERROR(ec);
-        if (getMatcherSymbol(symbols, c) == nullptr) {
-            if (!escaped && c == u'-') {
-                // We can be here on the first iteration: [--] is allowed by the
-                // grammar and by the old parser.
-                rebuiltPat.append(u'-');
-                add(u'-');
+    while (!lexer.atEnd()) {
+        if (lexer.lookahead().acceptUnescapedNotStandIn(u'-')) {
+            // We can be here on the first iteration: [--] is allowed by the
+            // grammar and by the old parser.
+            rebuiltPat.append(u'-');
+            add(u'-');
+            return;
+        } else if (lexer.lookahead().isUnescapedNotStandIn(u'$')) {
+            Lexer::Lookahead afterDollar = lexer.lookahead().oneMore();
+            if (afterDollar.isUnescaped(u']')) {
+                // ICU extensions: A $ is allowed as a literal-element.
+                // A Term at the end of a Union consisting of a single $ is an anchor.
+                rebuiltPat.append(u'$');
+                // Consume the dollar.
+                lexer.lookahead().moveAfter();
+                add(U_ETHER);
+                containsRestrictions = true;
                 return;
-            } else if (!escaped && c == u'$') {
-                RuleCharacterIterator::Pos afterDollar;
-                chars.getPos(afterDollar);
-                c = chars.next(charsOptions(options), escaped, ec);
-                if (!escaped && c == u']') {
-                    // ICU extensions: A $ is allowed as a literal-element.
-                    // A Term at the end of a Union consisting of a single $ is an anchor.
-                    rebuiltPat.append(u'$');
-                    chars.setPos(afterDollar);
-                    add(U_ETHER);
-                    containsRestrictions = true;
-                    return;
-                }
             }
         }
-        chars.setPos(position);
-        if (!escaped && c == ']' && getMatcherSymbol(symbols, c) == nullptr) {
+        if (lexer.lookahead().isUnescapedNotStandIn(u']')) {
             return;
         }
-        parseTerm(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, containsRestrictions,
-                  ec);
+        parseTerm(lexer, rebuiltPat, options, caseClosure, depth, containsRestrictions, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
     }
 }
 
-void UnicodeSet::parseTerm(const UnicodeString &pattern,
-                           RuleCharacterIterator &chars,
-                           const SymbolTable *symbols,
+void UnicodeSet::parseTerm(Lexer &lexer,
                            UnicodeString &rebuiltPat,
                            uint32_t options,
                            UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
@@ -535,40 +624,32 @@ void UnicodeSet::parseTerm(const UnicodeString &pattern,
                            bool &containsRestriction,
                            UErrorCode &ec) {
     U_UNICODESET_TRACE();
-    UBool escaped = false;
-    RuleCharacterIterator::Pos termStart;
-    chars.getPos(termStart);
     // Term ::= Elements
     //        | Restriction
-    const UChar32 ahead = chars.next(charsOptions(options), escaped, ec);
-    chars.setPos(termStart);
-    if (getMatcherSymbol(symbols, ahead) != nullptr || !escaped && ahead == '[' ||
-        resemblesPropertyPattern(chars, charsOptions(options))) {
+    if (lexer.lookahead().standIn() != nullptr || lexer.lookahead().isUnescaped('[') ||
+        lexer.resemblesPropertyPattern()) {
         containsRestriction = true;
-        parseRestriction(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
+        parseRestriction(lexer, rebuiltPat, options, caseClosure, depth, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
     } else {
-        parseElements(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec);
+        parseElements(lexer, rebuiltPat, caseClosure, depth, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
     }
 }
 
-void UnicodeSet::parseRestriction(const UnicodeString &pattern,
-                                  RuleCharacterIterator &chars,
-                                  const SymbolTable *symbols,
+void UnicodeSet::parseRestriction(Lexer &lexer,
                                   UnicodeString &rebuiltPat,
                                   uint32_t options,
                                   UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
-                                  int32_t depth, UErrorCode &ec) {
+                                  int32_t depth,
+                                  UErrorCode &ec) {
     U_UNICODESET_TRACE();
-    UBool escaped = false;
     // Restriction ::= UnicodeSet
     //               | Intersection ::= Restriction & UnicodeSet
     //               | Difference   ::= Restriction - UnicodeSet
     // Start by parsing the first UnicodeSet.
     UnicodeSet leftHandSide;
-    leftHandSide.parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth + 1,
-                                 ec);
+    leftHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec);
     addAll(leftHandSide);
     U_UNICODESET_RETURN_IF_ERROR(ec);
     // Now keep looking for an operator that would continue the Restriction.
@@ -576,55 +657,41 @@ void UnicodeSet::parseRestriction(const UnicodeString &pattern,
     // return.
     for (;;) {
         RuleCharacterIterator::Pos beforeOperator;
-        chars.getPos(beforeOperator);
-        const UChar32 op = chars.next(charsOptions(options), escaped, ec);
-        U_UNICODESET_RETURN_IF_ERROR(ec);
-        if (getMatcherSymbol(symbols, op)) {
+        if (lexer.lookahead().standIn() != nullptr) {
             // Not an operator, end of the Restriction.
-            chars.setPos(beforeOperator);
             return;
         }
-        if (!escaped && op == u'&') {
+        if (lexer.lookahead().acceptUnescaped(u'&')) {
             // Intersection ::= Restriction & UnicodeSet
             rebuiltPat.append(u'&');
             UnicodeSet rightHandSide;
-            rightHandSide.parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure,
-                                          depth + 1, ec);
+            rightHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec);
             U_UNICODESET_RETURN_IF_ERROR(ec);
             retainAll(rightHandSide);
-        } else if (!escaped && op == u'-') {
+        } else if (lexer.lookahead().isUnescaped(u'-')) {
             // Here the grammar requires two tokens of lookahead to figure out whether the - the operator
             // of a Difference or an UnescapedHyphenMinus in the enclosing Union.
-            RuleCharacterIterator::Pos afterOperator;
-            chars.getPos(afterOperator);
-            const UChar32 ahead = chars.next(charsOptions(options), escaped, ec);
-            U_UNICODESET_RETURN_IF_ERROR(ec);
-            if (!escaped && ahead == u']') {
+            if (lexer.lookahead().oneMore().isUnescaped(u']')) {
                 // The operator is actually an UnescapedHyphenMinus; terminate the Restriction before it.
-                chars.setPos(beforeOperator);
                 return;
             }
-            chars.setPos(afterOperator);
+            // Consume the hyphen-minus.
+            lexer.lookahead().moveAfter();
             // Difference ::= Restriction - UnicodeSet
             rebuiltPat.append(u'-');
             UnicodeSet rightHandSide;
-            rightHandSide.parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure,
-                                          depth + 1, ec);
+            rightHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec);
             U_UNICODESET_RETURN_IF_ERROR(ec);
             removeAll(rightHandSide);
         } else {
             // Not an operator, end of the Restriction.
-            chars.setPos(beforeOperator);
             return;
         }
     }
 }
 
-void UnicodeSet::parseElements(const UnicodeString &pattern,
-                               RuleCharacterIterator &chars,
-                               const SymbolTable *symbols,
+void UnicodeSet::parseElements(Lexer &lexer,
                                UnicodeString &rebuiltPat,
-                               uint32_t options,
                                UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                                int32_t depth,
                                UErrorCode &ec) {
@@ -636,34 +703,33 @@ void UnicodeSet::parseElements(const UnicodeString &pattern,
     //                | escaped-element
     // Element      ::= RangeElement
     //                | string-literal
-    UBool escaped = false;
-    const UChar32 first = chars.next(charsOptions(options), escaped, ec);
+    const UChar32 first = lexer.lookahead().codePoint(ec);
     U_UNICODESET_RETURN_IF_ERROR(ec);
-    if (!escaped) {
+    if (!lexer.lookahead().escaped()) {
         switch (first) {
         case u'-':
         case u'&':
         case u'[':
         case u']':
         case u'^':
-            U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement | string-literal", first, chars, ec);
-            // Unescaped '$'
+            U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement | string-literal", first, lexer, ec);
         case u'{': {
+            lexer.lookahead().moveAfter();
             rebuiltPat.append(u'{');
             UnicodeString string;
-            UChar32 c;
-            while (!chars.atEnd()) {
-                c = chars.next(charsOptions(options), escaped, ec);
-                U_UNICODESET_RETURN_IF_ERROR(ec);
-                if (!escaped && c == u'}') {
+            while (!lexer.atEnd()) {
+                if (lexer.lookahead().acceptUnescaped('}')) {
                     rebuiltPat.append(u'}');
                     add(string);
                     return;
                 }
+                const UChar32 c = lexer.lookahead().codePoint(ec);
+                U_UNICODESET_RETURN_IF_ERROR(ec);
+                lexer.lookahead().moveAfter();
                 _appendToPat(rebuiltPat, c, /*escapeUnprintable=*/false);
                 string.append(c);
             }
-            U_UNICODESET_RETURN_WITH_PARSE_ERROR("}", RuleCharacterIterator::DONE, chars, ec);
+            U_UNICODESET_RETURN_WITH_PARSE_ERROR("}", RuleCharacterIterator::DONE, lexer, ec);
         }
         case u'}':
         case u'$':
@@ -672,35 +738,32 @@ void UnicodeSet::parseElements(const UnicodeString &pattern,
             break;
         }
     }
+    lexer.lookahead().moveAfter();
     _appendToPat(rebuiltPat, first, /*escapeUnprintable=*/false);
     RuleCharacterIterator::Pos beforeOperator;
-    chars.getPos(beforeOperator);
-    const UChar32 op = chars.next(charsOptions(options), escaped, ec);
-    U_UNICODESET_RETURN_IF_ERROR(ec);
-    if (escaped || op != u'-' || getMatcherSymbol(symbols, op) != nullptr) {
+    if (!lexer.lookahead().isUnescapedNotStandIn(u'-')) {
         // No operator,
         // Elements ::= Element
-        chars.setPos(beforeOperator);
         add(first);
         return;
     }
     // Here the grammar requires two tokens of lookahead to figure out whether the - the operator
     // of a Range or an UnescapedHyphenMinus in the enclosing Union.
-    const UChar32 ahead = chars.next(charsOptions(options), escaped, ec);
-    U_UNICODESET_RETURN_IF_ERROR(ec);
-    if (!escaped && ahead == u']') {
+    if (lexer.lookahead().oneMore().isUnescaped(u']')) {
         // The operator is actually an UnescapedHyphenMinus; terminate the Elements before it.
-        chars.setPos(beforeOperator);
         add(first);
         return;
     }
+    // Consume the hyphen-minus.
+    lexer.lookahead().moveAfter();
     // Elements ::= Range ::= RangeElement - RangeElement
     rebuiltPat.append(u'-');
-    const UChar32 last = ahead;
-    if (getMatcherSymbol(symbols, last) != nullptr) {
-        U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, chars, ec);
+    const UChar32 last = lexer.lookahead().codePoint(ec);
+    U_UNICODESET_RETURN_IF_ERROR(ec);
+    if (lexer.lookahead().standIn() != nullptr) {
+        U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, lexer, ec);
     }
-    if (!escaped) {
+    if (!lexer.lookahead().escaped()) {
         switch (last) {
         case u'-':
         case u'&':
@@ -708,17 +771,13 @@ void UnicodeSet::parseElements(const UnicodeString &pattern,
         case u']':
         case u'^':
         case u'{':
-            U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, chars, ec);
+            U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, lexer, ec);
         case u'$': {
             // Disallowed by UTS #61, but historically accepted by ICU except at the end of a Union.
             // This is an extension.
-            RuleCharacterIterator::Pos afterDollar;
-            chars.getPos(afterDollar);
-            UChar32 c = chars.next(charsOptions(options), escaped, ec);
-            chars.setPos(afterDollar);
-            if (!escaped && c == u']') {
-                U_UNICODESET_RETURN_WITH_PARSE_ERROR("Term after Range ending in unescaped $", c, chars,
-                                                     ec);
+            if (lexer.lookahead().oneMore().isUnescaped(u']')) {
+                U_UNICODESET_RETURN_WITH_PARSE_ERROR("Term after Range ending in unescaped $", u']',
+                                                     lexer, ec);
             }
             break;
         }
@@ -728,10 +787,11 @@ void UnicodeSet::parseElements(const UnicodeString &pattern,
             break;
         }
     }
+    lexer.lookahead().moveAfter();
     _appendToPat(rebuiltPat, last, /*escapeUnprintable=*/false);
     if (last <= first) {
-        U_UNICODESET_RETURN_WITH_PARSE_ERROR("first < last in Range",
-                                 UnicodeString(last) + u"-" + UnicodeString(first), chars, ec);
+        U_UNICODESET_RETURN_WITH_PARSE_ERROR(
+            "first < last in Range", UnicodeString(last) + u"-" + UnicodeString(first), lexer, ec);
     }
     add(first, last);
     return;

From 34bc05d7bbe6751a49816c8949538969a5c83d95 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 21 Aug 2025 17:16:52 +0200
Subject: [PATCH 37/56] Drop the U_UNICODESET_TRACE macro and its call sites

---
 icu4c/source/common/uniset_props.cpp | 50 ----------------------------
 1 file changed, 50 deletions(-)

diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 46401a273b4e..0652194441fa 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -380,50 +380,6 @@ namespace {
 
 constexpr int32_t MAX_DEPTH = 100;
 
-#if 0
-#define U_UNICODESET_TRACE(...)                                                                         \
-    struct UnicodeSetParserTrace {                                                                      \
-        char const *const symbol_;                                                                      \
-        int const depth_;                                                                               \
-        const UnicodeSet *const that_;                                                                  \
-        UnicodeSetParserTrace(char const *symbol, int depth, const UnicodeSet *that)                    \
-            : symbol_(symbol), depth_(depth), that_(that) {}                                            \
-        ~UnicodeSetParserTrace() {                                                                      \
-            UnicodeString ahead;                                                                        \
-            std::string aheadUTF8;                                                                      \
-            printf("%s%s\n", std::string(depth_ * 4, ' ').c_str(), symbol_);                            \
-            printf("%s\n", (UnicodeSet(*that_)                                                           \
-                               .complement()                                                            \
-                               .complement()                                                            \
-                               .toPattern(ahead)                                                        \
-                               .toUTF8String(aheadUTF8)                                                 \
-                               .c_str(),""));                                                               \
-        }                                                                                               \
-    };                                                                                                  \
-    UnicodeSetParserTrace unicodeSetParserTrace(                                                        \
-        std::string_view("" __VA_ARGS__).empty() ? __func__ + 5 : ("" __VA_ARGS__), depth, this);       \
-    do {                                                                                                \
-        char const *symbol = ("" __VA_ARGS__);                                                          \
-        if (std::string_view(symbol).empty()) {                                                         \
-            symbol = __func__ + 5;                                                                      \
-        }                                                                                               \
-        UnicodeString ahead;                                                                            \
-        std::string aheadUTF8;                                                                          \
-        printf("%s%s  > %s\n", std::string(depth * 4, ' ').c_str(), symbol,                             \
-               (chars).lookahead(ahead, 60).toUTF8String(aheadUTF8).c_str());                           \
-        printf("%s\n", (UnicodeSet(*this)                                                                \
-                           .complement()                                                                \
-                           .complement()                                                                \
-                           .toPattern(ahead)                                                            \
-                           .toUTF8String(aheadUTF8)                                                     \
-                           .c_str(),""));                                                                   \
-    } while (false)
-#else
-#define U_UNICODESET_TRACE(...)                                                                         \
-    do {                                                                                                \
-    } while (false)
-#endif
-
 #define U_UNICODESET_RETURN_IF_ERROR(ec)                                                                \
     do {                                                                                                \
     constexpr std::string_view functionName = __func__;\
@@ -490,7 +446,6 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer,
                                  int32_t depth,
                                  UErrorCode &ec) {
     clear();
-    U_UNICODESET_TRACE();
 
     if (depth > MAX_DEPTH) {
         U_UNICODESET_RETURN_WITH_PARSE_ERROR(("depth <= " + std::to_string(MAX_DEPTH)).c_str(),
@@ -507,7 +462,6 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer,
     UnicodeString prettyPrintedPattern;
     if (lexer.resemblesPropertyPattern()) {
         // UnicodeSet ::= property-query | named-element
-        U_UNICODESET_TRACE("property-query | named-element");
         lexer.getCharacterIterator().skipIgnored(lexer.charsOptions());
         UnicodeSet propertyQuery;
         propertyQuery.applyPropertyPattern(lexer.getCharacterIterator(), prettyPrintedPattern, ec);
@@ -575,7 +529,6 @@ void UnicodeSet::parseUnion(Lexer &lexer,
                             int32_t depth,
                             bool &containsRestrictions,
                             UErrorCode &ec) {
-    U_UNICODESET_TRACE();
     // Union ::= Terms
     //         | UnescapedHyphenMinus Terms
     //         | Terms UnescapedHyphenMinus
@@ -623,7 +576,6 @@ void UnicodeSet::parseTerm(Lexer &lexer,
                            int32_t depth,
                            bool &containsRestriction,
                            UErrorCode &ec) {
-    U_UNICODESET_TRACE();
     // Term ::= Elements
     //        | Restriction
     if (lexer.lookahead().standIn() != nullptr || lexer.lookahead().isUnescaped('[') ||
@@ -643,7 +595,6 @@ void UnicodeSet::parseRestriction(Lexer &lexer,
                                   UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                                   int32_t depth,
                                   UErrorCode &ec) {
-    U_UNICODESET_TRACE();
     // Restriction ::= UnicodeSet
     //               | Intersection ::= Restriction & UnicodeSet
     //               | Difference   ::= Restriction - UnicodeSet
@@ -695,7 +646,6 @@ void UnicodeSet::parseElements(Lexer &lexer,
                                UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                                int32_t depth,
                                UErrorCode &ec) {
-    U_UNICODESET_TRACE();
     // Elements     ::= Element
     //                | Range
     // Range        ::= RangeElement - RangeElement

From 5c44163384ac9fc889dc70383864a4a864a741f5 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 21 Aug 2025 17:19:19 +0200
Subject: [PATCH 38/56] ifdef out the remaining traces

---
 icu4c/source/common/uniset_props.cpp | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 0652194441fa..7ec0ddb58944 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -380,6 +380,10 @@ namespace {
 
 constexpr int32_t MAX_DEPTH = 100;
 
+#define U_DEBUGGING_UNICODESET_PARSING 0
+
+#if U_DEBUGGING_UNICODESET_PARSING
+
 #define U_UNICODESET_RETURN_IF_ERROR(ec)                                                                \
     do {                                                                                                \
     constexpr std::string_view functionName = __func__;\
@@ -407,6 +411,22 @@ constexpr int32_t MAX_DEPTH = 100;
         return;                                                                                         \
     } while (false)
 
+#else
+
+#define U_UNICODESET_RETURN_IF_ERROR(ec)                                                                \
+    do {                                                                                                \
+        if (U_FAILURE(ec)) {                                                                            \
+            return;                                                                                     \
+        }                                                                                               \
+    } while (false)
+#define U_UNICODESET_RETURN_WITH_PARSE_ERROR(expected, actual, lexer, ec)                               \
+    do {                                                                                                \
+        (ec) = U_MALFORMED_SET;                                                                         \
+        return;                                                                                         \
+    } while (false)
+
+#endif
+
 }  // namespace
 
 /**

From da4b123626e7f158b47f7b60296140e30ecf758c Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 21 Aug 2025 17:28:11 +0200
Subject: [PATCH 39/56] Remove the old parsing loop that was disabled with #if 0

---
 icu4c/source/common/uniset_props.cpp | 359 ---------------------------
 1 file changed, 359 deletions(-)

diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 7ec0ddb58944..3d9774473c76 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -767,365 +767,6 @@ void UnicodeSet::parseElements(Lexer &lexer,
     return;
 }
 
-    #if 0
-    while (mode != 2 && !chars.atEnd()) {
-        U_ASSERT((lastItem == 0 && op == 0) ||
-                 (lastItem == 1 && (op == 0 || op == u'-')) ||
-                 (lastItem == 2 && (op == 0 || op == u'-' || op == u'&')));
-
-        UChar32 c = 0;
-        UBool literal = false;
-        UnicodeSet* nested = nullptr; // alias - do not delete
-
-        // -------- Check for property pattern
-
-        // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
-        int8_t setMode = 0;
-        if (resemblesPropertyPattern(chars, opts)) {
-            setMode = 2;
-        }
-
-        // -------- Parse '[' of opening delimiter OR nested set.
-        // If there is a nested set, use `setMode' to define how
-        // the set should be parsed.  If the '[' is part of the
-        // opening delimiter for this pattern, parse special
-        // strings "[", "[^", "[-", and "[^-".  Check for stand-in
-        // characters representing a nested set in the symbol
-        // table.
-
-        else {
-            // Prepare to backup if necessary
-            chars.getPos(backup);
-            c = chars.next(opts, literal, ec);
-            if (U_FAILURE(ec)) return;
-
-            if (c == u'[' && !literal) {
-                if (mode == 1) {
-                    chars.setPos(backup); // backup
-                    setMode = 1;
-                } else {
-                    // Handle opening '[' delimiter
-                    mode = 1;
-                    patLocal.append(u'[');
-                    chars.getPos(backup); // prepare to backup
-                    c = chars.next(opts, literal, ec); 
-                    if (U_FAILURE(ec)) return;
-                    if (c == u'^' && !literal) {
-                        invert = true;
-                        patLocal.append(u'^');
-                        chars.getPos(backup); // prepare to backup
-                        c = chars.next(opts, literal, ec);
-                        if (U_FAILURE(ec)) return;
-                    }
-                    // Fall through to handle special leading '-';
-                    // otherwise restart loop for nested [], \p{}, etc.
-                    if (c == u'-') {
-                        literal = true;
-                        // Fall through to handle literal '-' below
-                    } else {
-                        chars.setPos(backup); // backup
-                        continue;
-                    }
-                }
-            } else if (symbols != nullptr) {
-                const UnicodeFunctor *m = symbols->lookupMatcher(c);
-                if (m != nullptr) {
-                    const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
-                    if (ms == nullptr) {
-                        ec = U_MALFORMED_SET;
-                        return;
-                    }
-                    // casting away const, but `nested' won't be modified
-                    // (important not to modify stored set)
-                    nested = const_cast<UnicodeSet*>(ms);
-                    setMode = 3;
-                }
-            }
-        }
-
-        // -------- Handle a nested set.  This either is inline in
-        // the pattern or represented by a stand-in that has
-        // previously been parsed and was looked up in the symbol
-        // table.
-
-        if (setMode != 0) {
-            if (lastItem == 1) {
-                if (op != 0) {
-                    // syntaxError(chars, "Char expected after operator");
-                    ec = U_MALFORMED_SET;
-                    return;
-                }
-                add(lastChar, lastChar);
-                _appendToPat(patLocal, lastChar, false);
-                lastItem = 0;
-                op = 0;
-            }
-
-            if (op == u'-' || op == u'&') {
-                patLocal.append(op);
-            }
-
-            if (nested == nullptr) {
-                // lazy allocation
-                if (!scratch.allocate()) {
-                    ec = U_MEMORY_ALLOCATION_ERROR;
-                    return;
-                }
-                nested = scratch.pointer();
-            }
-            switch (setMode) {
-            case 1:
-                nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
-                break;
-            case 2:
-                chars.skipIgnored(opts);
-                nested->applyPropertyPattern(chars, patLocal, ec);
-                if (U_FAILURE(ec)) return;
-                break;
-            case 3: // `nested' already parsed
-                nested->_toPattern(patLocal, false);
-                break;
-            }
-
-            usePat = true;
-
-            if (mode == 0) {
-                // Entire pattern is a category; leave parse loop
-                *this = *nested;
-                mode = 2;
-                break;
-            }
-
-            switch (op) {
-            case u'-':
-                removeAll(*nested);
-                break;
-            case u'&':
-                retainAll(*nested);
-                break;
-            case 0:
-                addAll(*nested);
-                break;
-            }
-
-            op = 0;
-            lastItem = 2;
-
-            continue;
-        }
-
-        if (mode == 0) {
-            // syntaxError(chars, "Missing '['");
-            ec = U_MALFORMED_SET;
-            return;
-        }
-
-        // -------- Parse special (syntax) characters.  If the
-        // current character is not special, or if it is escaped,
-        // then fall through and handle it below.
-
-        if (!literal) {
-            switch (c) {
-            case u']':
-                if (lastItem == 1) {
-                    add(lastChar, lastChar);
-                    _appendToPat(patLocal, lastChar, false);
-                }
-                // Treat final trailing '-' as a literal
-                if (op == u'-') {
-                    add(op, op);
-                    patLocal.append(op);
-                } else if (op == u'&') {
-                    // syntaxError(chars, "Trailing '&'");
-                    ec = U_MALFORMED_SET;
-                    return;
-                }
-                patLocal.append(u']');
-                mode = 2;
-                continue;
-            case u'-':
-                if (op == 0) {
-                    if (lastItem != 0) {
-                        op = static_cast<char16_t>(c);
-                        continue;
-                    } else {
-                        // Treat final trailing '-' as a literal
-                        add(c, c);
-                        c = chars.next(opts, literal, ec);
-                        if (U_FAILURE(ec)) return;
-                        if (c == u']' && !literal) {
-                            patLocal.append(u"-]", 2);
-                            mode = 2;
-                            continue;
-                        }
-                    }
-                }
-                // syntaxError(chars, "'-' not after char or set");
-                ec = U_MALFORMED_SET;
-                return;
-            case u'&':
-                if (lastItem == 2 && op == 0) {
-                    op = static_cast<char16_t>(c);
-                    continue;
-                }
-                // syntaxError(chars, "'&' not after set");
-                ec = U_MALFORMED_SET;
-                return;
-            case u'^':
-                // syntaxError(chars, "'^' not after '['");
-                ec = U_MALFORMED_SET;
-                return;
-            case u'{':
-                if (op != 0) {
-                    // syntaxError(chars, "Missing operand after operator");
-                    ec = U_MALFORMED_SET;
-                    return;
-                }
-                if (lastItem == 1) {
-                    add(lastChar, lastChar);
-                    _appendToPat(patLocal, lastChar, false);
-                }
-                lastItem = 0;
-                buf.truncate(0);
-                {
-                    UBool ok = false;
-                    while (!chars.atEnd()) {
-                        c = chars.next(opts, literal, ec);
-                        if (U_FAILURE(ec)) return;
-                        if (c == u'}' && !literal) {
-                            ok = true;
-                            break;
-                        }
-                        buf.append(c);
-                    }
-                    if (!ok) {
-                        // syntaxError(chars, "Invalid multicharacter string");
-                        ec = U_MALFORMED_SET;
-                        return;
-                    }
-                }
-                // We have new string. Add it to set and continue;
-                // we don't need to drop through to the further
-                // processing
-                add(buf);
-                patLocal.append(u'{');
-                _appendToPat(patLocal, buf, false);
-                patLocal.append(u'}');
-                continue;
-            case SymbolTable::SYMBOL_REF:
-                //         symbols  nosymbols
-                // [a-$]   error    error (ambiguous)
-                // [a$]    anchor   anchor
-                // [a-$x]  var "x"* literal '$'
-                // [a-$.]  error    literal '$'
-                // *We won't get here in the case of var "x"
-                {
-                    chars.getPos(backup);
-                    c = chars.next(opts, literal, ec);
-                    if (U_FAILURE(ec)) return;
-                    UBool anchor = (c == u']' && !literal);
-                    if (symbols == nullptr && !anchor) {
-                        c = SymbolTable::SYMBOL_REF;
-                        chars.setPos(backup);
-                        break; // literal '$'
-                    }
-                    if (anchor && op == 0) {
-                        if (lastItem == 1) {
-                            add(lastChar, lastChar);
-                            _appendToPat(patLocal, lastChar, false);
-                        }
-                        add(U_ETHER);
-                        usePat = true;
-                        patLocal.append(static_cast<char16_t>(SymbolTable::SYMBOL_REF));
-                        patLocal.append(u']');
-                        mode = 2;
-                        continue;
-                    }
-                    // syntaxError(chars, "Unquoted '$'");
-                    ec = U_MALFORMED_SET;
-                    return;
-                }
-            default:
-                break;
-            }
-        }
-
-        // -------- Parse literal characters.  This includes both
-        // escaped chars ("\u4E01") and non-syntax characters
-        // ("a").
-
-        switch (lastItem) {
-        case 0:
-            lastItem = 1;
-            lastChar = c;
-            break;
-        case 1:
-            if (op == u'-') {
-                if (lastChar >= c) {
-                    // Don't allow redundant (a-a) or empty (b-a) ranges;
-                    // these are most likely typos.
-                    // syntaxError(chars, "Invalid range");
-                    ec = U_MALFORMED_SET;
-                    return;
-                }
-                add(lastChar, c);
-                _appendToPat(patLocal, lastChar, false);
-                patLocal.append(op);
-                _appendToPat(patLocal, c, false);
-                lastItem = 0;
-                op = 0;
-            } else {
-                add(lastChar, lastChar);
-                _appendToPat(patLocal, lastChar, false);
-                lastChar = c;
-            }
-            break;
-        case 2:
-            if (op != 0) {
-                // syntaxError(chars, "Set expected after operator");
-                ec = U_MALFORMED_SET;
-                return;
-            }
-            lastChar = c;
-            lastItem = 1;
-            break;
-        }
-    }
-
-    if (mode != 2) {
-        // syntaxError(chars, "Missing ']'");
-        ec = U_MALFORMED_SET;
-        return;
-    }
-
-    chars.skipIgnored(opts);
-
-    /**
-     * Handle global flags (invert, case insensitivity).  If this
-     * pattern should be compiled case-insensitive, then we need
-     * to close over case BEFORE COMPLEMENTING.  This makes
-     * patterns like /[^abc]/i work.
-     */
-    if ((options & USET_CASE_MASK) != 0) {
-        (this->*caseClosure)(options);
-    }
-    if (invert) {
-        complement().removeAllStrings();  // code point complement
-    }
-
-    // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
-    // generated pattern.
-    if (usePat) {
-        rebuiltPat.append(patLocal);
-    } else {
-        _generatePattern(rebuiltPat, false);
-    }
-    if (isBogus() && U_SUCCESS(ec)) {
-        // We likely ran out of memory. AHHH!
-        ec = U_MEMORY_ALLOCATION_ERROR;
-    }
-#endif
-
 //----------------------------------------------------------------
 // Property set implementation
 //----------------------------------------------------------------

From ff092dcabea44a42717f91dc0ef73067c2520593 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 21 Aug 2025 17:51:03 +0200
Subject: [PATCH 40/56] Remove unused variables

---
 icu4c/source/common/uniset_props.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 3d9774473c76..6706cb996a6a 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -627,7 +627,6 @@ void UnicodeSet::parseRestriction(Lexer &lexer,
     // The loop terminates because when chars.atEnd(), op == DONE, so we go into the else branch and
     // return.
     for (;;) {
-        RuleCharacterIterator::Pos beforeOperator;
         if (lexer.lookahead().standIn() != nullptr) {
             // Not an operator, end of the Restriction.
             return;
@@ -710,7 +709,6 @@ void UnicodeSet::parseElements(Lexer &lexer,
     }
     lexer.lookahead().moveAfter();
     _appendToPat(rebuiltPat, first, /*escapeUnprintable=*/false);
-    RuleCharacterIterator::Pos beforeOperator;
     if (!lexer.lookahead().isUnescapedNotStandIn(u'-')) {
         // No operator,
         // Elements ::= Element

From f0bd37b67e6760e052f7fb6921d53d9b4447d0d4 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Tue, 26 Aug 2025 20:07:09 +0200
Subject: [PATCH 41/56] Some work towards a proper lexer

---
 icu4c/source/common/uniset_props.cpp | 377 +++++++++++++++++++--------
 1 file changed, 274 insertions(+), 103 deletions(-)

diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 6706cb996a6a..c3961ca2c3ae 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -235,35 +235,31 @@ class UnicodeSet::Lexer {
                              : 0)),
           symbols_(symbols) {}
 
-    class Lookahead {
+    class LexicalElement {
       public:
-        bool isUnescaped(UChar32 codePoint) const {
-            return !escaped_ && codePoint_ == codePoint;
+        bool isPropertyQuery() const {
+            return U_SUCCESS(errorCode_) && category_ == PROPERTY_QUERY;
         }
 
-        bool isUnescapedNotStandIn(UChar32 codePoint) {
-            return isUnescaped(codePoint) && standIn() == nullptr;
+        bool isNamedElement() const {
+            return U_SUCCESS(errorCode_) && category_ == PROPERTY_QUERY;
         }
 
-        void moveAfter() {
-            lexer_.chars_.setPos(after_);
-            lexer_.ahead_.reset();
+        bool isSetOperator(const char16_t op) const {
+            return U_SUCCESS(errorCode_) && category_ == SET_OPERATOR && string_[0] == op;
         }
 
-        bool acceptUnescapedNotStandIn(UChar32 codePoint) {
-            if (isUnescapedNotStandIn(codePoint)) {
-                moveAfter();
-                return true;
-            }
-            return false;
+        bool isStringLiteral() const {
+            return U_SUCCESS(errorCode_) && category_ == STRING_LITERAL;
+        }
+
+        bool isBracketedElement() const {
+            return U_SUCCESS(errorCode_) && category_ == BRACKETED_ELEMENT;
         }
 
-        bool acceptUnescaped(UChar32 codePoint) {
-            if (isUnescaped(codePoint)) {
-                moveAfter();
-                return true;
+        std::optional<UnicodeString> element() const {
+            if (U_SUCCESS(errorCode_) && category_) {
             }
-            return false;
         }
 
         UChar32 codePoint(UErrorCode &errorCode) const {
@@ -277,50 +273,37 @@ class UnicodeSet::Lexer {
             return escaped_;
         }
 
-        const UnicodeSet *standIn() {
-            if (!standIn_.has_value()) {
-                if (lexer_.symbols_ == nullptr) {
-                    standIn_ = nullptr;
-                } else {
-                    standIn_ =
-                        dynamic_cast<const UnicodeSet *>(lexer_.symbols_->lookupMatcher(codePoint_));
-                }
+        const UnicodeSet *standIn() const {
+            if (U_FAILURE(errorCode_) || category_ != STAND_IN) {
+                return nullptr;
             }
-            return *standIn_;
+            return standIn_;
         };
 
-        // Some parts of the grammar need two tokens of lookahead.  The second lookahead is not cached.
-        Lookahead oneMore() {
-            return oneMore(lexer_.charsOptions_);
-        }
-
-        Lookahead oneMore(int32_t charsOptions) {
-            RuleCharacterIterator::Pos before;
-            lexer_.chars_.getPos(before);
-            lexer_.chars_.setPos(after_);
-            auto const result = Lookahead(lexer_, lexer_.chars_, charsOptions);
-            lexer_.chars_.setPos(before);
-            return result;
-        }
-
-        Lookahead(Lexer &lexer, RuleCharacterIterator &chars, int32_t charsOptions)
-            : lexer_(lexer) {
-            RuleCharacterIterator::Pos before;
-            chars.getPos(before);
-            codePoint_ = chars.next(charsOptions, escaped_, errorCode_);
-            chars.getPos(after_);
-            chars.setPos(before);
-        }
-
       private:
-        Lexer &lexer_;
+        // See https://unicode.org/reports/tr61#Lexical-Elements.
+        enum Category {
+            SET_OPERATOR,
+            LITERAL_ELEMENT,
+            ESCAPED_ELEMENT,
+            NAMED_ELEMENT,
+            BRACKETED_ELEMENT,
+            STRING_LITERAL,
+            PROPERTY_QUERY,
+            // ICU extension: A literal-element, escaped-element, or set-operator (but not
+            // bracketed-element) which is mapped to a set.  This may also be an unescaped '{', in which
+            // case bracketed-element and string-literal are inaccessible.
+            STAND_IN,
+        };
+        LexicalElement(Category category, UnicodeString string, RuleCharacterIterator::Pos after,
+              UErrorCode errorCode, const UnicodeSet *standIn)
+            : category_(category), string_(std::move(string)), after_(after), errorCode_(errorCode),
+              standIn_(standIn) {}
+        Category category_;
+        UnicodeString string_;
         RuleCharacterIterator::Pos after_;
         UErrorCode errorCode_;
-        UChar32 codePoint_;
-        UBool escaped_;
-        // `std::nullopt` if we have not yet called `lookupMatcher`, otherwise the result of
-        // `lookupMatcher` (which may be `nullptr`).
-        std::optional<const UnicodeSet *> standIn_;
+        const UnicodeSet *standIn_;
 
         friend class Lexer;
     };
@@ -330,32 +313,40 @@ class UnicodeSet::Lexer {
                pattern_.tempSubString(parsePosition_.getIndex(), 60);
     }
 
-    Lookahead &lookahead() {
+    const bool acceptSetOperator(char16_t op) {
+        if (lookahead().isSetOperator(op)) {
+            advance();
+            return true;
+        }
+        return false;
+    }
+
+    const LexicalElement &lookahead() {
         if (!ahead_.has_value()) {
-            ahead_.emplace(*this, chars_, charsOptions_);
+            const RuleCharacterIterator::Pos before = getPos();
+            ahead_.emplace(nextToken());
+            chars_.setPos(before);
         }
         return *ahead_;
     }
 
-    bool resemblesPropertyPattern() {
-        Lookahead first =
-            Lookahead(*this, chars_, charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES);
-        if (first.codePoint_ != u'[' && first.codePoint_ != u'\\') {
-            return false;
+    const LexicalElement &lookahead2() {
+        if (!ahead2_.has_value()) {
+            const RuleCharacterIterator::Pos before = getPos();
+            chars_.setPos(lookahead().after_);
+            ahead_.emplace(nextToken());
+            chars_.setPos(before);
         }
-        Lookahead second = first.oneMore(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
-                                                           RuleCharacterIterator::SKIP_WHITESPACE));
-        return (first.codePoint_ == u'[' && second.codePoint_ == ':') ||
-               (first.codePoint_ == u'\\' &&
-                (second.codePoint_ == u'p' || second.codePoint_ == u'P' || second.codePoint_ == u'N'));
+        return *ahead_;
     }
 
     // For use in functions that take the `RuleCharacterIterator` directly; clears the lookahead cache so
     // that any advancement of the `RuleCharacterIterator` is taken into account by subsequent calls to
-    // `lookahead`.  The resulting `RuleCharacterIterator` must not be used once `lookahead` has been
-    // called.
+    // `lookahead`.  The resulting `RuleCharacterIterator` must not be used once `lookahead` or
+    // `lookahead2` has been called again.
     RuleCharacterIterator &getCharacterIterator() {
         ahead_.reset();
+        ahead2_.reset();
         return chars_;
     }
 
@@ -367,13 +358,192 @@ class UnicodeSet::Lexer {
         return chars_.atEnd();
     }
 
+    void advance() {
+        chars_.setPos(lookahead().after_);
+        ahead_ = ahead2_;
+        ahead2_.reset();
+    }
+
   private:
+    // A version of getPos that returns its position instead of taking it as an out parameter, so we
+    // can have const positions.
+    RuleCharacterIterator::Pos getPos() const {
+        RuleCharacterIterator::Pos result;
+        chars_.getPos(result);
+        return result;
+    }
+
+    LexicalElement nextToken() {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const RuleCharacterIterator::Pos before = getPos();
+        // First try to get the next character without parsing escapes.
+        UBool unusedEscaped;
+        const UChar32 first =
+            chars_.next(charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES, unusedEscaped, errorCode);
+        // '[', named-element, and property-query cannot be disabled by stand-in.
+        if (first == u'[' || first == u'\\') {
+            // This could be a property-query or named-element.
+            const UChar32 second = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
+                                                                 RuleCharacterIterator::SKIP_WHITESPACE),
+                                               unusedEscaped, errorCode);
+            if ((first == u'[' && second == u':') ||
+                (first == u'\\' && (second == u'p' || second == u'P' || second == u'N'))) {
+                if (second == u'N') {
+                    const UChar32 third =
+                        chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
+                                                      RuleCharacterIterator::SKIP_WHITESPACE),
+                                    unusedEscaped, errorCode);
+                    if (third == u'{') {
+                        while (!chars_.atEnd()) {
+                            UChar32 last =
+                                chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
+                                                              RuleCharacterIterator::SKIP_WHITESPACE),
+                                            unusedEscaped, errorCode);
+                            if (last == u'}') {
+                                return LexicalElement(LexicalElement::NAMED_ELEMENT, {}, getPos(),
+                                                      errorCode,
+                                                      /*standIn=*/nullptr);
+                            }
+                        }
+                    }
+                    return LexicalElement(LexicalElement::NAMED_ELEMENT, {}, getPos(), U_MALFORMED_SET,
+                                          /*standIn=*/nullptr);
+                }
+                // Do not skip whitespace so we can recognize unspaced :].  Lex escapes and
+                // named-element: while ICU does not support string-valued properties and thus has no
+                // use for escapes, we still want to lex through escapes to allow downstream
+                // implementations (mostly unicodetools) to implement string-valued properties.
+                if (first == u'\\') {
+                    const UChar32 third =
+                        chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
+                                                      RuleCharacterIterator::SKIP_WHITESPACE),
+                                    unusedEscaped, errorCode);
+                    if (third != u'{') {
+                        return LexicalElement(LexicalElement::PROPERTY_QUERY, {}, getPos(),
+                                              U_MALFORMED_SET,
+                                              /*standIn=*/nullptr);
+                    }
+                }
+                RuleCharacterIterator::Pos beforePenultimate = getPos();
+                UChar32 penultimateUnescaped =
+                    chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
+                                                  RuleCharacterIterator::SKIP_WHITESPACE),
+                                unusedEscaped, errorCode);
+
+                while (!chars_.atEnd()) {
+                    const RuleCharacterIterator::Pos beforeLast = getPos();
+                    UChar32 lastUnescaped =
+                        chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
+                                                      RuleCharacterIterator::SKIP_WHITESPACE),
+                                    unusedEscaped, errorCode);
+                    if (penultimateUnescaped == u'\\') {
+                        if (lastUnescaped == 'N') {
+                            const UChar32 namedElementOpening =
+                                chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
+                                                              RuleCharacterIterator::SKIP_WHITESPACE),
+                                            unusedEscaped, errorCode);
+                            bool namedElementOK = false;
+                            if (namedElementOpening == u'{') {
+                                while (!chars_.atEnd()) {
+                                    UChar32 namedElementLast = chars_.next(
+                                        charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
+                                                          RuleCharacterIterator::SKIP_WHITESPACE),
+                                        unusedEscaped, errorCode);
+                                    if (namedElementLast == u'}') {
+                                        namedElementOK = true;
+                                    }
+                                }
+                            }
+                            if (!namedElementOK) {
+                                return LexicalElement(LexicalElement::NAMED_ELEMENT, {}, getPos(),
+                                                      U_MALFORMED_SET,
+                                                      /*standIn=*/nullptr);
+                            }
+                        } else {
+                            // There must be an escaped-element starting at beforePenultimate.  Go
+                            // back there and advance through it.
+                            chars_.setPos(beforePenultimate);
+                            chars_.next(charsOptions_ & ~RuleCharacterIterator::SKIP_WHITESPACE,
+                                        unusedEscaped, errorCode);
+                        }
+                        // Neither a named-element nor an escaped-element can be part of a closing :].
+                        lastUnescaped = -1;
+                    } else if ((first == u'[' && penultimateUnescaped == u':' &&
+                                lastUnescaped == u']') ||
+                               (first == u'\\' && lastUnescaped == u'}')) {
+                        return LexicalElement(LexicalElement::PROPERTY_QUERY, {}, getPos(), errorCode,
+                                              /*standIn=*/nullptr);
+                    }
+                    beforePenultimate = beforeLast;
+                    penultimateUnescaped = lastUnescaped;
+                }
+                return;
+            }
+        }
+        if (first == u'[') {
+            return LexicalElement(LexicalElement::SET_OPERATOR, UnicodeString(u'['), getPos(), errorCode,
+                                  /*standIn=*/nullptr);
+        }
+
+        if (first == u'\\') {
+            // Now try to parse the escape.
+            chars_.setPos(before);
+            UChar32 codePoint = chars_.next(charsOptions_, unusedEscaped, errorCode);
+            const UnicodeSet *const standIn =
+                dynamic_cast<const UnicodeSet *>(symbols_->lookupMatcher(codePoint));
+            return LexicalElement(standIn == nullptr ? LexicalElement::ESCAPED_ELEMENT
+                                                     : LexicalElement::STAND_IN,
+                                  standIn == nullptr ? UnicodeString(codePoint) : UnicodeString(),
+                                  getPos(),
+                                  errorCode, standIn);
+        }
+        if (const UnicodeSet *const standIn =
+                dynamic_cast<const UnicodeSet *>(symbols_->lookupMatcher(first));
+            standIn != nullptr) {
+            return LexicalElement(LexicalElement::STAND_IN, {}, getPos(), errorCode, standIn);
+        }
+
+        switch (first) {
+        case u'&':
+        case u'-':
+        case u'[':
+        case u']':
+        case u'^':
+        case u'$':
+            // We make $ a set-operator to handle the ICU extensions involving $.
+            return LexicalElement(LexicalElement::SET_OPERATOR, UnicodeString(first), getPos(),
+                                  errorCode,
+                                  /*standIn=*/nullptr);
+        case u'{': {
+            UnicodeString string;
+            UBool escaped;
+            UChar32 next;
+            while (!chars_.atEnd()) {
+                next = chars_.next(charsOptions_, escaped, errorCode);
+                if (!escaped && next == u'}') {
+                    return LexicalElement(string.length() == 1 ? LexicalElement::BRACKETED_ELEMENT
+                                                               : LexicalElement::STRING_LITERAL,
+                                          std::move(string), getPos(), errorCode,
+                                          /*standIn=*/nullptr);
+                }
+                string.append(next);
+            }
+            return LexicalElement(LexicalElement::STRING_LITERAL, {}, getPos(), U_MALFORMED_SET,
+                                  /*standIn=*/nullptr);
+        }
+        default:
+            return LexicalElement(LexicalElement::LITERAL_ELEMENT, UnicodeString(first), getPos(),
+                                  errorCode, nullptr);
+        }
+    }
+
     const UnicodeString &pattern_;
     const ParsePosition &parsePosition_;
     RuleCharacterIterator &chars_;
     const int32_t charsOptions_;
     const SymbolTable *const symbols_;
-    std::optional<Lookahead> ahead_;
+    std::optional<LexicalElement> ahead_;
+    std::optional<LexicalElement> ahead2_;
 };
 
 namespace {
@@ -480,8 +650,11 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer,
     bool preserveSyntaxInPattern = false;
     // A pattern that preserves the original syntax but strips spaces, normalizes escaping, etc.
     UnicodeString prettyPrintedPattern;
-    if (lexer.resemblesPropertyPattern()) {
+    if (lexer.lookahead().isPropertyQuery() || lexer.lookahead().isNamedElement()) {
         // UnicodeSet ::= property-query | named-element
+        // NOTE(egg): For now, we throw away the work that the lexer did to find out where the
+        // property-query or named-element ended in order to retain the existing buggy behaviour of
+        // variables containing property queries.
         lexer.getCharacterIterator().skipIgnored(lexer.charsOptions());
         UnicodeSet propertyQuery;
         propertyQuery.applyPropertyPattern(lexer.getCharacterIterator(), prettyPrintedPattern, ec);
@@ -493,31 +666,30 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer,
         // UnicodeSet ::=                [   Union ]
         //              | Complement ::= [ ^ Union ]
         // Extension:
-        //              | MatcherSymbol
-        // Where a MatcherSymbol may be a character or an escape.
-        // Strings that would match MatcherSymbol effectively get removed from
+        //              | stand-in
+        // Where a stand-in may be a character or an escape.
+        // Strings that would match stand-in effectively get removed from
         // all other terminals of the grammar, except [.
-        if (lexer.lookahead().acceptUnescaped(u'[')) {
+        if (lexer.acceptSetOperator(u'[')) {
             prettyPrintedPattern.append(u'[');
-            if (lexer.lookahead().acceptUnescaped(u'^')) {
+            if (lexer.acceptSetOperator(u'^')) {
                 prettyPrintedPattern.append(u'^');
                 isComplement = true;
             }
             parseUnion(lexer, prettyPrintedPattern, options, caseClosure, depth,
                        /*containsRestrictions=*/preserveSyntaxInPattern, ec);
             U_UNICODESET_RETURN_IF_ERROR(ec);
-            if (!lexer.lookahead().acceptUnescaped(u']')) {
+            if (!lexer.acceptSetOperator(u']')) {
                 U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", lexer.lookahead().codePoint(ec), lexer, ec);
             }
             prettyPrintedPattern.append(u']');
-        } else {
-            const UnicodeSet *set = lexer.lookahead().standIn();
-            if (set != nullptr) {
-                *this = *set;
-                this->_toPattern(rebuiltPat, /*escapeUnprintable=*/false);
-                lexer.lookahead().moveAfter();
+        } else if (const UnicodeSet *const standIn = lexer.lookahead().standIn();
+                   standIn != nullptr) {
+                *this = *standIn;
+            this->_toPattern(rebuiltPat, /*escapeUnprintable=*/false);
+            lexer.advance();
                 return;
-            }
+        } else {
             U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)",
                                                  lexer.lookahead().codePoint(ec), lexer, ec);
         }
@@ -555,33 +727,32 @@ void UnicodeSet::parseUnion(Lexer &lexer,
     //         | UnescapedHyphenMinus Terms UnescapedHyphenMinus
     // Terms ::= ""
     //         | Terms Term
-    if (lexer.lookahead().acceptUnescapedNotStandIn(u'-')) {
+    if (lexer.acceptSetOperator(u'-')) {
         add(u'-');
         // When we otherwise preserve the syntax, we escape an initial UnescapedHyphenMinus, but not a
         // final one, for consistency with older ICU behaviour.
         rebuiltPat.append(u"\\-");
     }
     while (!lexer.atEnd()) {
-        if (lexer.lookahead().acceptUnescapedNotStandIn(u'-')) {
+        if (lexer.acceptSetOperator(u'-')) {
             // We can be here on the first iteration: [--] is allowed by the
             // grammar and by the old parser.
             rebuiltPat.append(u'-');
             add(u'-');
             return;
-        } else if (lexer.lookahead().isUnescapedNotStandIn(u'$')) {
-            Lexer::Lookahead afterDollar = lexer.lookahead().oneMore();
-            if (afterDollar.isUnescaped(u']')) {
+        } else if (lexer.lookahead().isSetOperator(u'$')) {
+            if (lexer.lookahead2().isSetOperator(u']')) {
                 // ICU extensions: A $ is allowed as a literal-element.
                 // A Term at the end of a Union consisting of a single $ is an anchor.
                 rebuiltPat.append(u'$');
                 // Consume the dollar.
-                lexer.lookahead().moveAfter();
+                lexer.advance();
                 add(U_ETHER);
                 containsRestrictions = true;
                 return;
             }
         }
-        if (lexer.lookahead().isUnescapedNotStandIn(u']')) {
+        if (lexer.lookahead().isSetOperator(u']')) {
             return;
         }
         parseTerm(lexer, rebuiltPat, options, caseClosure, depth, containsRestrictions, ec);
@@ -598,8 +769,8 @@ void UnicodeSet::parseTerm(Lexer &lexer,
                            UErrorCode &ec) {
     // Term ::= Elements
     //        | Restriction
-    if (lexer.lookahead().standIn() != nullptr || lexer.lookahead().isUnescaped('[') ||
-        lexer.resemblesPropertyPattern()) {
+    if (lexer.lookahead().standIn() != nullptr || lexer.lookahead().isSetOperator('[') ||
+        lexer.lookahead().isPropertyQuery() || lexer.lookahead().isNamedElement()) {
         containsRestriction = true;
         parseRestriction(lexer, rebuiltPat, options, caseClosure, depth, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
@@ -627,26 +798,22 @@ void UnicodeSet::parseRestriction(Lexer &lexer,
     // The loop terminates because when chars.atEnd(), op == DONE, so we go into the else branch and
     // return.
     for (;;) {
-        if (lexer.lookahead().standIn() != nullptr) {
-            // Not an operator, end of the Restriction.
-            return;
-        }
-        if (lexer.lookahead().acceptUnescaped(u'&')) {
+        if (lexer.acceptSetOperator(u'&')) {
             // Intersection ::= Restriction & UnicodeSet
             rebuiltPat.append(u'&');
             UnicodeSet rightHandSide;
             rightHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec);
             U_UNICODESET_RETURN_IF_ERROR(ec);
             retainAll(rightHandSide);
-        } else if (lexer.lookahead().isUnescaped(u'-')) {
+        } else if (lexer.lookahead().isSetOperator(u'-')) {
             // Here the grammar requires two tokens of lookahead to figure out whether the - the operator
             // of a Difference or an UnescapedHyphenMinus in the enclosing Union.
-            if (lexer.lookahead().oneMore().isUnescaped(u']')) {
+            if (lexer.lookahead2().isSetOperator(u']')) {
                 // The operator is actually an UnescapedHyphenMinus; terminate the Restriction before it.
                 return;
             }
             // Consume the hyphen-minus.
-            lexer.lookahead().moveAfter();
+            lexer.advance();
             // Difference ::= Restriction - UnicodeSet
             rebuiltPat.append(u'-');
             UnicodeSet rightHandSide;
@@ -672,6 +839,10 @@ void UnicodeSet::parseElements(Lexer &lexer,
     //                | escaped-element
     // Element      ::= RangeElement
     //                | string-literal
+    //                | bracketed-element
+    if (lexer.lookahead().isBracketedElement() || lexer.lookahead().isStringLiteral()) {
+        add(lexer.lookahead().)
+    }
     const UChar32 first = lexer.lookahead().codePoint(ec);
     U_UNICODESET_RETURN_IF_ERROR(ec);
     if (!lexer.lookahead().escaped()) {

From b78c0ce1364c3c9d727b722082c3970a25efabb0 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 27 Aug 2025 13:45:46 +0200
Subject: [PATCH 42/56] A proper lexer

---
 icu4c/source/common/uniset_props.cpp | 315 ++++++++++++++++-----------
 1 file changed, 184 insertions(+), 131 deletions(-)

diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index c3961ca2c3ae..63356dcc2b11 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -18,6 +18,9 @@
 *   Character property dependent functions moved here from uniset.cpp
 */
 
+#include <array>
+#include <optional>
+
 #include "unicode/utypes.h"
 #include "unicode/uniset.h"
 #include "unicode/parsepos.h"
@@ -44,7 +47,6 @@
 #include "umutex.h"
 #include "uassert.h"
 #include "hash.h"
-#include <optional>
 
 U_NAMESPACE_USE
 
@@ -242,7 +244,7 @@ class UnicodeSet::Lexer {
         }
 
         bool isNamedElement() const {
-            return U_SUCCESS(errorCode_) && category_ == PROPERTY_QUERY;
+            return U_SUCCESS(errorCode_) && category_ == NAMED_ELEMENT;
         }
 
         bool isSetOperator(const char16_t op) const {
@@ -258,19 +260,20 @@ class UnicodeSet::Lexer {
         }
 
         std::optional<UnicodeString> element() const {
-            if (U_SUCCESS(errorCode_) && category_) {
+            if (U_SUCCESS(errorCode_) &&
+                (category_ == LITERAL_ELEMENT || category_ == ESCAPED_ELEMENT ||
+                 category_ == BRACKETED_ELEMENT || category_ == STRING_LITERAL)) {
+                return string_;
             }
+            return std::nullopt;
         }
 
-        UChar32 codePoint(UErrorCode &errorCode) const {
-            if (!U_FAILURE(errorCode)) {
-                errorCode = errorCode;
+        std::optional<UChar32> codePoint() const {
+            if (U_SUCCESS(errorCode_) && (category_ == LITERAL_ELEMENT || category_ == ESCAPED_ELEMENT ||
+                                          category_ == BRACKETED_ELEMENT)) {
+                return string_.char32At(0);
             }
-            return codePoint_;
-        }
-
-        bool escaped() const {
-            return escaped_;
+            return std::nullopt;
         }
 
         const UnicodeSet *standIn() const {
@@ -278,11 +281,29 @@ class UnicodeSet::Lexer {
                 return nullptr;
             }
             return standIn_;
-        };
+        }
+
+        const UErrorCode& errorCode() const{
+          return errorCode_;
+        }
+
+        UnicodeString debugString() const {
+            UnicodeString result;
+            if (U_FAILURE(errorCode_)) {
+                result.append(u"Ill-formed token (");
+                result.append(UnicodeString::fromUTF8(u_errorName(errorCode_)));
+                result.append(u"), possibly ");
+            }
+            result.append(category_names_[category_]);
+            result.append(u" '");
+            result.append(sourceText_);
+            result.append(u"'");
+            return result;
+        }
 
       private:
         // See https://unicode.org/reports/tr61#Lexical-Elements.
-        enum Category {
+        enum Category : std::uint8_t {
             SET_OPERATOR,
             LITERAL_ELEMENT,
             ESCAPED_ELEMENT,
@@ -295,15 +316,26 @@ class UnicodeSet::Lexer {
             // case bracketed-element and string-literal are inaccessible.
             STAND_IN,
         };
+        static constexpr std::array<std::u16string_view, 8> category_names_{{
+            u"set-operator",
+            u"literal-element",
+            u"escaped-element",
+            u"named-element",
+            u"bracketed-element",
+            u"string-literal",
+            u"property-query",
+            u"stand-in",
+        }};
         LexicalElement(Category category, UnicodeString string, RuleCharacterIterator::Pos after,
-              UErrorCode errorCode, const UnicodeSet *standIn)
+                       UErrorCode errorCode, const UnicodeSet *standIn, std::u16string_view sourceText)
             : category_(category), string_(std::move(string)), after_(after), errorCode_(errorCode),
-              standIn_(standIn) {}
+              standIn_(standIn), sourceText_(sourceText) {}
         Category category_;
         UnicodeString string_;
         RuleCharacterIterator::Pos after_;
         UErrorCode errorCode_;
         const UnicodeSet *standIn_;
+        std::u16string_view sourceText_;
 
         friend class Lexer;
     };
@@ -332,21 +364,27 @@ class UnicodeSet::Lexer {
 
     const LexicalElement &lookahead2() {
         if (!ahead2_.has_value()) {
+            // Note that if someone has called `getCharacterIterator` and played with the result,
+            // `before` may not actually be before `ahead_`, but we do not actually depend on this here,
+            // since we start from ahead_.after_.
             const RuleCharacterIterator::Pos before = getPos();
             chars_.setPos(lookahead().after_);
-            ahead_.emplace(nextToken());
+            ahead2_.emplace(nextToken());
             chars_.setPos(before);
         }
-        return *ahead_;
+        return *ahead2_;
     }
 
-    // For use in functions that take the `RuleCharacterIterator` directly; clears the lookahead cache so
-    // that any advancement of the `RuleCharacterIterator` is taken into account by subsequent calls to
-    // `lookahead`.  The resulting `RuleCharacterIterator` must not be used once `lookahead` or
-    // `lookahead2` has been called again.
+    // For use in older functions that take the `RuleCharacterIterator` directly.
+    // Any advancement of the resulting `RuleCharacterIterator` has no effect on the result of subsequent
+    // calls to `lookahead`, `lookahead2`, `advance`, or `acceptSetOperator`.
+    // Once `advance` or `acceptSetOperator` has been called, the result of a call to
+    // `getCharacterIterator` preceding the call to `advance` or `acceptSetOperator` must no longer be
+    // used.
     RuleCharacterIterator &getCharacterIterator() {
-        ahead_.reset();
-        ahead2_.reset();
+        // Make sure we compute a correct `ahead_.after_` so we do not depend on the current value of
+        // `getPos()` for lexing.
+        lookahead();
         return chars_;
     }
 
@@ -359,6 +397,10 @@ class UnicodeSet::Lexer {
     }
 
     void advance() {
+        // If someone called `getCharacterIterator`, we are now changing the character iterator under
+        // their feet; further, we may not have an `ahead_`, so if they keep playing with it we would be
+        // working on incorrect values of `getPos`.  This is why the result of `getCharacterIterator`
+        // must no longer be used.
         chars_.setPos(lookahead().after_);
         ahead_ = ahead2_;
         ahead2_.reset();
@@ -375,6 +417,7 @@ class UnicodeSet::Lexer {
 
     LexicalElement nextToken() {
         UErrorCode errorCode = U_ZERO_ERROR;
+        const int32_t start = parsePosition_.getIndex();
         const RuleCharacterIterator::Pos before = getPos();
         // First try to get the next character without parsing escapes.
         UBool unusedEscaped;
@@ -382,6 +425,7 @@ class UnicodeSet::Lexer {
             chars_.next(charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES, unusedEscaped, errorCode);
         // '[', named-element, and property-query cannot be disabled by stand-in.
         if (first == u'[' || first == u'\\') {
+            const RuleCharacterIterator::Pos afterFirst = getPos();
             // This could be a property-query or named-element.
             const UChar32 second = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
                                                                  RuleCharacterIterator::SKIP_WHITESPACE),
@@ -402,12 +446,16 @@ class UnicodeSet::Lexer {
                             if (last == u'}') {
                                 return LexicalElement(LexicalElement::NAMED_ELEMENT, {}, getPos(),
                                                       errorCode,
-                                                      /*standIn=*/nullptr);
+                                                      /*standIn=*/nullptr,
+                                                      std::u16string_view(pattern_).substr(
+                                                          start, parsePosition_.getIndex() - start));
                             }
                         }
                     }
-                    return LexicalElement(LexicalElement::NAMED_ELEMENT, {}, getPos(), U_MALFORMED_SET,
-                                          /*standIn=*/nullptr);
+                    return LexicalElement(
+                        LexicalElement::NAMED_ELEMENT, {}, getPos(), U_ILLEGAL_ARGUMENT_ERROR,
+                        /*standIn=*/nullptr,
+                        std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
                 }
                 // Do not skip whitespace so we can recognize unspaced :].  Lex escapes and
                 // named-element: while ICU does not support string-valued properties and thus has no
@@ -420,8 +468,10 @@ class UnicodeSet::Lexer {
                                     unusedEscaped, errorCode);
                     if (third != u'{') {
                         return LexicalElement(LexicalElement::PROPERTY_QUERY, {}, getPos(),
-                                              U_MALFORMED_SET,
-                                              /*standIn=*/nullptr);
+                                              U_ILLEGAL_ARGUMENT_ERROR,
+                                              /*standIn=*/nullptr,
+                                              std::u16string_view(pattern_).substr(
+                                                  start, parsePosition_.getIndex() - start));
                     }
                 }
                 RuleCharacterIterator::Pos beforePenultimate = getPos();
@@ -456,8 +506,10 @@ class UnicodeSet::Lexer {
                             }
                             if (!namedElementOK) {
                                 return LexicalElement(LexicalElement::NAMED_ELEMENT, {}, getPos(),
-                                                      U_MALFORMED_SET,
-                                                      /*standIn=*/nullptr);
+                                                      U_ILLEGAL_ARGUMENT_ERROR,
+                                                      /*standIn=*/nullptr,
+                                                      std::u16string_view(pattern_).substr(
+                                                          start, parsePosition_.getIndex() - start));
                             }
                         } else {
                             // There must be an escaped-element starting at beforePenultimate.  Go
@@ -471,18 +523,27 @@ class UnicodeSet::Lexer {
                     } else if ((first == u'[' && penultimateUnescaped == u':' &&
                                 lastUnescaped == u']') ||
                                (first == u'\\' && lastUnescaped == u'}')) {
-                        return LexicalElement(LexicalElement::PROPERTY_QUERY, {}, getPos(), errorCode,
-                                              /*standIn=*/nullptr);
+                        return LexicalElement(
+                            LexicalElement::PROPERTY_QUERY, {}, getPos(), errorCode,
+                            /*standIn=*/nullptr,
+                            std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
                     }
                     beforePenultimate = beforeLast;
                     penultimateUnescaped = lastUnescaped;
                 }
-                return;
+                return LexicalElement(
+                    LexicalElement::PROPERTY_QUERY, {}, getPos(), U_ILLEGAL_ARGUMENT_ERROR,
+                    /*standIn=*/nullptr,
+                    std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
             }
+            // Not a property-query.
+            chars_.setPos(afterFirst);
         }
         if (first == u'[') {
-            return LexicalElement(LexicalElement::SET_OPERATOR, UnicodeString(u'['), getPos(), errorCode,
-                                  /*standIn=*/nullptr);
+            return LexicalElement(
+                LexicalElement::SET_OPERATOR, UnicodeString(u'['), getPos(), errorCode,
+                /*standIn=*/nullptr,
+                std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
         }
 
         if (first == u'\\') {
@@ -490,50 +551,62 @@ class UnicodeSet::Lexer {
             chars_.setPos(before);
             UChar32 codePoint = chars_.next(charsOptions_, unusedEscaped, errorCode);
             const UnicodeSet *const standIn =
-                dynamic_cast<const UnicodeSet *>(symbols_->lookupMatcher(codePoint));
-            return LexicalElement(standIn == nullptr ? LexicalElement::ESCAPED_ELEMENT
-                                                     : LexicalElement::STAND_IN,
-                                  standIn == nullptr ? UnicodeString(codePoint) : UnicodeString(),
-                                  getPos(),
-                                  errorCode, standIn);
+                symbols_ == nullptr
+                    ? nullptr
+                    : dynamic_cast<const UnicodeSet *>(symbols_->lookupMatcher(codePoint));
+            return LexicalElement(
+                standIn == nullptr ? LexicalElement::ESCAPED_ELEMENT : LexicalElement::STAND_IN,
+                standIn == nullptr ? UnicodeString(codePoint) : UnicodeString(), getPos(), errorCode,
+                standIn, std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
         }
-        if (const UnicodeSet *const standIn =
+        if (symbols_ != nullptr) {
+            const UnicodeSet *const standIn =
                 dynamic_cast<const UnicodeSet *>(symbols_->lookupMatcher(first));
-            standIn != nullptr) {
-            return LexicalElement(LexicalElement::STAND_IN, {}, getPos(), errorCode, standIn);
+            if (standIn != nullptr) {
+                return LexicalElement(
+                    LexicalElement::STAND_IN, {}, getPos(), errorCode, standIn,
+                    std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
+            }
         }
 
         switch (first) {
         case u'&':
         case u'-':
-        case u'[':
         case u']':
         case u'^':
         case u'$':
             // We make $ a set-operator to handle the ICU extensions involving $.
-            return LexicalElement(LexicalElement::SET_OPERATOR, UnicodeString(first), getPos(),
-                                  errorCode,
-                                  /*standIn=*/nullptr);
+            return LexicalElement(
+                LexicalElement::SET_OPERATOR, UnicodeString(first), getPos(), errorCode,
+                /*standIn=*/nullptr,
+                std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
         case u'{': {
             UnicodeString string;
             UBool escaped;
             UChar32 next;
+            int32_t codePointCount = 0;
             while (!chars_.atEnd()) {
                 next = chars_.next(charsOptions_, escaped, errorCode);
                 if (!escaped && next == u'}') {
-                    return LexicalElement(string.length() == 1 ? LexicalElement::BRACKETED_ELEMENT
-                                                               : LexicalElement::STRING_LITERAL,
-                                          std::move(string), getPos(), errorCode,
-                                          /*standIn=*/nullptr);
+                    return LexicalElement(
+                        codePointCount == 1 ? LexicalElement::BRACKETED_ELEMENT
+                                            : LexicalElement::STRING_LITERAL,
+                        std::move(string), getPos(), errorCode,
+                        /*standIn=*/nullptr,
+                        std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
                 }
                 string.append(next);
+                codePointCount += 1;
             }
-            return LexicalElement(LexicalElement::STRING_LITERAL, {}, getPos(), U_MALFORMED_SET,
-                                  /*standIn=*/nullptr);
+            return LexicalElement(
+                LexicalElement::STRING_LITERAL, {}, getPos(), U_MALFORMED_SET,
+                /*standIn=*/nullptr,
+                std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
         }
         default:
-            return LexicalElement(LexicalElement::LITERAL_ELEMENT, UnicodeString(first), getPos(),
-                                  errorCode, nullptr);
+            return LexicalElement(
+                LexicalElement::LITERAL_ELEMENT, UnicodeString(first), getPos(), errorCode, nullptr,
+                std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
         }
     }
 
@@ -550,7 +623,7 @@ namespace {
 
 constexpr int32_t MAX_DEPTH = 100;
 
-#define U_DEBUGGING_UNICODESET_PARSING 0
+#define U_DEBUGGING_UNICODESET_PARSING 1
 
 #if U_DEBUGGING_UNICODESET_PARSING
 
@@ -573,11 +646,15 @@ constexpr int32_t MAX_DEPTH = 100;
         static_assert(functionName.substr(0, 5) == "parse");                                            \
         std::string actualUTF8;                                                                         \
         std::string contextUTF8;                                                                        \
-        printf("*** Expected %s, got '%s' %s\n", (expected),                                            \
+        printf("*** Expected %s, got %s %s\n", (expected),                                              \
                UnicodeString(actual).toUTF8String(actualUTF8).c_str(),                                  \
                lexer.getPositionForDebugging().toUTF8String(contextUTF8).c_str());                      \
         printf("--- in %s l. %d\n", __func__ + 5, __LINE__);                                            \
-        (ec) = U_MALFORMED_SET;                                                                         \
+        if (U_FAILURE(lexer.lookahead().errorCode())) {                                                 \
+            (ec) = lexer.lookahead().errorCode();                                                       \
+        } else {                                                                                        \
+            (ec) = U_MALFORMED_SET;                                                                     \
+        }                                                                                               \
         return;                                                                                         \
     } while (false)
 
@@ -591,7 +668,11 @@ constexpr int32_t MAX_DEPTH = 100;
     } while (false)
 #define U_UNICODESET_RETURN_WITH_PARSE_ERROR(expected, actual, lexer, ec)                               \
     do {                                                                                                \
-        (ec) = U_MALFORMED_SET;                                                                         \
+        if (U_FAILURE(lexer.lookahead().errorCode())) {                                                 \
+            (ec) = lexer.lookahead().errorCode();                                                       \
+        } else {                                                                                        \
+            (ec) = U_MALFORMED_SET;                                                                     \
+        }                                                                                               \
         return;                                                                                         \
     } while (false)
 
@@ -652,13 +733,16 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer,
     UnicodeString prettyPrintedPattern;
     if (lexer.lookahead().isPropertyQuery() || lexer.lookahead().isNamedElement()) {
         // UnicodeSet ::= property-query | named-element
-        // NOTE(egg): For now, we throw away the work that the lexer did to find out where the
+        // NOTE(egg): For now, we ignore the work that the lexer did to find out where the
         // property-query or named-element ended in order to retain the existing buggy behaviour of
         // variables containing property queries.
         lexer.getCharacterIterator().skipIgnored(lexer.charsOptions());
         UnicodeSet propertyQuery;
         propertyQuery.applyPropertyPattern(lexer.getCharacterIterator(), prettyPrintedPattern, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
+        // But now, we go back to our lexing and advance through the property-query or named-element as
+        // lexed.  If there was no error, the old and the new code should agree on the extent.
+        lexer.advance();
         addAll(propertyQuery);
         preserveSyntaxInPattern = true;
     } else {
@@ -680,7 +764,7 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer,
                        /*containsRestrictions=*/preserveSyntaxInPattern, ec);
             U_UNICODESET_RETURN_IF_ERROR(ec);
             if (!lexer.acceptSetOperator(u']')) {
-                U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", lexer.lookahead().codePoint(ec), lexer, ec);
+                U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", lexer.lookahead().debugString(), lexer, ec);
             }
             prettyPrintedPattern.append(u']');
         } else if (const UnicodeSet *const standIn = lexer.lookahead().standIn();
@@ -691,7 +775,8 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer,
                 return;
         } else {
             U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)",
-                                                 lexer.lookahead().codePoint(ec), lexer, ec);
+                                                 lexer.lookahead().debugString(), lexer,
+                                                 ec);
         }
     }
 
@@ -841,46 +926,27 @@ void UnicodeSet::parseElements(Lexer &lexer,
     //                | string-literal
     //                | bracketed-element
     if (lexer.lookahead().isBracketedElement() || lexer.lookahead().isStringLiteral()) {
-        add(lexer.lookahead().)
+        add(*lexer.lookahead().element());
+        rebuiltPat.append(u'{');
+        _appendToPat(rebuiltPat, *lexer.lookahead().element(), /*escapeUnprintable=*/false);
+        rebuiltPat.append(u'}');
+        lexer.advance();
+        return;
     }
-    const UChar32 first = lexer.lookahead().codePoint(ec);
-    U_UNICODESET_RETURN_IF_ERROR(ec);
-    if (!lexer.lookahead().escaped()) {
-        switch (first) {
-        case u'-':
-        case u'&':
-        case u'[':
-        case u']':
-        case u'^':
-            U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement | string-literal", first, lexer, ec);
-        case u'{': {
-            lexer.lookahead().moveAfter();
-            rebuiltPat.append(u'{');
-            UnicodeString string;
-            while (!lexer.atEnd()) {
-                if (lexer.lookahead().acceptUnescaped('}')) {
-                    rebuiltPat.append(u'}');
-                    add(string);
-                    return;
-                }
-                const UChar32 c = lexer.lookahead().codePoint(ec);
-                U_UNICODESET_RETURN_IF_ERROR(ec);
-                lexer.lookahead().moveAfter();
-                _appendToPat(rebuiltPat, c, /*escapeUnprintable=*/false);
-                string.append(c);
-            }
-            U_UNICODESET_RETURN_WITH_PARSE_ERROR("}", RuleCharacterIterator::DONE, lexer, ec);
-        }
-        case u'}':
-        case u'$':
-            // Disallowed by UTS #61, but historically accepted by ICU.  This is an extension.
-        default:
-            break;
-        }
+    UChar32 first;
+    if (lexer.lookahead().isSetOperator(u'$')) {
+        // Disallowed by UTS #61, but historically accepted by ICU.  This is an extension.
+        first = u'$';
+    } else if (lexer.lookahead().codePoint().has_value()) {
+        first = *lexer.lookahead().codePoint();
+    } else {
+        U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement | string-literal | bracketed-element",
+                                             lexer.lookahead().debugString(),
+                                             lexer, ec);
     }
-    lexer.lookahead().moveAfter();
+    lexer.advance();
     _appendToPat(rebuiltPat, first, /*escapeUnprintable=*/false);
-    if (!lexer.lookahead().isUnescapedNotStandIn(u'-')) {
+    if (!lexer.lookahead().isSetOperator(u'-')) {
         // No operator,
         // Elements ::= Element
         add(first);
@@ -888,50 +954,37 @@ void UnicodeSet::parseElements(Lexer &lexer,
     }
     // Here the grammar requires two tokens of lookahead to figure out whether the - the operator
     // of a Range or an UnescapedHyphenMinus in the enclosing Union.
-    if (lexer.lookahead().oneMore().isUnescaped(u']')) {
+    if (lexer.lookahead2().isSetOperator(u']')) {
         // The operator is actually an UnescapedHyphenMinus; terminate the Elements before it.
         add(first);
         return;
     }
     // Consume the hyphen-minus.
-    lexer.lookahead().moveAfter();
+    lexer.advance();
     // Elements ::= Range ::= RangeElement - RangeElement
     rebuiltPat.append(u'-');
-    const UChar32 last = lexer.lookahead().codePoint(ec);
-    U_UNICODESET_RETURN_IF_ERROR(ec);
-    if (lexer.lookahead().standIn() != nullptr) {
-        U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, lexer, ec);
-    }
-    if (!lexer.lookahead().escaped()) {
-        switch (last) {
-        case u'-':
-        case u'&':
-        case u'[':
-        case u']':
-        case u'^':
-        case u'{':
-            U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, lexer, ec);
-        case u'$': {
-            // Disallowed by UTS #61, but historically accepted by ICU except at the end of a Union.
-            // This is an extension.
-            if (lexer.lookahead().oneMore().isUnescaped(u']')) {
-                U_UNICODESET_RETURN_WITH_PARSE_ERROR("Term after Range ending in unescaped $", u']',
-                                                     lexer, ec);
-            }
-            break;
-        }
-        case u'}':
-            // Disallowed by UTS #61, but historically accepted by ICU.  This is an extension.
-        default:
-            break;
+    UChar32 last;
+    if (lexer.lookahead().isSetOperator(u'$')) {
+        // Disallowed by UTS #61, but historically accepted by ICU except at the end of a Union.
+        // This is an extension.
+        last = u'$';
+        if (lexer.lookahead2().isSetOperator(u']')) {
+            U_UNICODESET_RETURN_WITH_PARSE_ERROR("Term after Range ending in unescaped $",
+                                                 lexer.lookahead().debugString() + u" followed by " +
+                                                     lexer.lookahead2().debugString(),
+                                                 lexer, ec);
         }
+    } else if (lexer.lookahead().codePoint().has_value()) {
+        last = *lexer.lookahead().codePoint();
+    } else {
+        U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", lexer.lookahead().debugString(), lexer, ec);
     }
-    lexer.lookahead().moveAfter();
-    _appendToPat(rebuiltPat, last, /*escapeUnprintable=*/false);
     if (last <= first) {
         U_UNICODESET_RETURN_WITH_PARSE_ERROR(
             "first < last in Range", UnicodeString(last) + u"-" + UnicodeString(first), lexer, ec);
     }
+    lexer.advance();
+    _appendToPat(rebuiltPat, last, /*escapeUnprintable=*/false);
     add(first, last);
     return;
 }

From d61b09076bcb1cc2b9728d13143a337dddaa76c5 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 27 Aug 2025 13:54:37 +0200
Subject: [PATCH 43/56] =?UTF-8?q?Don=E2=80=99t=20report=20end=20of=20text?=
 =?UTF-8?q?=20as=20a=20literal-element?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 icu4c/source/common/unicode/uniset.h |  3 ---
 icu4c/source/common/uniset_props.cpp | 36 +++++++---------------------
 2 files changed, 8 insertions(+), 31 deletions(-)

diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h
index d805fd9e8156..538eb264e974 100644
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -1799,9 +1799,6 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter {
     static UBool resemblesPropertyPattern(const UnicodeString& pattern,
                                           int32_t pos);
 
-    static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
-                                          int32_t iterOpts);
-
     /**
      * Parse the given property pattern at the given parse position
      * and set this UnicodeSet to the result.
diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 63356dcc2b11..0de278955e74 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -315,8 +315,9 @@ class UnicodeSet::Lexer {
             // bracketed-element) which is mapped to a set.  This may also be an unescaped '{', in which
             // case bracketed-element and string-literal are inaccessible.
             STAND_IN,
+            END_OF_TEXT,
         };
-        static constexpr std::array<std::u16string_view, 8> category_names_{{
+        static constexpr std::array<std::u16string_view, 9> category_names_{{
             u"set-operator",
             u"literal-element",
             u"escaped-element",
@@ -325,6 +326,7 @@ class UnicodeSet::Lexer {
             u"string-literal",
             u"property-query",
             u"stand-in",
+            u"(end of text)",
         }};
         LexicalElement(Category category, UnicodeString string, RuleCharacterIterator::Pos after,
                        UErrorCode errorCode, const UnicodeSet *standIn, std::u16string_view sourceText)
@@ -417,6 +419,11 @@ class UnicodeSet::Lexer {
 
     LexicalElement nextToken() {
         UErrorCode errorCode = U_ZERO_ERROR;
+        chars_.skipIgnored(charsOptions_);
+        if (chars_.atEnd()) {
+            return LexicalElement(LexicalElement::END_OF_TEXT, {}, getPos(), errorCode,
+                                  /*standIn=*/nullptr, u"");
+        }
         const int32_t start = parsePosition_.getIndex();
         const RuleCharacterIterator::Pos before = getPos();
         // First try to get the next character without parsing escapes.
@@ -1337,33 +1344,6 @@ UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
     return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
 }
 
-/**
- * Return true if the given iterator appears to point at a
- * property pattern.  Regardless of the result, return with the
- * iterator unchanged.
- * @param chars iterator over the pattern characters.  Upon return
- * it will be unchanged.
- * @param iterOpts RuleCharacterIterator options
- */
-UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
-                                           int32_t iterOpts) {
-    // NOTE: literal will always be false, because we don't parse escapes.
-    UBool result = false, literal;
-    UErrorCode ec = U_ZERO_ERROR;
-    iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
-    RuleCharacterIterator::Pos pos;
-    chars.getPos(pos);
-    UChar32 c = chars.next(iterOpts, literal, ec);
-    if (c == u'[' || c == u'\\') {
-        UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
-                               literal, ec);
-        result = (c == u'[') ? (d == u':') :
-                               (d == u'N' || d == u'p' || d == u'P');
-    }
-    chars.setPos(pos);
-    return result && U_SUCCESS(ec);
-}
-
 /**
  * Parse the given property pattern at the given parse position.
  */

From 40460d9dbd5cfbdaa393c9e7f302a7d840e8a2af Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 27 Aug 2025 13:57:57 +0200
Subject: [PATCH 44/56] Turn off traces

---
 icu4c/source/common/uniset_props.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 0de278955e74..65df7a005384 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -630,7 +630,7 @@ namespace {
 
 constexpr int32_t MAX_DEPTH = 100;
 
-#define U_DEBUGGING_UNICODESET_PARSING 1
+#define U_DEBUGGING_UNICODESET_PARSING 0
 
 #if U_DEBUGGING_UNICODESET_PARSING
 

From e39c4d1e4ece506b8719e592c630fba46635545e Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 27 Aug 2025 14:39:21 +0200
Subject: [PATCH 45/56] ICU-23179 Test more edge cases when mapping syntax
 characters to sets

---
 icu4c/source/test/intltest/usettest.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 3a97c7a4db01..841e94cc8030 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -2044,6 +2044,9 @@ void UnicodeSetTest::TestLookupSymbolTable() {
     symbols.add(u'{', UnicodeSet(u"[{leftCurlyBracket}]", errorCode));
     symbols.add(u'}', UnicodeSet(u"[{rightCurlyBracket}]", errorCode));
     symbols.add(u'$', UnicodeSet(u"[{dollarSign}]", errorCode));
+    symbols.add(u':', UnicodeSet(u"[{colon}]", errorCode));
+    symbols.add(u'\\', UnicodeSet(u"[{reverseSolidus}]", errorCode));
+    symbols.add(u'p', UnicodeSet(u"[{latinSmallLetterP}]", errorCode));
     for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
                       expectedLookups, variables] : std::vector<TestCase>{
             {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"},
@@ -2051,11 +2054,15 @@ void UnicodeSetTest::TestLookupSymbolTable() {
             // The hyphen no longer works as set difference.
             {u"[0-1]", U_ZERO_ERROR, u"[[a-z][{hyphenMinus}][bc]]", u"[a-z{hyphenMinus}]"},
             {u"[!-0]", U_ZERO_ERROR, u"[![{hyphenMinus}][a-z]]", u"[!a-z{hyphenMinus}]"},
+            // An initial HYPHEN-MINUS is still treated as a literal '-', but a final one is treated
+            // as a set.
+            {u"[-1]", U_ZERO_ERROR, uR"([\-[bc]])", uR"([\-bc])"},
+            {u"[1-]", U_ZERO_ERROR, u"[[bc][{hyphenMinus}]]", u"[bc{hyphenMinus}]"},
             // String literals no longer work.
             {uR"([!-/{0}])", U_ZERO_ERROR,
             u"[![{hyphenMinus}]/[{leftCurlyBracket}][a-z][{rightCurlyBracket}]]",
             u"[!/a-z{hyphenMinus}{leftCurlyBracket}{rightCurlyBracket}]"},
-            // The ampersand no longer works as set difference.
+            // The ampersand no longer works as set intersection.
             {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :][{ampersand}][bc]]",
             u"[bc-󰀀-󿿽􀀀-􏿽{ampersand}]"},
             // Complementing still works.
@@ -2070,6 +2077,9 @@ void UnicodeSetTest::TestLookupSymbolTable() {
             {uR"([ \[ ])", U_ZERO_ERROR, uR"([[{leftSquareBracket}]])", uR"([{leftSquareBracket}])"},
             // Anchors are gone.
             {uR"([$])", U_ZERO_ERROR, uR"([[{dollarSign}]])", uR"([{dollarSign}])"},
+            // Property queries are unaffected.
+            {u"[:Co:]", U_ZERO_ERROR, u"[:Co:]", u"[\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"},
+            {uR"(\p{Co})", U_ZERO_ERROR, uR"(\p{Co})", u"[\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"},
         }) {
         UnicodeString actual;
         UErrorCode errorCode = U_ZERO_ERROR;
@@ -2094,6 +2104,7 @@ void UnicodeSetTest::TestLookupSymbolTable() {
     for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
                       expectedLookups, variables] : std::vector<TestCase>{
             {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
+            {u"[:Co:]", U_ZERO_ERROR, u"[:Co:]", u"[\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"},
             {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"},
         }) {
         UnicodeString actual;

From 93d92964cbea164d18a7844c32973dd29f331f8f Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 27 Aug 2025 14:49:10 +0200
Subject: [PATCH 46/56] Deal with the ambiguous - and ^

---
 icu4c/source/common/uniset_props.cpp | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 65df7a005384..f44d89feffd1 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -355,6 +355,14 @@ class UnicodeSet::Lexer {
         return false;
     }
 
+    const bool acceptStandInWithSymbol(char16_t op) {
+        if (lookahead().standIn() != nullptr && lookahead().sourceText_ == std::u16string_view(&op, 1)) {
+            advance();
+            return true;
+        }
+        return false;
+    }
+
     const LexicalElement &lookahead() {
         if (!ahead_.has_value()) {
             const RuleCharacterIterator::Pos before = getPos();
@@ -761,9 +769,11 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer,
         // Where a stand-in may be a character or an escape.
         // Strings that would match stand-in effectively get removed from
         // all other terminals of the grammar, except [.
+        // When mapped by the symbol table, whether ^ and - are treated as set operators depends on where
+        // in the grammar we are, hence `acceptStandInWithSymbol`.
         if (lexer.acceptSetOperator(u'[')) {
             prettyPrintedPattern.append(u'[');
-            if (lexer.acceptSetOperator(u'^')) {
+            if (lexer.acceptSetOperator(u'^') || lexer.acceptStandInWithSymbol(u'^')) {
                 prettyPrintedPattern.append(u'^');
                 isComplement = true;
             }
@@ -819,13 +829,15 @@ void UnicodeSet::parseUnion(Lexer &lexer,
     //         | UnescapedHyphenMinus Terms UnescapedHyphenMinus
     // Terms ::= ""
     //         | Terms Term
-    if (lexer.acceptSetOperator(u'-')) {
+    if (lexer.acceptSetOperator(u'-') || lexer.acceptStandInWithSymbol(u'-')) {
         add(u'-');
         // When we otherwise preserve the syntax, we escape an initial UnescapedHyphenMinus, but not a
         // final one, for consistency with older ICU behaviour.
         rebuiltPat.append(u"\\-");
     }
     while (!lexer.atEnd()) {
+        // Note that while a HYPHEN-MINUS mapped by the symbol table is treated as a literal at the
+        // beginning of the Union, it is treated as a set elsewhere, including at the end.
         if (lexer.acceptSetOperator(u'-')) {
             // We can be here on the first iteration: [--] is allowed by the
             // grammar and by the old parser.

From 7940892ea27803038c5ec255174b2bb8fd13f06d Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 27 Aug 2025 14:53:53 +0200
Subject: [PATCH 47/56] Update sequence expectations

---
 icu4c/source/test/intltest/usettest.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 841e94cc8030..8e4c2832e826 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -1971,13 +1971,13 @@ void UnicodeSetTest::TestLookupSymbolTable() {
             U_ZERO_ERROR,
             u"[[a-z]-[bc]]",
             u"[ad-z]",
-            {u'0', u'-', u"one", u"one", u'1', u']'},
+            {u'0', u'-', u"one", u'1', u']'},
             {{u"zero", u"0"}, {u"one", u"1"}}},
             {u"[$zero-$one]",
             U_ZERO_ERROR,
             u"[[a-z]-[bc]]",
             u"[ad-z]",
-            {u"zero", u"zero", u"zero", u"zero", u'0', u'-', u"one", u"one", u'1', u']'},
+            {u"zero", u"zero", u'0', u'-', u"one", u'1', u']'},
             {{u"zero", u"0"}, {u"one", u"1"}}},
             // If the variable expands to multiple symbols, only the first one is sequenced right after
             // the variable lookup.
@@ -1985,7 +1985,7 @@ void UnicodeSetTest::TestLookupSymbolTable() {
             U_ZERO_ERROR,
             u"[[bc][a-z]]",
             u"[a-z]",
-            {u"ten", u"ten", u"ten", u"ten", u'1', u'0', u']'},
+            {u"ten", u"ten", u'1', u'0', u']'},
             {{u"ten", u"10"}}},
             // Substitution of lookupMatcher symbols takes place after unescaping.
             {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'-', u'0'}},

From d3cc9eac5503ff58452095049a7eeed4efc9adc8 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Tue, 2 Sep 2025 11:06:20 +0200
Subject: [PATCH 48/56] warnings

---
 icu4c/source/common/unicode/uniset.h | 2 --
 icu4c/source/common/uniset_props.cpp | 6 ++----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h
index 538eb264e974..fd0a89a413fe 100644
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -1744,8 +1744,6 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter {
 
     void parseElements(Lexer &lexer,
                        UnicodeString &rebuiltPat,
-                       UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
-                       int32_t depth,
                        UErrorCode &ec);
 
 
diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index f44d89feffd1..e88d09d867d0 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -347,7 +347,7 @@ class UnicodeSet::Lexer {
                pattern_.tempSubString(parsePosition_.getIndex(), 60);
     }
 
-    const bool acceptSetOperator(char16_t op) {
+    bool acceptSetOperator(char16_t op) {
         if (lookahead().isSetOperator(op)) {
             advance();
             return true;
@@ -355,7 +355,7 @@ class UnicodeSet::Lexer {
         return false;
     }
 
-    const bool acceptStandInWithSymbol(char16_t op) {
+    bool acceptStandInWithSymbol(char16_t op) {
         if (lookahead().standIn() != nullptr && lookahead().sourceText_ == std::u16string_view(&op, 1)) {
             advance();
             return true;
@@ -933,8 +933,6 @@ void UnicodeSet::parseRestriction(Lexer &lexer,
 
 void UnicodeSet::parseElements(Lexer &lexer,
                                UnicodeString &rebuiltPat,
-                               UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
-                               int32_t depth,
                                UErrorCode &ec) {
     // Elements     ::= Element
     //                | Range

From 3cfc4ae9a8f884ab06e81f2de961d5dd2c3bd989 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Tue, 2 Sep 2025 13:42:26 +0200
Subject: [PATCH 49/56] Clarify some comments

---
 icu4c/source/common/unicode/uniset.h |  8 +++++---
 icu4c/source/common/uniset_props.cpp | 21 ++++++++++++++-------
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h
index fd0a89a413fe..feacf399fb02 100644
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@@ -1705,10 +1705,12 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter {
                       UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                       UErrorCode &ec);
 
-    // Recursive descent parsing with no backtracking.  These functions parse the syntactic categories
+    // Recursive-descent predictive parsing.  These functions parse the syntactic categories
     // matching their name in the base grammar of PD UTR #56 (before the highlighted changes are
-    // applied).  They add to *this the elements of the set that the parsed construct represents.
-    // https://www.unicode.org/reports/tr61/tr61-1.html#Set-Operations.
+    // applied).
+    // See https://www.unicode.org/reports/tr61/tr61-1.html#Set-Operations.
+    // `parseUnicodeSet` clears `*this` and makes it represent the parsed UnicodeSet; all other functions
+    // add the set represented by the parsed construct to `*this`.
 
     class Lexer;
 
diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index e88d09d867d0..23bf162535c7 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -761,7 +761,6 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer,
         addAll(propertyQuery);
         preserveSyntaxInPattern = true;
     } else {
-        // TODO(egg): In PD UTS 61, add ^ to set-operator, remove [^.
         // UnicodeSet ::=                [   Union ]
         //              | Complement ::= [ ^ Union ]
         // Extension:
@@ -890,17 +889,25 @@ void UnicodeSet::parseRestriction(Lexer &lexer,
                                   UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
                                   int32_t depth,
                                   UErrorCode &ec) {
-    // Restriction ::= UnicodeSet
-    //               | Intersection ::= Restriction & UnicodeSet
-    //               | Difference   ::= Restriction - UnicodeSet
+    // Parse a https://www.unicode.org/reports/tr61/#Restriction:
+    //   Restriction  ::= UnicodeSet
+    //                  | Intersection
+    //                  | Difference
+    //   Intersection ::= Restriction & UnicodeSet
+    //   Difference   ::= Restriction - UnicodeSet
+    // or, rewritten to be LL,
+    //   Restriction   ::= UnicodeSet RightHandSide
+    //   RightHandSide ::= ""
+    //                   | & UnicodeSet RightHandSide
+    //                   | - UnicodeSet RightHandSide
     // Start by parsing the first UnicodeSet.
     UnicodeSet leftHandSide;
     leftHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec);
     addAll(leftHandSide);
     U_UNICODESET_RETURN_IF_ERROR(ec);
-    // Now keep looking for an operator that would continue the Restriction.
-    // The loop terminates because when chars.atEnd(), op == DONE, so we go into the else branch and
-    // return.
+    // Now keep looking for an operator that would continue the RightHandSide.
+    // The loop terminates because when we run out of source text, the lookahead token will not be a set
+    // operator, so that we hit the else branch and return.
     for (;;) {
         if (lexer.acceptSetOperator(u'&')) {
             // Intersection ::= Restriction & UnicodeSet

From 629bc8988006c65ffbe7deae7ebbb903cfb7c562 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 3 Sep 2025 02:25:07 +0200
Subject: [PATCH 50/56] more discursive comments

---
 icu4c/source/common/uniset_props.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 23bf162535c7..dc8f5a97e05e 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -896,10 +896,12 @@ void UnicodeSet::parseRestriction(Lexer &lexer,
     //   Intersection ::= Restriction & UnicodeSet
     //   Difference   ::= Restriction - UnicodeSet
     // or, rewritten to be LL,
-    //   Restriction   ::= UnicodeSet RightHandSide
-    //   RightHandSide ::= ""
-    //                   | & UnicodeSet RightHandSide
-    //                   | - UnicodeSet RightHandSide
+    //   Restriction    ::= UnicodeSet RightHandSides
+    //   RightHandSides ::= ""
+    //                    | & UnicodeSet RightHandSides
+    //                    | - UnicodeSet RightHandSides
+    // but note that the tree resulting from this LL version is not an expression tree: the
+    // operations are left-associative.
     // Start by parsing the first UnicodeSet.
     UnicodeSet leftHandSide;
     leftHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec);

From cace9d71c0f7028f6fc7918ec3ad4be19a60ffae Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Wed, 3 Sep 2025 04:45:14 +0200
Subject: [PATCH 51/56] make it compile

---
 icu4c/source/common/uniset_props.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index dc8f5a97e05e..162876e6e5bd 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -878,7 +878,7 @@ void UnicodeSet::parseTerm(Lexer &lexer,
         parseRestriction(lexer, rebuiltPat, options, caseClosure, depth, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
     } else {
-        parseElements(lexer, rebuiltPat, caseClosure, depth, ec);
+        parseElements(lexer, rebuiltPat, ec);
         U_UNICODESET_RETURN_IF_ERROR(ec);
     }
 }

From 03235934848cf31d1171632d5989227f1b4f120b Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Mon, 8 Sep 2025 13:38:16 +0200
Subject: [PATCH 52/56] libstdc++ dependencies

---
 icu4c/source/test/depstest/dependencies.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/icu4c/source/test/depstest/dependencies.txt b/icu4c/source/test/depstest/dependencies.txt
index deced67cff80..0278f1476894 100644
--- a/icu4c/source/test/depstest/dependencies.txt
+++ b/icu4c/source/test/depstest/dependencies.txt
@@ -141,6 +141,10 @@ group: cplusplus
     # "Calls the current terminate handler."
     std::terminate()
 
+    # From std::array::at in libstdc++.  Note that we never call std::array::at, only operator[]
+    # which is noexcept.
+    std::__throw_out_of_range_fmt(char const*, ...)
+
 group: iostream
     "std::basic_ios<char, std::char_traits<char> >::clear(std::_Ios_Iostate)"
     "std::basic_ios<char, std::char_traits<char> >::eof() const"

From bcb7ac0c022b99b4a72b83d8f422703399991c69 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Mon, 8 Sep 2025 19:10:49 +0200
Subject: [PATCH 53/56] quote?

---
 icu4c/source/test/depstest/dependencies.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/icu4c/source/test/depstest/dependencies.txt b/icu4c/source/test/depstest/dependencies.txt
index 0278f1476894..c6f6f0c38ff2 100644
--- a/icu4c/source/test/depstest/dependencies.txt
+++ b/icu4c/source/test/depstest/dependencies.txt
@@ -143,7 +143,7 @@ group: cplusplus
 
     # From std::array::at in libstdc++.  Note that we never call std::array::at, only operator[]
     # which is noexcept.
-    std::__throw_out_of_range_fmt(char const*, ...)
+    "std::__throw_out_of_range_fmt(char const*, ...)"
 
 group: iostream
     "std::basic_ios<char, std::char_traits<char> >::clear(std::_Ios_Iostate)"

From 66cceeb6b024171270a94b367775b03cf8084972 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Tue, 9 Sep 2025 15:13:25 +0200
Subject: [PATCH 54/56] No infinite loops in the lexer

---
 icu4c/source/common/uniset_props.cpp    |  8 +++----
 icu4c/source/test/intltest/usettest.cpp | 31 +++++++++++++++++++------
 2 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
index 162876e6e5bd..13dcd71ca8ce 100644
--- a/icu4c/source/common/uniset_props.cpp
+++ b/icu4c/source/common/uniset_props.cpp
@@ -453,7 +453,7 @@ class UnicodeSet::Lexer {
                                                       RuleCharacterIterator::SKIP_WHITESPACE),
                                     unusedEscaped, errorCode);
                     if (third == u'{') {
-                        while (!chars_.atEnd()) {
+                        while (!chars_.atEnd() && U_SUCCESS(errorCode)) {
                             UChar32 last =
                                 chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
                                                               RuleCharacterIterator::SKIP_WHITESPACE),
@@ -495,7 +495,7 @@ class UnicodeSet::Lexer {
                                                   RuleCharacterIterator::SKIP_WHITESPACE),
                                 unusedEscaped, errorCode);
 
-                while (!chars_.atEnd()) {
+                while (!chars_.atEnd() && U_SUCCESS(errorCode)) {
                     const RuleCharacterIterator::Pos beforeLast = getPos();
                     UChar32 lastUnescaped =
                         chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
@@ -509,7 +509,7 @@ class UnicodeSet::Lexer {
                                             unusedEscaped, errorCode);
                             bool namedElementOK = false;
                             if (namedElementOpening == u'{') {
-                                while (!chars_.atEnd()) {
+                                while (!chars_.atEnd() && U_SUCCESS(errorCode)) {
                                     UChar32 namedElementLast = chars_.next(
                                         charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
                                                           RuleCharacterIterator::SKIP_WHITESPACE),
@@ -600,7 +600,7 @@ class UnicodeSet::Lexer {
             UBool escaped;
             UChar32 next;
             int32_t codePointCount = 0;
-            while (!chars_.atEnd()) {
+            while (!chars_.atEnd() && U_SUCCESS(errorCode)) {
                 next = chars_.next(charsOptions_, escaped, errorCode);
                 if (!escaped && next == u'}') {
                     return LexicalElement(
diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 8e4c2832e826..37417dd8d4f1 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -4715,6 +4715,19 @@ void UnicodeSetTest::TestToPatternOutput() {
 }
 
 void UnicodeSetTest::TestParseErrors() {
+    for (const auto expression : std::vector<std::u16string_view>{
+             uR"([\u])",
+             uR"([\x{}])",
+             uR"([\9])",
+         }) {
+        UErrorCode errorCode = U_ZERO_ERROR;
+        const UnicodeSet set(expression, errorCode);
+        if (errorCode != U_MALFORMED_UNICODE_ESCAPE) {
+            UnicodeString s;
+            errln(expression + u": Expected U_MALFORMED_UNICODE_ESCAPE, got " + u_errorName(errorCode) +
+                  ", set is " + UnicodeSet(set).complement().complement().toPattern(s));
+        }
+    }
     for (const auto expression : std::vector<std::u16string_view>{
             // Java error message: "Char expected after operator".
             u"[a-[b]]",
@@ -4758,13 +4771,17 @@ void UnicodeSetTest::TestParseErrors() {
         }
     }
     for (const auto expression : std::vector<std::u16string_view>{
-            // Java error message: "Invalid property pattern".
-            u"[:]",
-            uR"(\p)"
-            u"[:^]",
-            uR"(\P)",
-            uR"(\N)",
-        }) {
+             // Java error message: "Invalid property pattern".
+             u"[:]",
+             uR"(\p)"
+             u"[:^]",
+             uR"(\P)",
+             uR"(\N)",
+             uR"([\p{Some_Property=\u}])",
+             uR"([:Some_Property=\u:])",
+             uR"(\p{Some_Property=\N{SOME CHARACTER}})",
+             uR"([\N{}])",
+         }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);
         if (errorCode != U_ILLEGAL_ARGUMENT_ERROR) {

From f79b35c3ab559a6f57e9fb6574c14629d034dc5a Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Tue, 9 Sep 2025 15:57:08 +0200
Subject: [PATCH 55/56] That is well-formed

---
 icu4c/source/test/intltest/usettest.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 37417dd8d4f1..b34cd17985e7 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -4718,7 +4718,6 @@ void UnicodeSetTest::TestParseErrors() {
     for (const auto expression : std::vector<std::u16string_view>{
              uR"([\u])",
              uR"([\x{}])",
-             uR"([\9])",
          }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);

From 94cc56c97b109404f39b2a213fc24c6895102843 Mon Sep 17 00:00:00 2001
From: Robin Leroy <egg.robin.leroy@gmail.com>
Date: Thu, 11 Sep 2025 14:31:11 +0200
Subject: [PATCH 56/56] dedent

---
 icu4c/source/test/intltest/usettest.cpp | 28 ++++++++++++-------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index b34cd17985e7..052de84bf153 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -4716,9 +4716,9 @@ void UnicodeSetTest::TestToPatternOutput() {
 
 void UnicodeSetTest::TestParseErrors() {
     for (const auto expression : std::vector<std::u16string_view>{
-             uR"([\u])",
-             uR"([\x{}])",
-         }) {
+            uR"([\u])",
+            uR"([\x{}])",
+        }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);
         if (errorCode != U_MALFORMED_UNICODE_ESCAPE) {
@@ -4770,17 +4770,17 @@ void UnicodeSetTest::TestParseErrors() {
         }
     }
     for (const auto expression : std::vector<std::u16string_view>{
-             // Java error message: "Invalid property pattern".
-             u"[:]",
-             uR"(\p)"
-             u"[:^]",
-             uR"(\P)",
-             uR"(\N)",
-             uR"([\p{Some_Property=\u}])",
-             uR"([:Some_Property=\u:])",
-             uR"(\p{Some_Property=\N{SOME CHARACTER}})",
-             uR"([\N{}])",
-         }) {
+            // Java error message: "Invalid property pattern".
+            u"[:]",
+            uR"(\p)",
+            u"[:^]",
+            uR"(\P)",
+            uR"(\N)",
+            uR"([\p{Some_Property=\u}])",
+            uR"([:Some_Property=\u:])",
+            uR"(\p{Some_Property=\N{SOME CHARACTER}})",
+            uR"([\N{}])",
+        }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);
         if (errorCode != U_ILLEGAL_ARGUMENT_ERROR) {