From 185a38fd7ebaf13192ff2e0334129558526df0e9 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 11 Aug 2025 16:24:10 +0200 Subject: [PATCH 01/56] ICU-22851 Test the error paths in UnicodeSet parsing --- icu4c/source/test/intltest/usettest.cpp | 61 +++++++++++++++++++++++++ icu4c/source/test/intltest/usettest.h | 2 + 2 files changed, 63 insertions(+) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 016d3f85e63d..da32687987e8 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestRangeIterator); TESTCASE_AUTO(TestStringIterator); TESTCASE_AUTO(TestElementIterator); + TESTCASE_AUTO(TestParseErrors); TESTCASE_AUTO_END; } @@ -4334,3 +4335,63 @@ void UnicodeSetTest::TestElementIterator() { // begin() & end() return USetElementIterator for which explicit APIs are tested via USet // in a header-only unit test file. } + +void UnicodeSetTest::TestParseErrors() { + for (const auto expression : std::vector{ + // Java error message: "Char expected after operator". + u"[a-[b]]", + // "Missing '['". + u"a-z", + // "Trailing '&'". + u"[[a]&]", + // "'-' not after char or set". + u"[[a]&-[z]]", + u"[[a]--[z]]", + u"[{aa}-{zz}]", + // "'&' not after set". + u"[a&z]", + u"[{aa}&{zz}]", + // "'^' not after '['" + u"[a^z]", // TODO(egg): Exclude from literal-element in PDUTS61. + // "Missing operand after operator". + u"[a-{zz}]", + u"[[a]-{zz}]", + u"[[a]&{zz}]", + // "Invalid multicharacter string". + u"[{aa]", + // "Unquoted '$'". + u"[a-$]", + // "Invalid range". + u"[a-a]", // TODO(egg): Exclude in PDUTS61. + u"[z-a]", + // "Set expected after operator". + u"[[a]-z]", + u"[[a]&z]", + // "Missing ']'". 
+ u"[a-z", + }) { + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, errorCode); + if (errorCode != U_MALFORMED_SET) { + UnicodeString s; + errln(expression + u": Expected U_MALFORMED_SET, got " + u_errorName(errorCode) + + ", set is " + UnicodeSet(set).complement().complement().toPattern(s)); + } + } + for (const auto expression : std::vector{ + // Java error message: "Invalid property pattern". + u"[:]", + uR"(\p)" + u"[:^]", + uR"(\P)", + uR"(\N)", + }) { + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, errorCode); + if (errorCode != U_ILLEGAL_ARGUMENT_ERROR) { + UnicodeString s; + errln(expression + u": Expected U_ILLEGAL_ARGUMENT_ERROR, got " + u_errorName(errorCode) + + ", set is " + UnicodeSet(set).complement().complement().toPattern(s)); + } + } +} diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index 2ac22ba72e62..4c5b55a329bb 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -110,6 +110,8 @@ class UnicodeSetTest: public IntlTest { void TestStringIterator(); void TestElementIterator(); + void TestParseErrors(); + private: UBool toPatternAux(UChar32 start, UChar32 end); From 6a650e7065fc286bb136a4996abd665c7005832e Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 11 Aug 2025 20:28:53 +0200 Subject: [PATCH 02/56] Call it a day --- icu4c/source/common/unicode/uniset.h | 52 ++++++- icu4c/source/common/uniset_props.cpp | 216 ++++++++++++++++++++++++--- 2 files changed, 237 insertions(+), 31 deletions(-) diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index 01ac901e3ba1..ddacaaa336ca 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -1696,13 +1696,51 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { const SymbolTable* symbols, UErrorCode& status); - void applyPattern(RuleCharacterIterator& chars, - const SymbolTable* 
symbols, - UnicodeString& rebuiltPat, - uint32_t options, - UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), - int32_t depth, - UErrorCode& ec); + // Recursive descent parsing. These functions parse the syntactic categories matching their name in + // the base grammar of PD UTR #56 (before the highlighted changes are applied). They add to *this + // the elements of the set that the parsed construct represents. + // https://www.unicode.org/reports/tr61/tr61-1.html#Set-Operations. + + void parseUnicodeSet(RuleCharacterIterator &chars, + const SymbolTable *symbols, + UnicodeString &rebuiltPat, + uint32_t options, + UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), + int32_t depth, + UErrorCode &ec); + + void parseUnion(RuleCharacterIterator &chars, + const SymbolTable *symbols, + UnicodeString &rebuiltPat, + uint32_t options, + UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), + int32_t depth, + UErrorCode &ec); + + void parseTerm(RuleCharacterIterator &chars, + const SymbolTable *symbols, + UnicodeString &rebuiltPat, + uint32_t options, + UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), + int32_t depth, + UErrorCode &ec); + + void parseRestriction(RuleCharacterIterator &chars, + const SymbolTable *symbols, + UnicodeString &rebuiltPat, + uint32_t options, + UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), + int32_t depth, + UErrorCode &ec); + + void parseElements(RuleCharacterIterator &chars, + const SymbolTable *symbols, + UnicodeString &rebuiltPat, + uint32_t options, + UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), + int32_t depth, + UErrorCode &ec); + void closeOverCaseInsensitive(bool simple); void closeOverAddCaseMappings(); diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 532b17f5063f..64e397ee27fe 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -196,7 +196,7 @@ UnicodeSet::applyPatternIgnoreSpace(const 
UnicodeString& pattern, // _applyPattern calls add() etc., which set pat to empty. UnicodeString rebuiltPat; RuleCharacterIterator chars(pattern, symbols, pos); - applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, 0, status); + parseUnicodeSet(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, 0, status); if (U_FAILURE(status)) return; if (chars.inVariable()) { // syntaxError(chars, "Extra chars in variable value"); @@ -242,6 +242,14 @@ class UnicodeSetPointer { constexpr int32_t MAX_DEPTH = 100; +constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) { + int32_t opts = RuleCharacterIterator::PARSE_VARIABLES | RuleCharacterIterator::PARSE_ESCAPES; + if ((unicodeSetOptions & USET_IGNORE_SPACE) != 0) { + opts |= RuleCharacterIterator::SKIP_WHITESPACE; + } + return opts; +} + } // namespace /** @@ -258,13 +266,13 @@ constexpr int32_t MAX_DEPTH = 100; * @param options a bit mask of zero or more of the following: * IGNORE_SPACE, CASE. */ -void UnicodeSet::applyPattern(RuleCharacterIterator& chars, - const SymbolTable* symbols, - UnicodeString& rebuiltPat, - uint32_t options, - UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), - int32_t depth, - UErrorCode& ec) { +void UnicodeSet::parseUnicodeSet(RuleCharacterIterator& chars, + const SymbolTable* symbols, + UnicodeString& rebuiltPat, + uint32_t options, + UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), + int32_t depth, + UErrorCode& ec) { if (U_FAILURE(ec)) return; if (depth > MAX_DEPTH) { ec = U_ILLEGAL_ARGUMENT_ERROR; @@ -275,27 +283,187 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, // Recognized special forms for chars, sets: c-c s-s s&s - int32_t opts = RuleCharacterIterator::PARSE_VARIABLES | - RuleCharacterIterator::PARSE_ESCAPES; - if ((options & USET_IGNORE_SPACE) != 0) { - opts |= RuleCharacterIterator::SKIP_WHITESPACE; + clear(); + + bool isComplement = false; + + if (resemblesPropertyPattern(chars, charsOptions(options))) { + // UnicodeSet 
::= property-query | named-singleton + applyPropertyPattern(chars, rebuiltPat, ec); + if (U_FAILURE(ec)) return; + } else { + UBool escaped = false; + // TODO(egg): In PD UTS 61, add ^ to set-operator, remove [^. + // UnicodeSet ::= [ Union ] + // | Complement ::= [ ^ Union ] + char16_t c = chars.next(charsOptions(options), escaped, ec); + if (U_FAILURE(ec)) return; + if (escaped || c != u'[') { + ec = U_MALFORMED_SET; + return; + } + RuleCharacterIterator::Pos afterBracket; + chars.getPos(afterBracket); + c = chars.next(charsOptions(options), escaped, ec); + if (U_FAILURE(ec)) return; + if (!escaped && c == u'^') { + isComplement = true; + return; + } else { + chars.setPos(afterBracket); + } + parseUnion(chars, symbols, rebuiltPat, options, caseClosure, depth, ec); + if (U_FAILURE(ec)) return; + c = chars.next(charsOptions(options), escaped, ec); + if (U_FAILURE(ec)) return; + if (escaped || c != u']') { + ec = U_MALFORMED_SET; + return; + } } - UnicodeString patLocal, buf; - UBool usePat = false; - UnicodeSetPointer scratch; - RuleCharacterIterator::Pos backup; + /** + * Handle global flags (isComplement, case insensitivity). If this + * pattern should be compiled case-insensitive, then we need + * to close over case BEFORE COMPLEMENTING. This makes + * patterns like /[^abc]/i work. 
+ */ + if ((options & USET_CASE_MASK) != 0) { + (this->*caseClosure)(options); + } + if (isComplement) { + complement().removeAllStrings(); // code point complement + } +} - // mode: 0=before [, 1=between [...], 2=after ] - // lastItem: 0=none, 1=char, 2=set - int8_t lastItem = 0, mode = 0; - UChar32 lastChar = 0; - char16_t op = 0; +void UnicodeSet::parseUnion(RuleCharacterIterator &chars, + const SymbolTable *symbols, + UnicodeString &rebuiltPat, + uint32_t options, + UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, + UErrorCode &ec) { + UBool escaped = false; + RuleCharacterIterator::Pos position; + chars.getPos(position); + // Union ::= Terms + // | UnescapedHyphenMinus Terms + // | Terms UnescapedHyphenMinus + // | UnescapedHyphenMinus Terms UnescapedHyphenMinus + // Terms ::= "" + // | Terms Term + char16_t c = chars.next(charsOptions(options), escaped, ec); + if (U_FAILURE(ec)) return; + if (!escaped && c == u'-') { + add(u'-'); + } else { + chars.setPos(position); + } + for (;;) { + chars.getPos(position); + c = chars.next(charsOptions(options), escaped, ec); + if (U_FAILURE(ec)) return; + if (!escaped && c == u'-') { + // We can be here on the first iteration: [--] is allowed by the + // grammar and by the old parser. 
+ add(u'-'); + return; + } + chars.setPos(position); + if (!escaped && c == ']') { + return; + } + if (U_FAILURE(ec)) return; + parseTerm(chars, symbols, rebuiltPat, charsOptions(options), caseClosure, depth, ec); + if (U_FAILURE(ec)) return; + } +} - UBool invert = false; +void UnicodeSet::parseTerm(RuleCharacterIterator &chars, + const SymbolTable *symbols, + UnicodeString &rebuiltPat, + uint32_t options, + UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), + int32_t depth, + UErrorCode &ec) { + UBool escaped = false; + RuleCharacterIterator::Pos termStart; + chars.getPos(termStart); + // Term ::= Elements + // | Restriction + char16_t c = chars.next(charsOptions(options), escaped, ec); + if (!escaped && c == '[' || resemblesPropertyPattern(chars, charsOptions(options))) { + chars.setPos(termStart); + parseRestriction(chars, symbols, rebuiltPat, charsOptions(options), caseClosure, depth, ec); + if (U_FAILURE(ec)) return; + } else { + } +} - clear(); +void UnicodeSet::parseRestriction(RuleCharacterIterator &chars, + const SymbolTable *symbols, + UnicodeString &rebuiltPat, + uint32_t options, + UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), + int32_t depth, UErrorCode &ec) { + UBool escaped = false; + // Restriction ::= UnicodeSet + // | Intersection ::= Restriction & UnicodeSet + // | Difference ::= Restriction - UnicodeSet + // Start by parsing the first UnicodeSet. + parseUnicodeSet(chars, symbols, rebuiltPat, charsOptions(options), caseClosure, depth + 1, ec); + if (U_FAILURE(ec)) return; + // Now keep looking for an operator that would continue the Restriction. 
+ for (;;) { + RuleCharacterIterator::Pos beforeOperator; + chars.getPos(beforeOperator); + char16_t c = chars.next(charsOptions(options), escaped, ec); + if (U_FAILURE(ec)) return; + if (!escaped && c == u'&') { + // Intersection ::= Restriction & UnicodeSet + UnicodeSet rightHandSide; + rightHandSide.parseUnicodeSet(chars, symbols, rebuiltPat, charsOptions(options), caseClosure, + depth + 1, ec); + if (U_FAILURE(ec)) return; + retainAll(rightHandSide); + } else if (!escaped && c == u'-') { + // Here the grammar requires two tokens of lookahead to figure out whether the - the operator + // of a Difference or an UnescapedHyphenMinus in the enclosing Union. + RuleCharacterIterator::Pos afterOperator; + chars.getPos(afterOperator); + char16_t c = chars.next(charsOptions(options), escaped, ec); + if (U_FAILURE(ec)) return; + if (!escaped && c == u']') { + // The operator is actually an UnescapedHyphenMinus; terminate the Restriction before it. + chars.setPos(beforeOperator); + return; + } + chars.setPos(afterOperator); + // Difference ::= Restriction - UnicodeSet + UnicodeSet rightHandSide; + rightHandSide.parseUnicodeSet(chars, symbols, rebuiltPat, charsOptions(options), caseClosure, + depth + 1, ec); + if (U_FAILURE(ec)) return; + removeAll(rightHandSide); + } else { + // Not an operator. + chars.setPos(beforeOperator); + return; + } + } +} +void UnicodeSet::parseElements(RuleCharacterIterator &chars, + const SymbolTable *symbols, + UnicodeString &rebuiltPat, + uint32_t options, + UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), + int32_t depth, + UErrorCode &ec) { + UBool escaped = false; + +} + + #if 0 while (mode != 2 && !chars.atEnd()) { U_ASSERT((lastItem == 0 && op == 0) || (lastItem == 1 && (op == 0 || op == u'-')) || @@ -652,7 +820,7 @@ void UnicodeSet::applyPattern(RuleCharacterIterator& chars, // We likely ran out of memory. AHHH! 
ec = U_MEMORY_ALLOCATION_ERROR; } -} +#endif //---------------------------------------------------------------- // Property set implementation From 85b8b50761ba5ab8000d09e4f07ccf26cfd70f4f Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 13 Aug 2025 20:13:54 +0200 Subject: [PATCH 03/56] Some progress, toPattern is wrong, but what is right? --- icu4c/source/common/unicode/uniset.h | 13 +- icu4c/source/common/uniset_closure.cpp | 2 +- icu4c/source/common/uniset_props.cpp | 268 ++++++++++++++++++++----- 3 files changed, 226 insertions(+), 57 deletions(-) diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index ddacaaa336ca..c4c96154fca2 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -1696,9 +1696,16 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { const SymbolTable* symbols, UErrorCode& status); - // Recursive descent parsing. These functions parse the syntactic categories matching their name in - // the base grammar of PD UTR #56 (before the highlighted changes are applied). They add to *this - // the elements of the set that the parsed construct represents. + void applyPattern(RuleCharacterIterator &chars, + const SymbolTable *symbols, + UnicodeString &rebuiltPat, + uint32_t options, + UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), + UErrorCode &ec); + + // Recursive descent parsing with no backtracking. These functions parse the syntactic categories + // matching their name in the base grammar of PD UTR #56 (before the highlighted changes are + // applied). They add to *this the elements of the set that the parsed construct represents. // https://www.unicode.org/reports/tr61/tr61-1.html#Set-Operations. 
void parseUnicodeSet(RuleCharacterIterator &chars, diff --git a/icu4c/source/common/uniset_closure.cpp b/icu4c/source/common/uniset_closure.cpp index ae777c5facdf..251276adaddb 100644 --- a/icu4c/source/common/uniset_closure.cpp +++ b/icu4c/source/common/uniset_closure.cpp @@ -101,7 +101,7 @@ UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, // _applyPattern calls add() etc., which set pat to empty. UnicodeString rebuiltPat; RuleCharacterIterator chars(pattern, symbols, pos); - applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, 0, status); + applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status); if (U_FAILURE(status)) return *this; if (chars.inVariable()) { // syntaxError(chars, "Extra chars in variable value"); diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 64e397ee27fe..054a1a932d03 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -196,7 +196,7 @@ UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern, // _applyPattern calls add() etc., which set pat to empty. UnicodeString rebuiltPat; RuleCharacterIterator chars(pattern, symbols, pos); - parseUnicodeSet(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, 0, status); + applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, status); if (U_FAILURE(status)) return; if (chars.inVariable()) { // syntaxError(chars, "Extra chars in variable value"); @@ -250,6 +250,74 @@ constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) { return opts; } +#if 0 +#define U_UNICODESET_TRACE(...) 
\ + struct UnicodeSetParserTrace { \ + char const *const symbol_; \ + int const depth_; \ + const UnicodeSet *const that_; \ + UnicodeSetParserTrace(char const *symbol, int depth, const UnicodeSet *that) \ + : symbol_(symbol), depth_(depth), that_(that) {} \ + ~UnicodeSetParserTrace() { \ + UnicodeString ahead; \ + std::string aheadUTF8; \ + printf("%s%s\n", std::string(depth_ * 4, ' ').c_str(), symbol_); \ + printf("%s\n", (UnicodeSet(*that_) \ + .complement() \ + .complement() \ + .toPattern(ahead) \ + .toUTF8String(aheadUTF8) \ + .c_str(),"")); \ + } \ + }; \ + UnicodeSetParserTrace unicodeSetParserTrace( \ + std::string_view("" __VA_ARGS__).empty() ? __func__ + 5 : ("" __VA_ARGS__), depth, this); \ + do { \ + char const *symbol = ("" __VA_ARGS__); \ + if (std::string_view(symbol).empty()) { \ + symbol = __func__ + 5; \ + } \ + UnicodeString ahead; \ + std::string aheadUTF8; \ + printf("%s%s > %s\n", std::string(depth * 4, ' ').c_str(), symbol, \ + (chars).lookahead(ahead, 60).toUTF8String(aheadUTF8).c_str()); \ + printf("%s\n", (UnicodeSet(*this) \ + .complement() \ + .complement() \ + .toPattern(ahead) \ + .toUTF8String(aheadUTF8) \ + .c_str(),"")); \ + } while (false) +#else +#define U_UNICODESET_TRACE(...) \ + do { \ + } while (false) +#endif + +#define U_UNICODESET_RETURN_IF_ERROR(ec) \ + do { \ + if (U_FAILURE(ec)) { \ + if (depth < 5) { \ + printf("--- at %s l. %d\n", __func__, __LINE__); \ + } else if (depth == 5 && std::string_view(__func__) == "parseUnicodeSet") { \ + printf("--- [...]\n"); \ + } \ + return; \ + } \ + } while (false) +#define U_UNICODESET_RETURN_WITH_PARSE_ERROR(expected, actual, chars, ec) \ + do { \ + std::string actualUTF8; \ + UnicodeString ahead; \ + std::string aheadUTF8; \ + printf("*** Expected %s, got '%s' %s\n", (expected), \ + UnicodeString(actual).toUTF8String(actualUTF8).c_str(), \ + (chars).lookahead(ahead, 60).toUTF8String(aheadUTF8).c_str()); \ + printf("--- at %s l. 
%d\n", __func__, __LINE__); \ + (ec) = U_MALFORMED_SET; \ + return; \ + } while (false) + } // namespace /** @@ -266,59 +334,66 @@ constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) { * @param options a bit mask of zero or more of the following: * IGNORE_SPACE, CASE. */ + +void UnicodeSet::applyPattern(RuleCharacterIterator &chars, + const SymbolTable *symbols, + UnicodeString &rebuiltPat, + uint32_t options, + UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), + UErrorCode &ec) { + if (U_FAILURE(ec)) return; + clear(); + parseUnicodeSet(chars, symbols, rebuiltPat, options, caseClosure, /*depth=*/0, ec); + _generatePattern(rebuiltPat, false); +} + void UnicodeSet::parseUnicodeSet(RuleCharacterIterator& chars, const SymbolTable* symbols, UnicodeString& rebuiltPat, uint32_t options, UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), - int32_t depth, - UErrorCode& ec) { - if (U_FAILURE(ec)) return; + int32_t depth, UErrorCode &ec) { + U_UNICODESET_TRACE(); + if (depth > MAX_DEPTH) { - ec = U_ILLEGAL_ARGUMENT_ERROR; - return; + U_UNICODESET_RETURN_WITH_PARSE_ERROR(("depth <= " + std::to_string(MAX_DEPTH)).c_str(), + ("depth = " + std::to_string(depth)).c_str(), chars, ec); } - // Syntax characters: [ ] ^ - & { } - - // Recognized special forms for chars, sets: c-c s-s s&s - - clear(); - bool isComplement = false; - if (resemblesPropertyPattern(chars, charsOptions(options))) { - // UnicodeSet ::= property-query | named-singleton - applyPropertyPattern(chars, rebuiltPat, ec); - if (U_FAILURE(ec)) return; + // UnicodeSet ::= property-query | named-element + U_UNICODESET_TRACE("property-query | named-element"); + chars.skipIgnored(charsOptions(options)); + UnicodeSet propertyQuery; + propertyQuery.applyPropertyPattern(chars, rebuiltPat, ec); + U_UNICODESET_RETURN_IF_ERROR(ec); + addAll(propertyQuery); } else { UBool escaped = false; // TODO(egg): In PD UTS 61, add ^ to set-operator, remove [^. 
// UnicodeSet ::= [ Union ] // | Complement ::= [ ^ Union ] - char16_t c = chars.next(charsOptions(options), escaped, ec); - if (U_FAILURE(ec)) return; + UChar32 c = chars.next(charsOptions(options), escaped, ec); + U_UNICODESET_RETURN_IF_ERROR(ec); if (escaped || c != u'[') { - ec = U_MALFORMED_SET; - return; + U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)", c, chars, ec); } RuleCharacterIterator::Pos afterBracket; chars.getPos(afterBracket); c = chars.next(charsOptions(options), escaped, ec); - if (U_FAILURE(ec)) return; + U_UNICODESET_RETURN_IF_ERROR(ec); if (!escaped && c == u'^') { isComplement = true; - return; } else { chars.setPos(afterBracket); } parseUnion(chars, symbols, rebuiltPat, options, caseClosure, depth, ec); - if (U_FAILURE(ec)) return; + U_UNICODESET_RETURN_IF_ERROR(ec); c = chars.next(charsOptions(options), escaped, ec); - if (U_FAILURE(ec)) return; + U_UNICODESET_RETURN_IF_ERROR(ec); if (escaped || c != u']') { - ec = U_MALFORMED_SET; - return; + U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", c, chars, ec); } } @@ -342,6 +417,7 @@ void UnicodeSet::parseUnion(RuleCharacterIterator &chars, uint32_t options, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, UErrorCode &ec) { + U_UNICODESET_TRACE(); UBool escaped = false; RuleCharacterIterator::Pos position; chars.getPos(position); @@ -351,17 +427,17 @@ void UnicodeSet::parseUnion(RuleCharacterIterator &chars, // | UnescapedHyphenMinus Terms UnescapedHyphenMinus // Terms ::= "" // | Terms Term - char16_t c = chars.next(charsOptions(options), escaped, ec); - if (U_FAILURE(ec)) return; + UChar32 c = chars.next(charsOptions(options), escaped, ec); + U_UNICODESET_RETURN_IF_ERROR(ec); if (!escaped && c == u'-') { add(u'-'); } else { chars.setPos(position); } - for (;;) { + while (!chars.atEnd()) { chars.getPos(position); c = chars.next(charsOptions(options), escaped, ec); - if (U_FAILURE(ec)) return; + U_UNICODESET_RETURN_IF_ERROR(ec); if (!escaped && c == u'-') { // 
We can be here on the first iteration: [--] is allowed by the // grammar and by the old parser. @@ -372,9 +448,8 @@ void UnicodeSet::parseUnion(RuleCharacterIterator &chars, if (!escaped && c == ']') { return; } - if (U_FAILURE(ec)) return; - parseTerm(chars, symbols, rebuiltPat, charsOptions(options), caseClosure, depth, ec); - if (U_FAILURE(ec)) return; + parseTerm(chars, symbols, rebuiltPat, options, caseClosure, depth, ec); + U_UNICODESET_RETURN_IF_ERROR(ec); } } @@ -385,17 +460,20 @@ void UnicodeSet::parseTerm(RuleCharacterIterator &chars, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, UErrorCode &ec) { + U_UNICODESET_TRACE(); UBool escaped = false; RuleCharacterIterator::Pos termStart; chars.getPos(termStart); // Term ::= Elements // | Restriction - char16_t c = chars.next(charsOptions(options), escaped, ec); - if (!escaped && c == '[' || resemblesPropertyPattern(chars, charsOptions(options))) { - chars.setPos(termStart); - parseRestriction(chars, symbols, rebuiltPat, charsOptions(options), caseClosure, depth, ec); - if (U_FAILURE(ec)) return; + const UChar32 ahead = chars.next(charsOptions(options), escaped, ec); + chars.setPos(termStart); + if (!escaped && ahead == '[' || resemblesPropertyPattern(chars, charsOptions(options))) { + parseRestriction(chars, symbols, rebuiltPat, options, caseClosure, depth, ec); + U_UNICODESET_RETURN_IF_ERROR(ec); } else { + parseElements(chars, symbols, rebuiltPat, options, caseClosure, depth, ec); + U_UNICODESET_RETURN_IF_ERROR(ec); } } @@ -405,34 +483,37 @@ void UnicodeSet::parseRestriction(RuleCharacterIterator &chars, uint32_t options, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, UErrorCode &ec) { + U_UNICODESET_TRACE(); UBool escaped = false; // Restriction ::= UnicodeSet // | Intersection ::= Restriction & UnicodeSet // | Difference ::= Restriction - UnicodeSet // Start by parsing the first UnicodeSet. 
- parseUnicodeSet(chars, symbols, rebuiltPat, charsOptions(options), caseClosure, depth + 1, ec); - if (U_FAILURE(ec)) return; + parseUnicodeSet(chars, symbols, rebuiltPat, options, caseClosure, depth + 1, ec); + U_UNICODESET_RETURN_IF_ERROR(ec); // Now keep looking for an operator that would continue the Restriction. + // The loop terminates because when chars.atEnd(), op == DONE, so we go into the else branch and + // return. for (;;) { RuleCharacterIterator::Pos beforeOperator; chars.getPos(beforeOperator); - char16_t c = chars.next(charsOptions(options), escaped, ec); - if (U_FAILURE(ec)) return; - if (!escaped && c == u'&') { + const UChar32 op = chars.next(charsOptions(options), escaped, ec); + U_UNICODESET_RETURN_IF_ERROR(ec); + if (!escaped && op == u'&') { // Intersection ::= Restriction & UnicodeSet UnicodeSet rightHandSide; - rightHandSide.parseUnicodeSet(chars, symbols, rebuiltPat, charsOptions(options), caseClosure, + rightHandSide.parseUnicodeSet(chars, symbols, rebuiltPat, options, caseClosure, depth + 1, ec); - if (U_FAILURE(ec)) return; + U_UNICODESET_RETURN_IF_ERROR(ec); retainAll(rightHandSide); - } else if (!escaped && c == u'-') { + } else if (!escaped && op == u'-') { // Here the grammar requires two tokens of lookahead to figure out whether the - the operator // of a Difference or an UnescapedHyphenMinus in the enclosing Union. RuleCharacterIterator::Pos afterOperator; chars.getPos(afterOperator); - char16_t c = chars.next(charsOptions(options), escaped, ec); - if (U_FAILURE(ec)) return; - if (!escaped && c == u']') { + const UChar32 ahead = chars.next(charsOptions(options), escaped, ec); + U_UNICODESET_RETURN_IF_ERROR(ec); + if (!escaped && ahead == u']') { // The operator is actually an UnescapedHyphenMinus; terminate the Restriction before it. 
chars.setPos(beforeOperator); return; @@ -440,12 +521,12 @@ void UnicodeSet::parseRestriction(RuleCharacterIterator &chars, chars.setPos(afterOperator); // Difference ::= Restriction - UnicodeSet UnicodeSet rightHandSide; - rightHandSide.parseUnicodeSet(chars, symbols, rebuiltPat, charsOptions(options), caseClosure, + rightHandSide.parseUnicodeSet(chars, symbols, rebuiltPat, options, caseClosure, depth + 1, ec); - if (U_FAILURE(ec)) return; + U_UNICODESET_RETURN_IF_ERROR(ec); removeAll(rightHandSide); } else { - // Not an operator. + // Not an operator, end of the Restriction. chars.setPos(beforeOperator); return; } @@ -459,8 +540,89 @@ void UnicodeSet::parseElements(RuleCharacterIterator &chars, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, UErrorCode &ec) { + U_UNICODESET_TRACE(); + // Elements ::= Element + // | Range + // Range ::= RangeElement - RangeElement + // RangeElement ::= literal-element + // | escaped-element + // Element ::= RangeElement + // | string-literal UBool escaped = false; - + const UChar32 first = chars.next(charsOptions(options), escaped, ec); + U_UNICODESET_RETURN_IF_ERROR(ec); + if (!escaped) { + switch (first) { + case u'-': + case u'&': + case u'[': + case u']': + case u'^': + U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement | string-literal", first, chars, ec); + case u'{': { + UnicodeString string; + UChar32 c; + while (!chars.atEnd()) { + c = chars.next(charsOptions(options), escaped, ec); + U_UNICODESET_RETURN_IF_ERROR(ec); + if (!escaped && c == u'}') { + add(string); + return; + } + string.append(c); + } + U_UNICODESET_RETURN_WITH_PARSE_ERROR("}", RuleCharacterIterator::DONE, chars, ec); + } + case u'}': + // Disallowed by UTS #61, but historically accepted by ICU. This is an extension. 
+ default: + break; + } + } + RuleCharacterIterator::Pos beforeOperator; + chars.getPos(beforeOperator); + const UChar32 op = chars.next(charsOptions(options), escaped, ec); + U_UNICODESET_RETURN_IF_ERROR(ec); + if (escaped || op != u'-') { + // No operator, + // Elements ::= Element + chars.setPos(beforeOperator); + add(first); + return; + } + // Here the grammar requires two tokens of lookahead to figure out whether the - the operator + // of a Range or an UnescapedHyphenMinus in the enclosing Union. + const UChar32 ahead = chars.next(charsOptions(options), escaped, ec); + U_UNICODESET_RETURN_IF_ERROR(ec); + if (!escaped && ahead == u']') { + // The operator is actually an UnescapedHyphenMinus; terminate the Elements before it. + chars.setPos(beforeOperator); + add(first); + return; + } + const UChar32 last = ahead; + U_UNICODESET_RETURN_IF_ERROR(ec); + if (!escaped) { + switch (last) { + case u'-': + case u'&': + case u'[': + case u']': + case u'^': + case u'{': + U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, chars, ec); + case u'}': + // Disallowed by UTS #61, but historically accepted by ICU. This is an extension. 
+ default: + break; + } + } + if (last <= first) { + U_UNICODESET_RETURN_WITH_PARSE_ERROR("first < last in Range", + UnicodeString(last) + u"-" + UnicodeString(first), chars, ec); + } + add(first, last); + return; } #if 0 From 07ab1c1c4791655c46e02449fc3e44935822b081 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 13 Aug 2025 21:03:33 +0200 Subject: [PATCH 04/56] ICU-22851 Test the exact behaviour of UnicodeSet::toPattern --- icu4c/source/test/intltest/usettest.cpp | 44 +++++++++++++++++++++++++ icu4c/source/test/intltest/usettest.h | 1 + 2 files changed, 45 insertions(+) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index da32687987e8..fa9c00897865 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestRangeIterator); TESTCASE_AUTO(TestStringIterator); TESTCASE_AUTO(TestElementIterator); + TESTCASE_AUTO(TestToPatternOutput); TESTCASE_AUTO(TestParseErrors); TESTCASE_AUTO_END; } @@ -4336,6 +4337,49 @@ void UnicodeSetTest::TestElementIterator() { // in a header-only unit test file. } +void UnicodeSetTest::TestToPatternOutput() { + for (const auto [expression, expected] : + std::vector>{ + // For a UnicodeSet which is not a property-query nor a named-element and without any + // Restriction among its Terms (that is, whose Union consists solely a sequence of Elements + // UnescapedHyphenMinus), toPattern merges and sorts ranges, and introduces a complement to + // minimize the result. 
+ {u"[c-za-b]", u"[a-z]"}, + {u"[ c - z a - b ]", u"[a-z]"}, + {uR"([ ^ \u0000-b d-\U0010FFFF ])", u"[c]"}, + {uR"([ \u0000-b d-\U0010FFFF ])", u"[^c]"}, + {u"[ - - ]", uR"([\-])"}, + {u"[ - _ - ]", uR"([\-_])"}, + {u"[ - + - ]", uR"([+\-])"}, + // A property-query or named-element is kept as-is: + {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"}, + {uR"(\p{P})", uR"(\p{P})"}, + {uR"(\p{gc=P})", uR"(\p{gc=P})"}, + {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"}, + {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"}, + {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"}, + // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are + // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped. + // This is applied recursively, so innermost ranges-only UnicodeSets get normalized. + {u"[ c-z a-b [c-f g-z] ]", u"[c-za-b[c-z]]"}, + {u"[- + c-z a-b [c-f g-z] -]", uR"([\-+c-za-b[c-z]-])"}, + {uR"([ c-z a-b \p{ General_Category = Punctuation } ])", + uR"([c-za-b\p{ General_Category = Punctuation }])"}, + {u"[^[c]]", uR"([^[c]])"}, + {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"}, + }) { + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, errorCode); + UnicodeString actual; + if (U_FAILURE(errorCode)) { + errln(u"Failed to parse " + expression + u": " + u_errorName(errorCode)); + } else if (set.toPattern(actual) != expected) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expected + ", got " + + actual); + } + } +} + void UnicodeSetTest::TestParseErrors() { for (const auto expression : std::vector{ // Java error message: "Char expected after operator". 
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index 4c5b55a329bb..692aa8b9e84d 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -110,6 +110,7 @@ class UnicodeSetTest: public IntlTest { void TestStringIterator(); void TestElementIterator(); + void TestToPatternOutput(); void TestParseErrors(); private: From b489fa0622a03a021877268197d95427bbe160a4 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 13 Aug 2025 23:10:20 +0200 Subject: [PATCH 05/56] call it a day --- icu4c/source/common/unicode/uniset.h | 18 +++++--- icu4c/source/common/uniset_closure.cpp | 2 +- icu4c/source/common/uniset_props.cpp | 58 +++++++++++++++++--------- 3 files changed, 51 insertions(+), 27 deletions(-) diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index c4c96154fca2..96a9f4f9f749 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -1696,7 +1696,8 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { const SymbolTable* symbols, UErrorCode& status); - void applyPattern(RuleCharacterIterator &chars, + void applyPattern(const UnicodeString &pattern, + RuleCharacterIterator &chars, const SymbolTable *symbols, UnicodeString &rebuiltPat, uint32_t options, @@ -1708,7 +1709,8 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { // applied). They add to *this the elements of the set that the parsed construct represents. // https://www.unicode.org/reports/tr61/tr61-1.html#Set-Operations. 
- void parseUnicodeSet(RuleCharacterIterator &chars, + void parseUnicodeSet(const UnicodeString &pattern, + RuleCharacterIterator &chars, const SymbolTable *symbols, UnicodeString &rebuiltPat, uint32_t options, @@ -1716,7 +1718,8 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { int32_t depth, UErrorCode &ec); - void parseUnion(RuleCharacterIterator &chars, + void parseUnion(const UnicodeString &pattern, + RuleCharacterIterator &chars, const SymbolTable *symbols, UnicodeString &rebuiltPat, uint32_t options, @@ -1724,7 +1727,8 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { int32_t depth, UErrorCode &ec); - void parseTerm(RuleCharacterIterator &chars, + void parseTerm(const UnicodeString &pattern, + RuleCharacterIterator &chars, const SymbolTable *symbols, UnicodeString &rebuiltPat, uint32_t options, @@ -1732,7 +1736,8 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { int32_t depth, UErrorCode &ec); - void parseRestriction(RuleCharacterIterator &chars, + void parseRestriction(const UnicodeString &pattern, + RuleCharacterIterator &chars, const SymbolTable *symbols, UnicodeString &rebuiltPat, uint32_t options, @@ -1740,7 +1745,8 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { int32_t depth, UErrorCode &ec); - void parseElements(RuleCharacterIterator &chars, + void parseElements(const UnicodeString &pattern, + RuleCharacterIterator &chars, const SymbolTable *symbols, UnicodeString &rebuiltPat, uint32_t options, diff --git a/icu4c/source/common/uniset_closure.cpp b/icu4c/source/common/uniset_closure.cpp index 251276adaddb..05e9b0a37e04 100644 --- a/icu4c/source/common/uniset_closure.cpp +++ b/icu4c/source/common/uniset_closure.cpp @@ -101,7 +101,7 @@ UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, // _applyPattern calls add() etc., which set pat to empty. 
UnicodeString rebuiltPat; RuleCharacterIterator chars(pattern, symbols, pos); - applyPattern(chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status); + applyPattern(pattern, chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status); if (U_FAILURE(status)) return *this; if (chars.inVariable()) { // syntaxError(chars, "Extra chars in variable value"); diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 054a1a932d03..fa3c9070831b 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -196,7 +196,7 @@ UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern, // _applyPattern calls add() etc., which set pat to empty. UnicodeString rebuiltPat; RuleCharacterIterator chars(pattern, symbols, pos); - applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, status); + applyPattern(pattern, chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, status); if (U_FAILURE(status)) return; if (chars.inVariable()) { // syntaxError(chars, "Extra chars in variable value"); @@ -296,10 +296,12 @@ constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) { #define U_UNICODESET_RETURN_IF_ERROR(ec) \ do { \ + constexpr std::string_view functionName = __func__;\ + static_assert (functionName.substr(0, 5) == "parse");\ if (U_FAILURE(ec)) { \ if (depth < 5) { \ - printf("--- at %s l. %d\n", __func__, __LINE__); \ - } else if (depth == 5 && std::string_view(__func__) == "parseUnicodeSet") { \ + printf("--- in %s l. 
%d\n", __func__+5, __LINE__); \ + } else if (depth == 5 && std::string_view(__func__+5) == "UnicodeSet") { \ printf("--- [...]\n"); \ } \ return; \ @@ -307,13 +309,22 @@ constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) { } while (false) #define U_UNICODESET_RETURN_WITH_PARSE_ERROR(expected, actual, chars, ec) \ do { \ + constexpr std::string_view functionName = __func__; \ + static_assert(functionName.substr(0, 5) == "parse");\ std::string actualUTF8; \ UnicodeString ahead; \ std::string aheadUTF8; \ - printf("*** Expected %s, got '%s' %s\n", (expected), \ + std::string behindUTF8; \ + (chars).lookahead(ahead); \ + printf("*** Expected %s, got '%s' %s☜%s\n", (expected), \ UnicodeString(actual).toUTF8String(actualUTF8).c_str(), \ - (chars).lookahead(ahead, 60).toUTF8String(aheadUTF8).c_str()); \ - printf("--- at %s l. %d\n", __func__, __LINE__); \ + pattern.tempSubString(0, pattern.length() - ahead.length()) \ + .toUTF8String(behindUTF8) \ + .c_str(), \ + pattern.tempSubString(pattern.length() - ahead.length(), 60) \ + .toUTF8String(aheadUTF8) \ + .c_str()); \ + printf("--- in %s l. %d\n", __func__ + 5, __LINE__); \ (ec) = U_MALFORMED_SET; \ return; \ } while (false) @@ -323,6 +334,7 @@ constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) { /** * Parse the pattern from the given RuleCharacterIterator. The * iterator is advanced over the parsed pattern. + * @param pattern The pattern, only used by debug traces. * @param chars iterator over the pattern characters. Upon return * it will be advanced to the first character after the parsed * pattern, or the end of the iteration if all characters are @@ -335,7 +347,8 @@ constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) { * IGNORE_SPACE, CASE. 
*/ -void UnicodeSet::applyPattern(RuleCharacterIterator &chars, +void UnicodeSet::applyPattern(const UnicodeString &pattern, + RuleCharacterIterator &chars, const SymbolTable *symbols, UnicodeString &rebuiltPat, uint32_t options, @@ -343,11 +356,12 @@ void UnicodeSet::applyPattern(RuleCharacterIterator &chars, UErrorCode &ec) { if (U_FAILURE(ec)) return; clear(); - parseUnicodeSet(chars, symbols, rebuiltPat, options, caseClosure, /*depth=*/0, ec); + parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, /*depth=*/0, ec); _generatePattern(rebuiltPat, false); } -void UnicodeSet::parseUnicodeSet(RuleCharacterIterator& chars, +void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern, + RuleCharacterIterator &chars, const SymbolTable* symbols, UnicodeString& rebuiltPat, uint32_t options, @@ -388,7 +402,7 @@ void UnicodeSet::parseUnicodeSet(RuleCharacterIterator& chars, } else { chars.setPos(afterBracket); } - parseUnion(chars, symbols, rebuiltPat, options, caseClosure, depth, ec); + parseUnion(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec); U_UNICODESET_RETURN_IF_ERROR(ec); c = chars.next(charsOptions(options), escaped, ec); U_UNICODESET_RETURN_IF_ERROR(ec); @@ -411,7 +425,8 @@ void UnicodeSet::parseUnicodeSet(RuleCharacterIterator& chars, } } -void UnicodeSet::parseUnion(RuleCharacterIterator &chars, +void UnicodeSet::parseUnion(const UnicodeString &pattern, + RuleCharacterIterator &chars, const SymbolTable *symbols, UnicodeString &rebuiltPat, uint32_t options, @@ -448,12 +463,13 @@ void UnicodeSet::parseUnion(RuleCharacterIterator &chars, if (!escaped && c == ']') { return; } - parseTerm(chars, symbols, rebuiltPat, options, caseClosure, depth, ec); + parseTerm(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec); U_UNICODESET_RETURN_IF_ERROR(ec); } } -void UnicodeSet::parseTerm(RuleCharacterIterator &chars, +void UnicodeSet::parseTerm(const UnicodeString &pattern, + RuleCharacterIterator &chars, const 
SymbolTable *symbols, UnicodeString &rebuiltPat, uint32_t options, @@ -469,15 +485,16 @@ void UnicodeSet::parseTerm(RuleCharacterIterator &chars, const UChar32 ahead = chars.next(charsOptions(options), escaped, ec); chars.setPos(termStart); if (!escaped && ahead == '[' || resemblesPropertyPattern(chars, charsOptions(options))) { - parseRestriction(chars, symbols, rebuiltPat, options, caseClosure, depth, ec); + parseRestriction(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec); U_UNICODESET_RETURN_IF_ERROR(ec); } else { - parseElements(chars, symbols, rebuiltPat, options, caseClosure, depth, ec); + parseElements(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec); U_UNICODESET_RETURN_IF_ERROR(ec); } } -void UnicodeSet::parseRestriction(RuleCharacterIterator &chars, +void UnicodeSet::parseRestriction(const UnicodeString &pattern, + RuleCharacterIterator &chars, const SymbolTable *symbols, UnicodeString &rebuiltPat, uint32_t options, @@ -489,7 +506,7 @@ void UnicodeSet::parseRestriction(RuleCharacterIterator &chars, // | Intersection ::= Restriction & UnicodeSet // | Difference ::= Restriction - UnicodeSet // Start by parsing the first UnicodeSet. - parseUnicodeSet(chars, symbols, rebuiltPat, options, caseClosure, depth + 1, ec); + parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth + 1, ec); U_UNICODESET_RETURN_IF_ERROR(ec); // Now keep looking for an operator that would continue the Restriction. 
// The loop terminates because when chars.atEnd(), op == DONE, so we go into the else branch and @@ -502,7 +519,7 @@ void UnicodeSet::parseRestriction(RuleCharacterIterator &chars, if (!escaped && op == u'&') { // Intersection ::= Restriction & UnicodeSet UnicodeSet rightHandSide; - rightHandSide.parseUnicodeSet(chars, symbols, rebuiltPat, options, caseClosure, + rightHandSide.parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth + 1, ec); U_UNICODESET_RETURN_IF_ERROR(ec); retainAll(rightHandSide); @@ -521,7 +538,7 @@ void UnicodeSet::parseRestriction(RuleCharacterIterator &chars, chars.setPos(afterOperator); // Difference ::= Restriction - UnicodeSet UnicodeSet rightHandSide; - rightHandSide.parseUnicodeSet(chars, symbols, rebuiltPat, options, caseClosure, + rightHandSide.parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth + 1, ec); U_UNICODESET_RETURN_IF_ERROR(ec); removeAll(rightHandSide); @@ -533,7 +550,8 @@ void UnicodeSet::parseRestriction(RuleCharacterIterator &chars, } } -void UnicodeSet::parseElements(RuleCharacterIterator &chars, +void UnicodeSet::parseElements(const UnicodeString &pattern, + RuleCharacterIterator &chars, const SymbolTable *symbols, UnicodeString &rebuiltPat, uint32_t options, From e147b1564bc1ed06a1d48fcfa95d8a0fdda9d5ad Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 14 Aug 2025 14:39:35 +0200 Subject: [PATCH 06/56] Pattern-rebuilding logic --- icu4c/source/common/unicode/uniset.h | 2 ++ icu4c/source/common/uniset_props.cpp | 48 +++++++++++++++++++++++----- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index 96a9f4f9f749..2d73df2fcdac 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -1725,6 +1725,7 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { uint32_t options, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), 
int32_t depth, + bool &containsRestrictions, UErrorCode &ec); void parseTerm(const UnicodeString &pattern, @@ -1734,6 +1735,7 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { uint32_t options, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, + bool &containsRestriction, UErrorCode &ec); void parseRestriction(const UnicodeString &pattern, diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index fa3c9070831b..450c93712520 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -355,9 +355,7 @@ void UnicodeSet::applyPattern(const UnicodeString &pattern, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), UErrorCode &ec) { if (U_FAILURE(ec)) return; - clear(); parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, /*depth=*/0, ec); - _generatePattern(rebuiltPat, false); } void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern, @@ -367,6 +365,7 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern, uint32_t options, UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, UErrorCode &ec) { + clear(); U_UNICODESET_TRACE(); if (depth > MAX_DEPTH) { @@ -375,14 +374,21 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern, } bool isComplement = false; + // Whether to keep the syntax of the pattern at this level, only doing basic pretty-printing, e.g., + // turn [ c - z[a]a - b ] into [c-z[a]a-b], but not into [a-z]. + // This is true for a property query, or when there is a nested set. Note that since we recurse, + // innermost sets consisting only of ranges will get simplified. 
+ bool preserveSyntaxInPattern = false; + UnicodeString syntacticallyFaithfulPattern; if (resemblesPropertyPattern(chars, charsOptions(options))) { // UnicodeSet ::= property-query | named-element U_UNICODESET_TRACE("property-query | named-element"); chars.skipIgnored(charsOptions(options)); UnicodeSet propertyQuery; - propertyQuery.applyPropertyPattern(chars, rebuiltPat, ec); + propertyQuery.applyPropertyPattern(chars, syntacticallyFaithfulPattern, ec); U_UNICODESET_RETURN_IF_ERROR(ec); addAll(propertyQuery); + preserveSyntaxInPattern = true; } else { UBool escaped = false; // TODO(egg): In PD UTS 61, add ^ to set-operator, remove [^. @@ -393,22 +399,26 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern, if (escaped || c != u'[') { U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)", c, chars, ec); } + syntacticallyFaithfulPattern.append(u'['); RuleCharacterIterator::Pos afterBracket; chars.getPos(afterBracket); c = chars.next(charsOptions(options), escaped, ec); U_UNICODESET_RETURN_IF_ERROR(ec); if (!escaped && c == u'^') { + syntacticallyFaithfulPattern.append(u'^'); isComplement = true; } else { chars.setPos(afterBracket); } - parseUnion(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec); + parseUnion(pattern, chars, symbols, syntacticallyFaithfulPattern, options, caseClosure, depth, + /*containsRestrictions=*/preserveSyntaxInPattern, ec); U_UNICODESET_RETURN_IF_ERROR(ec); c = chars.next(charsOptions(options), escaped, ec); U_UNICODESET_RETURN_IF_ERROR(ec); if (escaped || c != u']') { U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", c, chars, ec); } + syntacticallyFaithfulPattern.append(u']'); } /** @@ -423,6 +433,11 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern, if (isComplement) { complement().removeAllStrings(); // code point complement } + if (preserveSyntaxInPattern) { + rebuiltPat.append(syntacticallyFaithfulPattern); + } else { + _generatePattern(rebuiltPat, /*escapeUnprintable=*/false); + } } void 
UnicodeSet::parseUnion(const UnicodeString &pattern, @@ -430,7 +445,9 @@ void UnicodeSet::parseUnion(const UnicodeString &pattern, const SymbolTable *symbols, UnicodeString &rebuiltPat, uint32_t options, - UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, + UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), + int32_t depth, + bool &containsRestrictions, UErrorCode &ec) { U_UNICODESET_TRACE(); UBool escaped = false; @@ -446,6 +463,9 @@ void UnicodeSet::parseUnion(const UnicodeString &pattern, U_UNICODESET_RETURN_IF_ERROR(ec); if (!escaped && c == u'-') { add(u'-'); + // When we otherwise preserve the syntax, we escape an initial UnescapedHyphenMinus, but not a + // final one, for consistency with older ICU behaviour. + rebuiltPat.append(u"\\-"); } else { chars.setPos(position); } @@ -456,6 +476,7 @@ void UnicodeSet::parseUnion(const UnicodeString &pattern, if (!escaped && c == u'-') { // We can be here on the first iteration: [--] is allowed by the // grammar and by the old parser. 
+ rebuiltPat.append(u'-'); add(u'-'); return; } @@ -463,7 +484,8 @@ void UnicodeSet::parseUnion(const UnicodeString &pattern, if (!escaped && c == ']') { return; } - parseTerm(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec); + parseTerm(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, containsRestrictions, + ec); U_UNICODESET_RETURN_IF_ERROR(ec); } } @@ -475,6 +497,7 @@ void UnicodeSet::parseTerm(const UnicodeString &pattern, uint32_t options, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, + bool &containsRestriction, UErrorCode &ec) { U_UNICODESET_TRACE(); UBool escaped = false; @@ -485,6 +508,7 @@ void UnicodeSet::parseTerm(const UnicodeString &pattern, const UChar32 ahead = chars.next(charsOptions(options), escaped, ec); chars.setPos(termStart); if (!escaped && ahead == '[' || resemblesPropertyPattern(chars, charsOptions(options))) { + containsRestriction = true; parseRestriction(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec); U_UNICODESET_RETURN_IF_ERROR(ec); } else { @@ -506,7 +530,10 @@ void UnicodeSet::parseRestriction(const UnicodeString &pattern, // | Intersection ::= Restriction & UnicodeSet // | Difference ::= Restriction - UnicodeSet // Start by parsing the first UnicodeSet. - parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth + 1, ec); + UnicodeSet leftHandSide; + leftHandSide.parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth + 1, + ec); + addAll(leftHandSide); U_UNICODESET_RETURN_IF_ERROR(ec); // Now keep looking for an operator that would continue the Restriction. 
// The loop terminates because when chars.atEnd(), op == DONE, so we go into the else branch and @@ -518,6 +545,7 @@ void UnicodeSet::parseRestriction(const UnicodeString &pattern, U_UNICODESET_RETURN_IF_ERROR(ec); if (!escaped && op == u'&') { // Intersection ::= Restriction & UnicodeSet + rebuiltPat.append(u'&'); UnicodeSet rightHandSide; rightHandSide.parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth + 1, ec); @@ -537,6 +565,7 @@ void UnicodeSet::parseRestriction(const UnicodeString &pattern, } chars.setPos(afterOperator); // Difference ::= Restriction - UnicodeSet + rebuiltPat.append(u'-'); UnicodeSet rightHandSide; rightHandSide.parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth + 1, ec); @@ -597,6 +626,7 @@ void UnicodeSet::parseElements(const UnicodeString &pattern, break; } } + _appendToPat(rebuiltPat, first, /*escapeUnprintable=*/false); RuleCharacterIterator::Pos beforeOperator; chars.getPos(beforeOperator); const UChar32 op = chars.next(charsOptions(options), escaped, ec); @@ -618,8 +648,9 @@ void UnicodeSet::parseElements(const UnicodeString &pattern, add(first); return; } + // Elements ::= Range ::= RangeElement - RangeElement + rebuiltPat.append(u'-'); const UChar32 last = ahead; - U_UNICODESET_RETURN_IF_ERROR(ec); if (!escaped) { switch (last) { case u'-': @@ -635,6 +666,7 @@ void UnicodeSet::parseElements(const UnicodeString &pattern, break; } } + _appendToPat(rebuiltPat, last, /*escapeUnprintable=*/false); if (last <= first) { U_UNICODESET_RETURN_WITH_PARSE_ERROR("first < last in Range", UnicodeString(last) + u"-" + UnicodeString(first), chars, ec); From f47cac412d7ea174fc336e1ab28a8dd0c1623fa8 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 14 Aug 2025 14:48:52 +0200 Subject: [PATCH 07/56] More tests of toPattern --- icu4c/source/test/intltest/usettest.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/icu4c/source/test/intltest/usettest.cpp 
b/icu4c/source/test/intltest/usettest.cpp index fa9c00897865..0262aca5b0ca 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -4351,11 +4351,14 @@ void UnicodeSetTest::TestToPatternOutput() { {u"[ - - ]", uR"([\-])"}, {u"[ - _ - ]", uR"([\-_])"}, {u"[ - + - ]", uR"([+\-])"}, + {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"}, + {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"}, // A property-query or named-element is kept as-is: {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"}, {uR"(\p{P})", uR"(\p{P})"}, {uR"(\p{gc=P})", uR"(\p{gc=P})"}, {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"}, + {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"}, {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"}, {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"}, // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are @@ -4367,6 +4370,14 @@ void UnicodeSetTest::TestToPatternOutput() { uR"([c-za-b\p{ General_Category = Punctuation }])"}, {u"[^[c]]", uR"([^[c]])"}, {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"}, + // Spaces are eliminated within a string-literal even when the syntax is preserved. + {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"}, + // Escapes are removed even when the syntax is preserved. + {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])", + u"[{Zeichenkette}[]Zeichenmenge]"}, + // A named-element is currently a nested set, so it is preserved and causes the syntax to be + // preserved. 
+ {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"}, }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode); From e3efa59d4940d63b3280cca3124de39aba4b9709 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 13 Aug 2025 21:03:33 +0200 Subject: [PATCH 08/56] ICU-22851 Test the exact behaviour of UnicodeSet::toPattern --- icu4c/source/test/intltest/usettest.cpp | 55 +++++++++++++++++++++++++ icu4c/source/test/intltest/usettest.h | 1 + 2 files changed, 56 insertions(+) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index da32687987e8..0262aca5b0ca 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestRangeIterator); TESTCASE_AUTO(TestStringIterator); TESTCASE_AUTO(TestElementIterator); + TESTCASE_AUTO(TestToPatternOutput); TESTCASE_AUTO(TestParseErrors); TESTCASE_AUTO_END; } @@ -4336,6 +4337,60 @@ void UnicodeSetTest::TestElementIterator() { // in a header-only unit test file. } +void UnicodeSetTest::TestToPatternOutput() { + for (const auto [expression, expected] : + std::vector>{ + // For a UnicodeSet which is not a property-query nor a named-element and without any + // Restriction among its Terms (that is, whose Union consists solely of a sequence of Elements + // and UnescapedHyphenMinus), toPattern merges and sorts ranges, and introduces a complement to + // minimize the result.
+ {u"[c-za-b]", u"[a-z]"}, + {u"[ c - z a - b ]", u"[a-z]"}, + {uR"([ ^ \u0000-b d-\U0010FFFF ])", u"[c]"}, + {uR"([ \u0000-b d-\U0010FFFF ])", u"[^c]"}, + {u"[ - - ]", uR"([\-])"}, + {u"[ - _ - ]", uR"([\-_])"}, + {u"[ - + - ]", uR"([+\-])"}, + {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"}, + {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"}, + // A property-query or named-element is kept as-is: + {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"}, + {uR"(\p{P})", uR"(\p{P})"}, + {uR"(\p{gc=P})", uR"(\p{gc=P})"}, + {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"}, + {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"}, + {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"}, + {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"}, + // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are + // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped. + // This is applied recursively, so innermost ranges-only UnicodeSets get normalized. + {u"[ c-z a-b [c-f g-z] ]", u"[c-za-b[c-z]]"}, + {u"[- + c-z a-b [c-f g-z] -]", uR"([\-+c-za-b[c-z]-])"}, + {uR"([ c-z a-b \p{ General_Category = Punctuation } ])", + uR"([c-za-b\p{ General_Category = Punctuation }])"}, + {u"[^[c]]", uR"([^[c]])"}, + {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"}, + // Spaces are eliminated within a string-literal even when the syntax is preserved. + {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"}, + // Escapes are removed even when the syntax is preserved. + {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])", + u"[{Zeichenkette}[]Zeichenmenge]"}, + // A named-element is currently a nested set, so it is preserved and causes the syntax to be + // preserved. 
+ {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"}, + }) { + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, errorCode); + UnicodeString actual; + if (U_FAILURE(errorCode)) { + errln(u"Failed to parse " + expression + u": " + u_errorName(errorCode)); + } else if (set.toPattern(actual) != expected) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expected + ", got " + + actual); + } + } +} + void UnicodeSetTest::TestParseErrors() { for (const auto expression : std::vector{ // Java error message: "Char expected after operator". diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index 4c5b55a329bb..692aa8b9e84d 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -110,6 +110,7 @@ class UnicodeSetTest: public IntlTest { void TestStringIterator(); void TestElementIterator(); + void TestToPatternOutput(); void TestParseErrors(); private: From a7a403581e2056450fad2ed77bf00f75d14b29f8 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 14 Aug 2025 15:01:03 +0200 Subject: [PATCH 09/56] Print strings --- icu4c/source/common/uniset_props.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 450c93712520..562ce16db9f0 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -379,13 +379,14 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern, // This is true for a property query, or when there is a nested set. Note that since we recurse, // innermost sets consisting only of ranges will get simplified. bool preserveSyntaxInPattern = false; - UnicodeString syntacticallyFaithfulPattern; + // A pattern that preserves the original syntax but strips spaces, normalizes escaping, etc. 
+ UnicodeString prettyPrintedPattern; if (resemblesPropertyPattern(chars, charsOptions(options))) { // UnicodeSet ::= property-query | named-element U_UNICODESET_TRACE("property-query | named-element"); chars.skipIgnored(charsOptions(options)); UnicodeSet propertyQuery; - propertyQuery.applyPropertyPattern(chars, syntacticallyFaithfulPattern, ec); + propertyQuery.applyPropertyPattern(chars, prettyPrintedPattern, ec); U_UNICODESET_RETURN_IF_ERROR(ec); addAll(propertyQuery); preserveSyntaxInPattern = true; @@ -399,18 +400,18 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern, if (escaped || c != u'[') { U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)", c, chars, ec); } - syntacticallyFaithfulPattern.append(u'['); + prettyPrintedPattern.append(u'['); RuleCharacterIterator::Pos afterBracket; chars.getPos(afterBracket); c = chars.next(charsOptions(options), escaped, ec); U_UNICODESET_RETURN_IF_ERROR(ec); if (!escaped && c == u'^') { - syntacticallyFaithfulPattern.append(u'^'); + prettyPrintedPattern.append(u'^'); isComplement = true; } else { chars.setPos(afterBracket); } - parseUnion(pattern, chars, symbols, syntacticallyFaithfulPattern, options, caseClosure, depth, + parseUnion(pattern, chars, symbols, prettyPrintedPattern, options, caseClosure, depth, /*containsRestrictions=*/preserveSyntaxInPattern, ec); U_UNICODESET_RETURN_IF_ERROR(ec); c = chars.next(charsOptions(options), escaped, ec); @@ -418,7 +419,7 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern, if (escaped || c != u']') { U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", c, chars, ec); } - syntacticallyFaithfulPattern.append(u']'); + prettyPrintedPattern.append(u']'); } /** @@ -434,7 +435,7 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern, complement().removeAllStrings(); // code point complement } if (preserveSyntaxInPattern) { - rebuiltPat.append(syntacticallyFaithfulPattern); + rebuiltPat.append(prettyPrintedPattern); } else { 
_generatePattern(rebuiltPat, /*escapeUnprintable=*/false); } @@ -607,15 +608,18 @@ void UnicodeSet::parseElements(const UnicodeString &pattern, case u'^': U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement | string-literal", first, chars, ec); case u'{': { + rebuiltPat.append(u'{'); UnicodeString string; UChar32 c; while (!chars.atEnd()) { c = chars.next(charsOptions(options), escaped, ec); U_UNICODESET_RETURN_IF_ERROR(ec); if (!escaped && c == u'}') { + rebuiltPat.append(u'}'); add(string); return; } + _appendToPat(rebuiltPat, c, /*escapeUnprintable=*/false); string.append(c); } U_UNICODESET_RETURN_WITH_PARSE_ERROR("}", RuleCharacterIterator::DONE, chars, ec); From cef298e85d270f946da33c7a64d787c64eb4a004 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 14 Aug 2025 15:03:59 +0200 Subject: [PATCH 10/56] Appease the warnings even though these are string_views --- icu4c/source/test/intltest/usettest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 0262aca5b0ca..3b0e1dc32fe1 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -4338,7 +4338,7 @@ void UnicodeSetTest::TestElementIterator() { } void UnicodeSetTest::TestToPatternOutput() { - for (const auto [expression, expected] : + for (const auto &[expression, expected] : std::vector>{ // For a UnicodeSet which is not a property-query nor a named-element and without any // Restriction among its Terms (that is, whose Union consists solely a sequence of Elements From b4e365b91e47287f320e2eeb3fdd675cf62992e1 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 13 Aug 2025 21:03:33 +0200 Subject: [PATCH 11/56] ICU-22851 Test the exact behaviour of UnicodeSet::toPattern --- icu4c/source/test/intltest/usettest.cpp | 55 +++++++++++++++++++++++++ icu4c/source/test/intltest/usettest.h | 1 + 2 files changed, 56 insertions(+) diff --git 
a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index da32687987e8..3b0e1dc32fe1 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestRangeIterator); TESTCASE_AUTO(TestStringIterator); TESTCASE_AUTO(TestElementIterator); + TESTCASE_AUTO(TestToPatternOutput); TESTCASE_AUTO(TestParseErrors); TESTCASE_AUTO_END; } @@ -4336,6 +4337,60 @@ void UnicodeSetTest::TestElementIterator() { // in a header-only unit test file. } +void UnicodeSetTest::TestToPatternOutput() { + for (const auto &[expression, expected] : + std::vector>{ + // For a UnicodeSet which is not a property-query nor a named-element and without any + // Restriction among its Terms (that is, whose Union consists solely of a sequence of Elements + // and UnescapedHyphenMinus), toPattern merges and sorts ranges, and introduces a complement to + // minimize the result.
+ {u"[c-za-b]", u"[a-z]"}, + {u"[ c - z a - b ]", u"[a-z]"}, + {uR"([ ^ \u0000-b d-\U0010FFFF ])", u"[c]"}, + {uR"([ \u0000-b d-\U0010FFFF ])", u"[^c]"}, + {u"[ - - ]", uR"([\-])"}, + {u"[ - _ - ]", uR"([\-_])"}, + {u"[ - + - ]", uR"([+\-])"}, + {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"}, + {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"}, + // A property-query or named-element is kept as-is: + {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"}, + {uR"(\p{P})", uR"(\p{P})"}, + {uR"(\p{gc=P})", uR"(\p{gc=P})"}, + {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"}, + {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"}, + {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"}, + {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"}, + // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are + // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped. + // This is applied recursively, so innermost ranges-only UnicodeSets get normalized. + {u"[ c-z a-b [c-f g-z] ]", u"[c-za-b[c-z]]"}, + {u"[- + c-z a-b [c-f g-z] -]", uR"([\-+c-za-b[c-z]-])"}, + {uR"([ c-z a-b \p{ General_Category = Punctuation } ])", + uR"([c-za-b\p{ General_Category = Punctuation }])"}, + {u"[^[c]]", uR"([^[c]])"}, + {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"}, + // Spaces are eliminated within a string-literal even when the syntax is preserved. + {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"}, + // Escapes are removed even when the syntax is preserved. + {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])", + u"[{Zeichenkette}[]Zeichenmenge]"}, + // A named-element is currently a nested set, so it is preserved and causes the syntax to be + // preserved. 
+ {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"}, + }) { + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, errorCode); + UnicodeString actual; + if (U_FAILURE(errorCode)) { + errln(u"Failed to parse " + expression + u": " + u_errorName(errorCode)); + } else if (set.toPattern(actual) != expected) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expected + ", got " + + actual); + } + } +} + void UnicodeSetTest::TestParseErrors() { for (const auto expression : std::vector{ // Java error message: "Char expected after operator". diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index 4c5b55a329bb..692aa8b9e84d 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -110,6 +110,7 @@ class UnicodeSetTest: public IntlTest { void TestStringIterator(); void TestElementIterator(); + void TestToPatternOutput(); void TestParseErrors(); private: From cef20932f17429a5dab01a14db63266577b075b7 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 14 Aug 2025 15:51:47 +0200 Subject: [PATCH 12/56] ICU-22851 Test various edge cases with $ in the absence of variables --- icu4c/source/test/intltest/usettest.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 3b0e1dc32fe1..55a23782337a 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -4353,6 +4353,10 @@ void UnicodeSetTest::TestToPatternOutput() { {u"[ - + - ]", uR"([+\-])"}, {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"}, {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"}, + {u"[$d-za-c]", uR"([\$a-z])"}, + {u"[a-c$d-z]", uR"([\$a-z])"}, + {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"}, + {u"[!-$z]", uR"([!-\$z])"}, // A property-query or 
named-element is kept as-is: {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"}, {uR"(\p{P})", uR"(\p{P})"}, @@ -4378,6 +4382,8 @@ void UnicodeSetTest::TestToPatternOutput() { // A named-element is currently a nested set, so it is preserved and causes the syntax to be // preserved. {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"}, + // An anchor also causes the syntax to be preserved. + {u"[ d-z a-c $ ]", u"[d-za-c$]"}, }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode); @@ -4416,6 +4422,7 @@ void UnicodeSetTest::TestParseErrors() { u"[{aa]", // "Unquoted '$'". u"[a-$]", + u"[!-$]", // "Invalid range". u"[a-a]", // TODO(egg): Exclude in PDUTS61. u"[z-a]", From 9e126dda9735aef35de47882c4bbaca791eaed65 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 14 Aug 2025 16:13:23 +0200 Subject: [PATCH 13/56] $ handling --- icu4c/source/common/uniset_props.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 562ce16db9f0..6054c32c2283 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -480,6 +480,18 @@ void UnicodeSet::parseUnion(const UnicodeString &pattern, rebuiltPat.append(u'-'); add(u'-'); return; + } else if (!escaped && c == u'$') { + RuleCharacterIterator::Pos afterDollar; + chars.getPos(afterDollar); + c = chars.next(charsOptions(options), escaped, ec); + if (!escaped && c == u']') { + // An unescaped $ at the end of a Union is an anchor. 
+ rebuiltPat.append(u'$'); + chars.setPos(afterDollar); + add(U_ETHER); + containsRestrictions = true; + return; + } } chars.setPos(position); if (!escaped && c == ']') { @@ -607,6 +619,7 @@ void UnicodeSet::parseElements(const UnicodeString &pattern, case u']': case u'^': U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement | string-literal", first, chars, ec); + // Unescaped '$' case u'{': { rebuiltPat.append(u'{'); UnicodeString string; @@ -625,6 +638,7 @@ void UnicodeSet::parseElements(const UnicodeString &pattern, U_UNICODESET_RETURN_WITH_PARSE_ERROR("}", RuleCharacterIterator::DONE, chars, ec); } case u'}': + case u'$': // Disallowed by UTS #61, but historically accepted by ICU. This is an extension. default: break; @@ -664,6 +678,19 @@ void UnicodeSet::parseElements(const UnicodeString &pattern, case u'^': case u'{': U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, chars, ec); + case u'$': { + // Disallowed by UTS #61, but historically accepted by ICU except at the end of a Union. + // This is an extension. + RuleCharacterIterator::Pos afterDollar; + chars.getPos(afterDollar); + UChar32 c = chars.next(charsOptions(options), escaped, ec); + chars.setPos(afterDollar); + if (!escaped && c == u']') { + U_UNICODESET_RETURN_WITH_PARSE_ERROR("Term after Range ending in unescaped $", c, chars, + ec); + } + break; + } case u'}': // Disallowed by UTS #61, but historically accepted by ICU. This is an extension. 
default: From c8d2b9eb852ceb1367d809b858e30b691ed81ab7 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 14 Aug 2025 16:15:01 +0200 Subject: [PATCH 14/56] comment --- icu4c/source/common/uniset_props.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 6054c32c2283..3984ee788ee8 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -485,7 +485,8 @@ void UnicodeSet::parseUnion(const UnicodeString &pattern, chars.getPos(afterDollar); c = chars.next(charsOptions(options), escaped, ec); if (!escaped && c == u']') { - // An unescaped $ at the end of a Union is an anchor. + // ICU extensions: A $ is allowed as a literal-element. + // A Term at the end of a Union consisting of a single $ is an anchor. rebuiltPat.append(u'$'); chars.setPos(afterDollar); add(U_ETHER); From bbcc2316c31339d198fbc501823948c7b2302aa0 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 14 Aug 2025 17:07:35 +0200 Subject: [PATCH 15/56] ICU-22851 Even more $ edge cases --- icu4c/source/test/intltest/usettest.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 55a23782337a..5415940918ad 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -4357,6 +4357,8 @@ void UnicodeSetTest::TestToPatternOutput() { {u"[a-c$d-z]", uR"([\$a-z])"}, {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"}, {u"[!-$z]", uR"([!-\$z])"}, + {u"[-a-cd-z$-]", uR"([\$\-a-z])"}, + {u"[-$-]", uR"([\$\-])"}, // A property-query or named-element is kept as-is: {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"}, {uR"(\p{P})", uR"(\p{P})"}, @@ -4374,6 +4376,7 @@ void UnicodeSetTest::TestToPatternOutput() { uR"([c-za-b\p{ General_Category = Punctuation }])"}, {u"[^[c]]", uR"([^[c]])"}, {uR"([ ^ [ \u0000-b d-\U0010FFFF ] 
])", uR"([^[^c]])"}, + {u"[$[]]", uR"([\$[]])"}, // Spaces are eliminated within a string-literal even when the syntax is preserved. {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"}, // Escapes are removed even when the syntax is preserved. @@ -4384,6 +4387,8 @@ void UnicodeSetTest::TestToPatternOutput() { {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"}, // An anchor also causes the syntax to be preserved. {u"[ d-z a-c $ ]", u"[d-za-c$]"}, + {u"[ - a-c d-z $ ]", uR"([\-a-cd-z$])"}, + {u"[$$$]", uR"([\$\$$])"}, }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode); From 876d338542643b2607d58922ae6f748331d544a3 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 14 Aug 2025 15:51:47 +0200 Subject: [PATCH 16/56] ICU-22851 Test various edge cases with $ in the absence of variables --- icu4c/source/test/intltest/usettest.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 3b0e1dc32fe1..5415940918ad 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -4353,6 +4353,12 @@ void UnicodeSetTest::TestToPatternOutput() { {u"[ - + - ]", uR"([+\-])"}, {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"}, {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"}, + {u"[$d-za-c]", uR"([\$a-z])"}, + {u"[a-c$d-z]", uR"([\$a-z])"}, + {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"}, + {u"[!-$z]", uR"([!-\$z])"}, + {u"[-a-cd-z$-]", uR"([\$\-a-z])"}, + {u"[-$-]", uR"([\$\-])"}, // A property-query or named-element is kept as-is: {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"}, {uR"(\p{P})", uR"(\p{P})"}, @@ -4370,6 +4376,7 @@ void UnicodeSetTest::TestToPatternOutput() { uR"([c-za-b\p{ General_Category = Punctuation }])"}, 
{u"[^[c]]", uR"([^[c]])"}, {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"}, + {u"[$[]]", uR"([\$[]])"}, // Spaces are eliminated within a string-literal even when the syntax is preserved. {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"}, // Escapes are removed even when the syntax is preserved. @@ -4378,6 +4385,10 @@ void UnicodeSetTest::TestToPatternOutput() { // A named-element is currently a nested set, so it is preserved and causes the syntax to be // preserved. {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"}, + // An anchor also causes the syntax to be preserved. + {u"[ d-z a-c $ ]", u"[d-za-c$]"}, + {u"[ - a-c d-z $ ]", uR"([\-a-cd-z$])"}, + {u"[$$$]", uR"([\$\$$])"}, }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode); @@ -4416,6 +4427,7 @@ void UnicodeSetTest::TestParseErrors() { u"[{aa]", // "Unquoted '$'". u"[a-$]", + u"[!-$]", // "Invalid range". u"[a-a]", // TODO(egg): Exclude in PDUTS61. 
u"[z-a]", From a6d9182ebab5f44249434128ef9c9627c817d2cc Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 14 Aug 2025 20:53:20 +0200 Subject: [PATCH 17/56] ICU-22851 Test UnicodeSet with lookupMatcher --- icu4c/source/test/intltest/usettest.cpp | 203 +++++++++++++++++++++++- icu4c/source/test/intltest/usettest.h | 1 + 2 files changed, 203 insertions(+), 1 deletion(-) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 016d3f85e63d..06de9e315aac 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -93,6 +94,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestEscapePattern); TESTCASE_AUTO(TestInvalidCodePoint); TESTCASE_AUTO(TestSymbolTable); + TESTCASE_AUTO(TestLookupSymbolTable); TESTCASE_AUTO(TestSurrogate); TESTCASE_AUTO(TestPosixClasses); TESTCASE_AUTO(TestIteration); @@ -1753,10 +1755,20 @@ void UnicodeSetTest::TestSymbolTable() { // Multiple test cases can be set up here. Each test case // is terminated by null: // var, value, var, value,..., input pat., exp. 
output pat., null - const char* DATA[] = { + const char *DATA[] = { "us", "a-z", "[0-1$us]", "[0-1a-z]", nullptr, "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", nullptr, "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", nullptr, + // Things that probably should not work, but currently do: + "open", "[", "$open a-z]", "[a-z]", nullptr, + "open", "[", "close", "]", "hyphenMinus", "-", + "[ $open a $hyphenMinus z] $hyphenMinus [ c-z $close $hyphenMinus ]", + "[[a-z]-[c-z]-]", nullptr, + "string", "{", "end", "}", "[ $string Zeichenkette $end ]", "[{Zeichenkette}]", nullptr, + "privateUse", "[[:Co:]]", "$privateUse", "[[:Co:]]", nullptr, + "smiling", ":-]", "laughing", ":-D", + "[ {$smiling} $laughing $smiling", + R"([\-\:-D{\:\-\]}])", nullptr, nullptr }; @@ -1811,6 +1823,195 @@ void UnicodeSetTest::TestSymbolTable() { logln(UnicodeString("Ok, got ") + us.toPattern(a, true)); } } + for (const auto &[variables, expression, expectedErrorCode, expectedPattern] : + std::vector>, + std::u16string_view, UErrorCode, std::u16string_view>>{ + // You should not do this, but it works. + {{{u"privateUseOrUnassigned", u"[[:Co:][:Cn:]"}, {u"close", u"]"}}, + u"$privateUseOrUnassigned$close", + U_ZERO_ERROR, + u"[[:Co:][:Cn:]]"}, + // This works and it is fine. + {{{u"privateUse", u"[[:Co:]]"}}, u"$privateUse", U_ZERO_ERROR, u"[[:Co:]]"}, + // This should work! But it does not. Note the doubled brackets on the one that works above. + // We are not yet inside the variable when we call lookahead(), so we try to parse + // $privateUse rather than [:Co:]. + {{{u"privateUse", u"[:Co:]"}}, u"[$privateUse]", U_ILLEGAL_ARGUMENT_ERROR, u"[]"}, + // This should not work, and it does not (we try to parse [$sad$surprised] as a + // property-query). 
+ {{{u"sad", u":C"}, {u"surprised", u"o:"}}, + u"[$sad$surprised]", + U_ILLEGAL_ARGUMENT_ERROR, + u"[]"}, + }) { + UErrorCode errorCode = U_ZERO_ERROR; + TokenSymbolTable symbols(errorCode); + if (U_FAILURE(errorCode)) { + errln("FAIL: Couldn’t construct symbol table"); + continue; + } + for (const auto &[name, value] : variables) { + symbols.add(name, value, errorCode); + if (U_FAILURE(errorCode)) { + errln("FAIL: Couldn’t add variable " + name); + continue; + } + } + const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); + if (errorCode != expectedErrorCode) { + errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " + + u_errorName(errorCode)); + } + UnicodeString actual; + if (set.toPattern(actual) != expectedPattern) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern + + ", got " + actual); + } + } +} + +void UnicodeSetTest::TestLookupSymbolTable() { + UErrorCode errorCode = U_ZERO_ERROR; + class TestSymbolTable : public SymbolTable { + public: + const UnicodeString *lookup(const UnicodeString &) const override { + return nullptr; + } + + const UnicodeFunctor *lookupMatcher(UChar32 c) const override { + return symbols_.find(c) != symbols_.end() ? 
&symbols_.at(c) + : nullptr; + } + + virtual UnicodeString parseReference(const UnicodeString &, ParsePosition &, + int32_t) const override { + return u""; + } + + void add(UChar32 c, UnicodeSet set) { + symbols_[c] = set; + } + + private: + std::unordered_map symbols_; + }; + TestSymbolTable symbols; + symbols.add(u'0', UnicodeSet(u"[ a-z ]", errorCode)); + symbols.add(u'1', UnicodeSet(u"[ b-c ]", errorCode)); + symbols.add(u'2', UnicodeSet(u"[: Co :]", errorCode)); + for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : + std::vector>{ + {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"}, + {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]"}, + {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]"}, + // Substitution of lookupMatcher symbols takes place after de-escaping. + {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]"}, + // It does not take place in string literals. + {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]"}, + {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]"}, + {uR"([ 21 ])", U_ZERO_ERROR, u"[[: Co :][bc]]", + u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"}, + {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]"}, + }) { + UnicodeString actual; + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); + if (errorCode != expectedErrorCode) { + errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " + + u_errorName(errorCode)); + } + if (set.toPattern(actual) != expectedPattern) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern + + ", got " + actual); + } + if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) { + errln(u"UnicodeSet(R\"(" + expression + + u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern + + ", got " + actual); + } + } + // Test what happens when we define syntax characters as 
symbols. It is an extraordinarily bad idea + // to rely on this behaviour, but since it has been around since ICU 2.8, we probably should not + // change it unknowingly. + symbols.add(u'-', UnicodeSet(u"[{hyphenMinus}]", errorCode)); + symbols.add(u'&', UnicodeSet(u"[{ampersand}]", errorCode)); + // This one is never used, except if escaped. + symbols.add(u'[', UnicodeSet(u"[{leftSquareBracket}]", errorCode)); + symbols.add(u'^', UnicodeSet(u"[{circumflexAccent}]", errorCode)); + symbols.add(u'{', UnicodeSet(u"[{leftCurlyBracket}]", errorCode)); + symbols.add(u'}', UnicodeSet(u"[{rightCurlyBracket}]", errorCode)); + symbols.add(u'$', UnicodeSet(u"[{dollarSign}]", errorCode)); + for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : + std::vector< + std::tuple>{ + {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"}, + {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"}, + // The hyphen no longer works as set difference. + {u"[0-1]", U_ZERO_ERROR, u"[[a-z][{hyphenMinus}][bc]]", u"[a-z{hyphenMinus}]"}, + {u"[!-0]", U_ZERO_ERROR, u"[![{hyphenMinus}][a-z]]", u"[!a-z{hyphenMinus}]"}, + // String literals no longer work. + {uR"([!-/{0}])", U_ZERO_ERROR, + u"[![{hyphenMinus}]/[{leftCurlyBracket}][a-z][{rightCurlyBracket}]]", + u"[!/a-z{hyphenMinus}{leftCurlyBracket}{rightCurlyBracket}]"}, + // The ampersand no longer works as set difference. + {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :][{ampersand}][bc]]", + u"[bc-󰀀-󿿽􀀀-􏿽{ampersand}]"}, + // Complementing still works. + {uR"([^ \u0000 ])", U_ZERO_ERROR, uR"([\u0001-\U0010FFFF])", + uR"([\u0001-\U0010FFFF])"}, + // ^ elsewhere becomes a symbol rather than a syntax error. + {uR"([\u0000 ^ -])", U_ZERO_ERROR, uR"([\u0000[{circumflexAccent}][{hyphenMinus}]])", + uR"([\u0000{circumflexAccent}{hyphenMinus}])"}, + // Opening brackets still work. + {uR"([^ [ [^] ] ])", U_ZERO_ERROR, uR"([^[[\u0000-\U0010FFFF]]])", uR"([])"}, + // The only way to access the [ symbol is via escaping. 
+ {uR"([ \[ ])", U_ZERO_ERROR, uR"([[{leftSquareBracket}]])", uR"([{leftSquareBracket}])"}, + // Anchors are gone. + {uR"([$])", U_ZERO_ERROR, uR"([[{dollarSign}]])", uR"([{dollarSign}])"}, + }) { + UnicodeString actual; + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); + if (errorCode != expectedErrorCode) { + errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " + + u_errorName(errorCode)); + } + if (set.toPattern(actual) != expectedPattern) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern + + ", got " + actual); + } + if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) { + errln(u"UnicodeSet(R\"(" + expression + + u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern + + ", got " + actual); + } + } + // If ] is defined as a symbol, everything breaks except a lone symbol or property-query, and the + // constructor returns an error but not an empty set. Don’t do that. 
+ symbols.add(u']', UnicodeSet(u"[{rightSquareBracket}]", errorCode)); + for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : + std::vector< + std::tuple>{ + {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"}, + {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"}, + }) { + UnicodeString actual; + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); + if (errorCode != expectedErrorCode) { + errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " + + u_errorName(errorCode)); + } + if (set.toPattern(actual) != expectedPattern) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern + + ", got " + actual); + } + if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) { + errln(u"UnicodeSet(R\"(" + expression + + u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern + + ", got " + actual); + } + } } void UnicodeSetTest::TestSurrogate() { diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index 2ac22ba72e62..32abf828a30a 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -84,6 +84,7 @@ class UnicodeSetTest: public IntlTest { void TestInvalidCodePoint(); void TestSymbolTable(); + void TestLookupSymbolTable(); void TestSurrogate(); From e81735cc26740ebf9e2620354de2b24c91f76767 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Fri, 15 Aug 2025 16:57:09 +0200 Subject: [PATCH 18/56] Something that works in the same silly way as it used to. 
--- icu4c/source/common/uniset_props.cpp | 114 +++++++++++++++++---------- 1 file changed, 72 insertions(+), 42 deletions(-) diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 3984ee788ee8..8c4b13f18e71 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -250,6 +250,13 @@ constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) { return opts; } +const UnicodeSet *getMatcherSymbol(const SymbolTable *const symbols, const UChar32 c) { + if (symbols == nullptr) { + return nullptr; + } + return dynamic_cast(symbols->lookupMatcher(c)); +} + #if 0 #define U_UNICODESET_TRACE(...) \ struct UnicodeSetParserTrace { \ @@ -395,31 +402,43 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern, // TODO(egg): In PD UTS 61, add ^ to set-operator, remove [^. // UnicodeSet ::= [ Union ] // | Complement ::= [ ^ Union ] + // Extension: + // | MatcherSymbol + // Where a MatcherSymbol may be a character or an escape. + // Strings that would match MatcherSymbol effectively get removed from + // all other terminals of the grammar, except [. 
UChar32 c = chars.next(charsOptions(options), escaped, ec); U_UNICODESET_RETURN_IF_ERROR(ec); - if (escaped || c != u'[') { - U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)", c, chars, ec); - } - prettyPrintedPattern.append(u'['); - RuleCharacterIterator::Pos afterBracket; - chars.getPos(afterBracket); - c = chars.next(charsOptions(options), escaped, ec); - U_UNICODESET_RETURN_IF_ERROR(ec); - if (!escaped && c == u'^') { - prettyPrintedPattern.append(u'^'); - isComplement = true; + if (!escaped && c == u'[') { + prettyPrintedPattern.append(u'['); + RuleCharacterIterator::Pos afterBracket; + chars.getPos(afterBracket); + c = chars.next(charsOptions(options), escaped, ec); + U_UNICODESET_RETURN_IF_ERROR(ec); + if (!escaped && c == u'^') { + prettyPrintedPattern.append(u'^'); + isComplement = true; + } else { + chars.setPos(afterBracket); + } + parseUnion(pattern, chars, symbols, prettyPrintedPattern, options, caseClosure, depth, + /*containsRestrictions=*/preserveSyntaxInPattern, ec); + U_UNICODESET_RETURN_IF_ERROR(ec); + c = chars.next(charsOptions(options), escaped, ec); + U_UNICODESET_RETURN_IF_ERROR(ec); + if (escaped || c != u']') { + U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", c, chars, ec); + } + prettyPrintedPattern.append(u']'); } else { - chars.setPos(afterBracket); - } - parseUnion(pattern, chars, symbols, prettyPrintedPattern, options, caseClosure, depth, - /*containsRestrictions=*/preserveSyntaxInPattern, ec); - U_UNICODESET_RETURN_IF_ERROR(ec); - c = chars.next(charsOptions(options), escaped, ec); - U_UNICODESET_RETURN_IF_ERROR(ec); - if (escaped || c != u']') { - U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", c, chars, ec); + const UnicodeSet *set = getMatcherSymbol(symbols, c); + if (set != nullptr) { + *this = *set; + this->_toPattern(rebuiltPat, /*escapeUnprintable=*/false); + return; + } + U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)", c, chars, ec); } - prettyPrintedPattern.append(u']'); } /** @@ -462,7 +481,7 @@ void 
UnicodeSet::parseUnion(const UnicodeString &pattern, // | Terms Term UChar32 c = chars.next(charsOptions(options), escaped, ec); U_UNICODESET_RETURN_IF_ERROR(ec); - if (!escaped && c == u'-') { + if (!escaped && c == u'-' && getMatcherSymbol(symbols, c)) { add(u'-'); // When we otherwise preserve the syntax, we escape an initial UnescapedHyphenMinus, but not a // final one, for consistency with older ICU behaviour. @@ -474,28 +493,30 @@ void UnicodeSet::parseUnion(const UnicodeString &pattern, chars.getPos(position); c = chars.next(charsOptions(options), escaped, ec); U_UNICODESET_RETURN_IF_ERROR(ec); - if (!escaped && c == u'-') { - // We can be here on the first iteration: [--] is allowed by the - // grammar and by the old parser. - rebuiltPat.append(u'-'); - add(u'-'); - return; - } else if (!escaped && c == u'$') { - RuleCharacterIterator::Pos afterDollar; - chars.getPos(afterDollar); - c = chars.next(charsOptions(options), escaped, ec); - if (!escaped && c == u']') { - // ICU extensions: A $ is allowed as a literal-element. - // A Term at the end of a Union consisting of a single $ is an anchor. - rebuiltPat.append(u'$'); - chars.setPos(afterDollar); - add(U_ETHER); - containsRestrictions = true; + if (getMatcherSymbol(symbols, c) == nullptr) { + if (!escaped && c == u'-') { + // We can be here on the first iteration: [--] is allowed by the + // grammar and by the old parser. + rebuiltPat.append(u'-'); + add(u'-'); return; + } else if (!escaped && c == u'$') { + RuleCharacterIterator::Pos afterDollar; + chars.getPos(afterDollar); + c = chars.next(charsOptions(options), escaped, ec); + if (!escaped && c == u']') { + // ICU extensions: A $ is allowed as a literal-element. + // A Term at the end of a Union consisting of a single $ is an anchor. 
+ rebuiltPat.append(u'$'); + chars.setPos(afterDollar); + add(U_ETHER); + containsRestrictions = true; + return; + } } } chars.setPos(position); - if (!escaped && c == ']') { + if (!escaped && c == ']' && getMatcherSymbol(symbols, c) == nullptr) { return; } parseTerm(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, containsRestrictions, @@ -521,7 +542,8 @@ void UnicodeSet::parseTerm(const UnicodeString &pattern, // | Restriction const UChar32 ahead = chars.next(charsOptions(options), escaped, ec); chars.setPos(termStart); - if (!escaped && ahead == '[' || resemblesPropertyPattern(chars, charsOptions(options))) { + if (getMatcherSymbol(symbols, ahead) != nullptr || !escaped && ahead == '[' || + resemblesPropertyPattern(chars, charsOptions(options))) { containsRestriction = true; parseRestriction(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec); U_UNICODESET_RETURN_IF_ERROR(ec); @@ -557,6 +579,11 @@ void UnicodeSet::parseRestriction(const UnicodeString &pattern, chars.getPos(beforeOperator); const UChar32 op = chars.next(charsOptions(options), escaped, ec); U_UNICODESET_RETURN_IF_ERROR(ec); + if (getMatcherSymbol(symbols, op)) { + // Not an operator, end of the Restriction. 
+ chars.setPos(beforeOperator); + return; + } if (!escaped && op == u'&') { // Intersection ::= Restriction & UnicodeSet rebuiltPat.append(u'&'); @@ -650,7 +677,7 @@ void UnicodeSet::parseElements(const UnicodeString &pattern, chars.getPos(beforeOperator); const UChar32 op = chars.next(charsOptions(options), escaped, ec); U_UNICODESET_RETURN_IF_ERROR(ec); - if (escaped || op != u'-') { + if (escaped || op != u'-' || getMatcherSymbol(symbols, op) != nullptr) { // No operator, // Elements ::= Element chars.setPos(beforeOperator); @@ -670,6 +697,9 @@ void UnicodeSet::parseElements(const UnicodeString &pattern, // Elements ::= Range ::= RangeElement - RangeElement rebuiltPat.append(u'-'); const UChar32 last = ahead; + if (getMatcherSymbol(symbols, last) != nullptr) { + U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, chars, ec); + } if (!escaped) { switch (last) { case u'-': From 4beef14bc60d21091c3c7e3d8be6efcc46535ce9 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 18 Aug 2025 13:59:00 +0200 Subject: [PATCH 19/56] indentation on the parse error tests --- icu4c/source/test/intltest/usettest.cpp | 80 ++++++++++++------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 5415940918ad..c5de484f9100 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -4404,39 +4404,39 @@ void UnicodeSetTest::TestToPatternOutput() { void UnicodeSetTest::TestParseErrors() { for (const auto expression : std::vector{ - // Java error message: "Char expected after operator". - u"[a-[b]]", - // "Missing '['". - u"a-z", - // "Trailing '&'". - u"[[a]&]", - // "'-' not after char or set". - u"[[a]&-[z]]", - u"[[a]--[z]]", - u"[{aa}-{zz}]", - // "'&' not after set". - u"[a&z]", - u"[{aa}&{zz}]", - // "'^' not after '['" - u"[a^z]", // TODO(egg): Exclude from literal-element in PDUTS61. - // "Missing operand after operator". 
- u"[a-{zz}]", - u"[[a]-{zz}]", - u"[[a]&{zz}]", - // "Invalid multicharacter string". - u"[{aa]", - // "Unquoted '$'". - u"[a-$]", - u"[!-$]", - // "Invalid range". - u"[a-a]", // TODO(egg): Exclude in PDUTS61. - u"[z-a]", - // "Set expected after operator". - u"[[a]-z]", - u"[[a]&z]", - // "Missing ']'". - u"[a-z", - }) { + // Java error message: "Char expected after operator". + u"[a-[b]]", + // "Missing '['". + u"a-z", + // "Trailing '&'". + u"[[a]&]", + // "'-' not after char or set". + u"[[a]&-[z]]", + u"[[a]--[z]]", + u"[{aa}-{zz}]", + // "'&' not after set". + u"[a&z]", + u"[{aa}&{zz}]", + // "'^' not after '['" + u"[a^z]", // TODO(egg): Exclude from literal-element in PDUTS61. + // "Missing operand after operator". + u"[a-{zz}]", + u"[[a]-{zz}]", + u"[[a]&{zz}]", + // "Invalid multicharacter string". + u"[{aa]", + // "Unquoted '$'". + u"[a-$]", + u"[!-$]", + // "Invalid range". + u"[a-a]", // TODO(egg): Exclude in PDUTS61. + u"[z-a]", + // "Set expected after operator". + u"[[a]-z]", + u"[[a]&z]", + // "Missing ']'". + u"[a-z", + }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode); if (errorCode != U_MALFORMED_SET) { @@ -4446,13 +4446,13 @@ void UnicodeSetTest::TestParseErrors() { } } for (const auto expression : std::vector{ - // Java error message: "Invalid property pattern". - u"[:]", - uR"(\p)" - u"[:^]", - uR"(\P)", - uR"(\N)", - }) { + // Java error message: "Invalid property pattern". 
+ u"[:]", + uR"(\p)" + u"[:^]", + uR"(\P)", + uR"(\N)", + }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode); if (errorCode != U_ILLEGAL_ARGUMENT_ERROR) { From 18f2b7b7abdfc9c2f5f129ba0a0508298b03b4b5 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 11 Aug 2025 16:24:10 +0200 Subject: [PATCH 20/56] ICU-22851 Test the error paths in UnicodeSet parsing --- icu4c/source/test/intltest/usettest.cpp | 61 +++++++++++++++++++++++++ icu4c/source/test/intltest/usettest.h | 2 + 2 files changed, 63 insertions(+) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 016d3f85e63d..76ab11424110 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestRangeIterator); TESTCASE_AUTO(TestStringIterator); TESTCASE_AUTO(TestElementIterator); + TESTCASE_AUTO(TestParseErrors); TESTCASE_AUTO_END; } @@ -4334,3 +4335,63 @@ void UnicodeSetTest::TestElementIterator() { // begin() & end() return USetElementIterator for which explicit APIs are tested via USet // in a header-only unit test file. } + +void UnicodeSetTest::TestParseErrors() { + for (const auto expression : std::vector{ + // Java error message: "Char expected after operator". + u"[a-[b]]", + // "Missing '['". + u"a-z", + // "Trailing '&'". + u"[[a]&]", + // "'-' not after char or set". + u"[[a]&-[z]]", + u"[[a]--[z]]", + u"[{aa}-{zz}]", + // "'&' not after set". + u"[a&z]", + u"[{aa}&{zz}]", + // "'^' not after '['" + u"[a^z]", // TODO(egg): Exclude from literal-element in PDUTS61. + // "Missing operand after operator". + u"[a-{zz}]", + u"[[a]-{zz}]", + u"[[a]&{zz}]", + // "Invalid multicharacter string". + u"[{aa]", + // "Unquoted '$'". + u"[a-$]", + // "Invalid range". + u"[a-a]", // TODO(egg): Exclude in PDUTS61. + u"[z-a]", + // "Set expected after operator". + u"[[a]-z]", + u"[[a]&z]", + // "Missing ']'". 
+ u"[a-z", + }) { + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, errorCode); + if (errorCode != U_MALFORMED_SET) { + UnicodeString s; + errln(expression + u": Expected U_MALFORMED_SET, got " + u_errorName(errorCode) + + ", set is " + UnicodeSet(set).complement().complement().toPattern(s)); + } + } + for (const auto expression : std::vector{ + // Java error message: "Invalid property pattern". + u"[:]", + uR"(\p)" + u"[:^]", + uR"(\P)", + uR"(\N)", + }) { + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, errorCode); + if (errorCode != U_ILLEGAL_ARGUMENT_ERROR) { + UnicodeString s; + errln(expression + u": Expected U_ILLEGAL_ARGUMENT_ERROR, got " + u_errorName(errorCode) + + ", set is " + UnicodeSet(set).complement().complement().toPattern(s)); + } + } +} diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index 2ac22ba72e62..4c5b55a329bb 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -110,6 +110,8 @@ class UnicodeSetTest: public IntlTest { void TestStringIterator(); void TestElementIterator(); + void TestParseErrors(); + private: UBool toPatternAux(UChar32 start, UChar32 end); From 03f792b2c5a619d6e76b3975ee6551c475392ba7 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 13 Aug 2025 21:03:33 +0200 Subject: [PATCH 21/56] ICU-22851 Test the exact behaviour of UnicodeSet::toPattern --- icu4c/source/test/intltest/usettest.cpp | 55 +++++++++++++++++++++++++ icu4c/source/test/intltest/usettest.h | 1 + 2 files changed, 56 insertions(+) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 76ab11424110..3b1a6012915e 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestRangeIterator); TESTCASE_AUTO(TestStringIterator); 
TESTCASE_AUTO(TestElementIterator); + TESTCASE_AUTO(TestToPatternOutput); TESTCASE_AUTO(TestParseErrors); TESTCASE_AUTO_END; } @@ -4336,6 +4337,60 @@ void UnicodeSetTest::TestElementIterator() { // in a header-only unit test file. } +void UnicodeSetTest::TestToPatternOutput() { + for (const auto &[expression, expected] : + std::vector>{ + // For a UnicodeSet which is not a property-query nor a named-element and without any + // Restriction among its Terms (that is, whose Union consists solely a sequence of Elements + // UnescapedHyphenMinus), toPattern merges and sorts ranges, and introduces a complement to + // minimize the result. + {u"[c-za-b]", u"[a-z]"}, + {u"[ c - z a - b ]", u"[a-z]"}, + {uR"([ ^ \u0000-b d-\U0010FFFF ])", u"[c]"}, + {uR"([ \u0000-b d-\U0010FFFF ])", u"[^c]"}, + {u"[ - - ]", uR"([\-])"}, + {u"[ - _ - ]", uR"([\-_])"}, + {u"[ - + - ]", uR"([+\-])"}, + {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"}, + {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"}, + // A property-query or named-element is kept as-is: + {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"}, + {uR"(\p{P})", uR"(\p{P})"}, + {uR"(\p{gc=P})", uR"(\p{gc=P})"}, + {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"}, + {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"}, + {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"}, + {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"}, + // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are + // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped. + // This is applied recursively, so innermost ranges-only UnicodeSets get normalized. 
+ {u"[ c-z a-b [c-f g-z] ]", u"[c-za-b[c-z]]"}, + {u"[- + c-z a-b [c-f g-z] -]", uR"([\-+c-za-b[c-z]-])"}, + {uR"([ c-z a-b \p{ General_Category = Punctuation } ])", + uR"([c-za-b\p{ General_Category = Punctuation }])"}, + {u"[^[c]]", uR"([^[c]])"}, + {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"}, + // Spaces are eliminated within a string-literal even when the syntax is preserved. + {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"}, + // Escapes are removed even when the syntax is preserved. + {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])", + u"[{Zeichenkette}[]Zeichenmenge]"}, + // A named-element is currently a nested set, so it is preserved and causes the syntax to be + // preserved. + {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"}, + }) { + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, errorCode); + UnicodeString actual; + if (U_FAILURE(errorCode)) { + errln(u"Failed to parse " + expression + u": " + u_errorName(errorCode)); + } else if (set.toPattern(actual) != expected) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expected + ", got " + + actual); + } + } +} + void UnicodeSetTest::TestParseErrors() { for (const auto expression : std::vector{ // Java error message: "Char expected after operator". 
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index 4c5b55a329bb..692aa8b9e84d 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -110,6 +110,7 @@ class UnicodeSetTest: public IntlTest { void TestStringIterator(); void TestElementIterator(); + void TestToPatternOutput(); void TestParseErrors(); private: From 8cc53b95074a4a806961c7b12849a30c7a97fac5 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 14 Aug 2025 15:51:47 +0200 Subject: [PATCH 22/56] ICU-22851 Test various edge cases with $ in the absence of variables --- icu4c/source/test/intltest/usettest.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 3b1a6012915e..c5de484f9100 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -4353,6 +4353,12 @@ void UnicodeSetTest::TestToPatternOutput() { {u"[ - + - ]", uR"([+\-])"}, {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"}, {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"}, + {u"[$d-za-c]", uR"([\$a-z])"}, + {u"[a-c$d-z]", uR"([\$a-z])"}, + {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"}, + {u"[!-$z]", uR"([!-\$z])"}, + {u"[-a-cd-z$-]", uR"([\$\-a-z])"}, + {u"[-$-]", uR"([\$\-])"}, // A property-query or named-element is kept as-is: {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"}, {uR"(\p{P})", uR"(\p{P})"}, @@ -4370,6 +4376,7 @@ void UnicodeSetTest::TestToPatternOutput() { uR"([c-za-b\p{ General_Category = Punctuation }])"}, {u"[^[c]]", uR"([^[c]])"}, {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"}, + {u"[$[]]", uR"([\$[]])"}, // Spaces are eliminated within a string-literal even when the syntax is preserved. 
{u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"}, // Escapes are removed even when the syntax is preserved. @@ -4378,6 +4385,10 @@ void UnicodeSetTest::TestToPatternOutput() { // A named-element is currently a nested set, so it is preserved and causes the syntax to be // preserved. {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"}, + // An anchor also causes the syntax to be preserved. + {u"[ d-z a-c $ ]", u"[d-za-c$]"}, + {u"[ - a-c d-z $ ]", uR"([\-a-cd-z$])"}, + {u"[$$$]", uR"([\$\$$])"}, }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode); @@ -4416,6 +4427,7 @@ void UnicodeSetTest::TestParseErrors() { u"[{aa]", // "Unquoted '$'". u"[a-$]", + u"[!-$]", // "Invalid range". u"[a-a]", // TODO(egg): Exclude in PDUTS61. u"[z-a]", From 65fe08e56cb567da39ec2574f874001c1c80e644 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 18 Aug 2025 14:05:48 +0200 Subject: [PATCH 23/56] dedent the pattern output test --- icu4c/source/test/intltest/usettest.cpp | 102 ++++++++++++------------ 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index c5de484f9100..89a15dcc489d 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -4339,57 +4339,57 @@ void UnicodeSetTest::TestElementIterator() { void UnicodeSetTest::TestToPatternOutput() { for (const auto &[expression, expected] : - std::vector>{ - // For a UnicodeSet which is not a property-query nor a named-element and without any - // Restriction among its Terms (that is, whose Union consists solely a sequence of Elements - // UnescapedHyphenMinus), toPattern merges and sorts ranges, and introduces a complement to - // minimize the result. 
- {u"[c-za-b]", u"[a-z]"}, - {u"[ c - z a - b ]", u"[a-z]"}, - {uR"([ ^ \u0000-b d-\U0010FFFF ])", u"[c]"}, - {uR"([ \u0000-b d-\U0010FFFF ])", u"[^c]"}, - {u"[ - - ]", uR"([\-])"}, - {u"[ - _ - ]", uR"([\-_])"}, - {u"[ - + - ]", uR"([+\-])"}, - {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"}, - {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"}, - {u"[$d-za-c]", uR"([\$a-z])"}, - {u"[a-c$d-z]", uR"([\$a-z])"}, - {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"}, - {u"[!-$z]", uR"([!-\$z])"}, - {u"[-a-cd-z$-]", uR"([\$\-a-z])"}, - {u"[-$-]", uR"([\$\-])"}, - // A property-query or named-element is kept as-is: - {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"}, - {uR"(\p{P})", uR"(\p{P})"}, - {uR"(\p{gc=P})", uR"(\p{gc=P})"}, - {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"}, - {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"}, - {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"}, - {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"}, - // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are - // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped. - // This is applied recursively, so innermost ranges-only UnicodeSets get normalized. - {u"[ c-z a-b [c-f g-z] ]", u"[c-za-b[c-z]]"}, - {u"[- + c-z a-b [c-f g-z] -]", uR"([\-+c-za-b[c-z]-])"}, - {uR"([ c-z a-b \p{ General_Category = Punctuation } ])", - uR"([c-za-b\p{ General_Category = Punctuation }])"}, - {u"[^[c]]", uR"([^[c]])"}, - {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"}, - {u"[$[]]", uR"([\$[]])"}, - // Spaces are eliminated within a string-literal even when the syntax is preserved. - {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"}, - // Escapes are removed even when the syntax is preserved. 
- {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])", - u"[{Zeichenkette}[]Zeichenmenge]"}, - // A named-element is currently a nested set, so it is preserved and causes the syntax to be - // preserved. - {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"}, - // An anchor also causes the syntax to be preserved. - {u"[ d-z a-c $ ]", u"[d-za-c$]"}, - {u"[ - a-c d-z $ ]", uR"([\-a-cd-z$])"}, - {u"[$$$]", uR"([\$\$$])"}, - }) { + std::vector>{ + // For a UnicodeSet which is not a property-query nor a named-element and without any + // Restriction among its Terms (that is, whose Union consists solely a sequence of Elements + // UnescapedHyphenMinus), toPattern merges and sorts ranges, and introduces a complement to + // minimize the result. + {u"[c-za-b]", u"[a-z]"}, + {u"[ c - z a - b ]", u"[a-z]"}, + {uR"([ ^ \u0000-b d-\U0010FFFF ])", u"[c]"}, + {uR"([ \u0000-b d-\U0010FFFF ])", u"[^c]"}, + {u"[ - - ]", uR"([\-])"}, + {u"[ - _ - ]", uR"([\-_])"}, + {u"[ - + - ]", uR"([+\-])"}, + {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"}, + {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"}, + {u"[$d-za-c]", uR"([\$a-z])"}, + {u"[a-c$d-z]", uR"([\$a-z])"}, + {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"}, + {u"[!-$z]", uR"([!-\$z])"}, + {u"[-a-cd-z$-]", uR"([\$\-a-z])"}, + {u"[-$-]", uR"([\$\-])"}, + // A property-query or named-element is kept as-is: + {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"}, + {uR"(\p{P})", uR"(\p{P})"}, + {uR"(\p{gc=P})", uR"(\p{gc=P})"}, + {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"}, + {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"}, + {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"}, + {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"}, + // If there is any Restriction among the 
terms, its syntax is mostly as-is (spaces are + // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped. + // This is applied recursively, so innermost ranges-only UnicodeSets get normalized. + {u"[ c-z a-b [c-f g-z] ]", u"[c-za-b[c-z]]"}, + {u"[- + c-z a-b [c-f g-z] -]", uR"([\-+c-za-b[c-z]-])"}, + {uR"([ c-z a-b \p{ General_Category = Punctuation } ])", + uR"([c-za-b\p{ General_Category = Punctuation }])"}, + {u"[^[c]]", uR"([^[c]])"}, + {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"}, + {u"[$[]]", uR"([\$[]])"}, + // Spaces are eliminated within a string-literal even when the syntax is preserved. + {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"}, + // Escapes are removed even when the syntax is preserved. + {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])", + u"[{Zeichenkette}[]Zeichenmenge]"}, + // A named-element is currently a nested set, so it is preserved and causes the syntax to be + // preserved. + {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"}, + // An anchor also causes the syntax to be preserved. 
+ {u"[ d-z a-c $ ]", u"[d-za-c$]"}, + {u"[ - a-c d-z $ ]", uR"([\-a-cd-z$])"}, + {u"[$$$]", uR"([\$\$$])"}, + }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode); UnicodeString actual; From b478400a6d23cac79da5e39e5a20279cf797058e Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 13 Aug 2025 21:03:33 +0200 Subject: [PATCH 24/56] ICU-22851 Test the exact behaviour of UnicodeSet::toPattern --- icu4c/source/test/intltest/usettest.cpp | 55 +++++++++++++++++++++++++ icu4c/source/test/intltest/usettest.h | 1 + 2 files changed, 56 insertions(+) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 76ab11424110..9e0e66fac3b1 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestRangeIterator); TESTCASE_AUTO(TestStringIterator); TESTCASE_AUTO(TestElementIterator); + TESTCASE_AUTO(TestToPatternOutput); TESTCASE_AUTO(TestParseErrors); TESTCASE_AUTO_END; } @@ -4336,6 +4337,60 @@ void UnicodeSetTest::TestElementIterator() { // in a header-only unit test file. } +void UnicodeSetTest::TestToPatternOutput() { + for (const auto &[expression, expected] : + std::vector>{ + // For a UnicodeSet which is not a property-query nor a named-element and without any + // Restriction among its Terms (that is, whose Union consists solely a sequence of Elements + // UnescapedHyphenMinus), toPattern merges and sorts ranges, and introduces a complement to + // minimize the result. 
+ {u"[c-za-b]", u"[a-z]"}, + {u"[ c - z a - b ]", u"[a-z]"}, + {uR"([ ^ \u0000-b d-\U0010FFFF ])", u"[c]"}, + {uR"([ \u0000-b d-\U0010FFFF ])", u"[^c]"}, + {u"[ - - ]", uR"([\-])"}, + {u"[ - _ - ]", uR"([\-_])"}, + {u"[ - + - ]", uR"([+\-])"}, + {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"}, + {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"}, + // A property-query or named-element is kept as-is: + {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"}, + {uR"(\p{P})", uR"(\p{P})"}, + {uR"(\p{gc=P})", uR"(\p{gc=P})"}, + {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"}, + {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"}, + {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"}, + {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"}, + // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are + // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped. + // This is applied recursively, so innermost ranges-only UnicodeSets get normalized. + {u"[ c-z a-b [c-f g-z] ]", u"[c-za-b[c-z]]"}, + {u"[- + c-z a-b [c-f g-z] -]", uR"([\-+c-za-b[c-z]-])"}, + {uR"([ c-z a-b \p{ General_Category = Punctuation } ])", + uR"([c-za-b\p{ General_Category = Punctuation }])"}, + {u"[^[c]]", uR"([^[c]])"}, + {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"}, + // Spaces are eliminated within a string-literal even when the syntax is preserved. + {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"}, + // Escapes are removed even when the syntax is preserved. + {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])", + u"[{Zeichenkette}[]Zeichenmenge]"}, + // A named-element is currently a nested set, so it is preserved and causes the syntax to be + // preserved. 
+ {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"}, + }) { + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, errorCode); + UnicodeString actual; + if (U_FAILURE(errorCode)) { + errln(u"Failed to parse " + expression + u": " + u_errorName(errorCode)); + } else if (set.toPattern(actual) != expected) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expected + ", got " + + actual); + } + } +} + void UnicodeSetTest::TestParseErrors() { for (const auto expression : std::vector{ // Java error message: "Char expected after operator". diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index 4c5b55a329bb..692aa8b9e84d 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -110,6 +110,7 @@ class UnicodeSetTest: public IntlTest { void TestStringIterator(); void TestElementIterator(); + void TestToPatternOutput(); void TestParseErrors(); private: From ae81d41bf8d775a37fcb0390e22372ef76815829 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 14 Aug 2025 15:51:47 +0200 Subject: [PATCH 25/56] ICU-22851 Test various edge cases with $ in the absence of variables --- icu4c/source/test/intltest/usettest.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 9e0e66fac3b1..89a15dcc489d 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -4353,6 +4353,12 @@ void UnicodeSetTest::TestToPatternOutput() { {u"[ - + - ]", uR"([+\-])"}, {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"}, {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"}, + {u"[$d-za-c]", uR"([\$a-z])"}, + {u"[a-c$d-z]", uR"([\$a-z])"}, + {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"}, + {u"[!-$z]", uR"([!-\$z])"}, + {u"[-a-cd-z$-]", 
uR"([\$\-a-z])"}, + {u"[-$-]", uR"([\$\-])"}, // A property-query or named-element is kept as-is: {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"}, {uR"(\p{P})", uR"(\p{P})"}, @@ -4370,6 +4376,7 @@ void UnicodeSetTest::TestToPatternOutput() { uR"([c-za-b\p{ General_Category = Punctuation }])"}, {u"[^[c]]", uR"([^[c]])"}, {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"}, + {u"[$[]]", uR"([\$[]])"}, // Spaces are eliminated within a string-literal even when the syntax is preserved. {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"}, // Escapes are removed even when the syntax is preserved. @@ -4378,6 +4385,10 @@ void UnicodeSetTest::TestToPatternOutput() { // A named-element is currently a nested set, so it is preserved and causes the syntax to be // preserved. {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"}, + // An anchor also causes the syntax to be preserved. + {u"[ d-z a-c $ ]", u"[d-za-c$]"}, + {u"[ - a-c d-z $ ]", uR"([\-a-cd-z$])"}, + {u"[$$$]", uR"([\$\$$])"}, }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode); @@ -4416,6 +4427,7 @@ void UnicodeSetTest::TestParseErrors() { u"[{aa]", // "Unquoted '$'". u"[a-$]", + u"[!-$]", // "Invalid range". u"[a-a]", // TODO(egg): Exclude in PDUTS61. 
u"[z-a]", From 8eec9710279fdf276abe00ff20c23f758031e7a0 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 11 Aug 2025 16:24:10 +0200 Subject: [PATCH 26/56] ICU-23179 Test the error paths in UnicodeSet parsing --- icu4c/source/test/intltest/usettest.cpp | 61 +++++++++++++++++++++++++ icu4c/source/test/intltest/usettest.h | 2 + 2 files changed, 63 insertions(+) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 016d3f85e63d..76ab11424110 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestRangeIterator); TESTCASE_AUTO(TestStringIterator); TESTCASE_AUTO(TestElementIterator); + TESTCASE_AUTO(TestParseErrors); TESTCASE_AUTO_END; } @@ -4334,3 +4335,63 @@ void UnicodeSetTest::TestElementIterator() { // begin() & end() return USetElementIterator for which explicit APIs are tested via USet // in a header-only unit test file. } + +void UnicodeSetTest::TestParseErrors() { + for (const auto expression : std::vector{ + // Java error message: "Char expected after operator". + u"[a-[b]]", + // "Missing '['". + u"a-z", + // "Trailing '&'". + u"[[a]&]", + // "'-' not after char or set". + u"[[a]&-[z]]", + u"[[a]--[z]]", + u"[{aa}-{zz}]", + // "'&' not after set". + u"[a&z]", + u"[{aa}&{zz}]", + // "'^' not after '['" + u"[a^z]", // TODO(egg): Exclude from literal-element in PDUTS61. + // "Missing operand after operator". + u"[a-{zz}]", + u"[[a]-{zz}]", + u"[[a]&{zz}]", + // "Invalid multicharacter string". + u"[{aa]", + // "Unquoted '$'". + u"[a-$]", + // "Invalid range". + u"[a-a]", // TODO(egg): Exclude in PDUTS61. + u"[z-a]", + // "Set expected after operator". + u"[[a]-z]", + u"[[a]&z]", + // "Missing ']'". 
+ u"[a-z", + }) { + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, errorCode); + if (errorCode != U_MALFORMED_SET) { + UnicodeString s; + errln(expression + u": Expected U_MALFORMED_SET, got " + u_errorName(errorCode) + + ", set is " + UnicodeSet(set).complement().complement().toPattern(s)); + } + } + for (const auto expression : std::vector{ + // Java error message: "Invalid property pattern". + u"[:]", + uR"(\p)" + u"[:^]", + uR"(\P)", + uR"(\N)", + }) { + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, errorCode); + if (errorCode != U_ILLEGAL_ARGUMENT_ERROR) { + UnicodeString s; + errln(expression + u": Expected U_ILLEGAL_ARGUMENT_ERROR, got " + u_errorName(errorCode) + + ", set is " + UnicodeSet(set).complement().complement().toPattern(s)); + } + } +} diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index 2ac22ba72e62..4c5b55a329bb 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -110,6 +110,8 @@ class UnicodeSetTest: public IntlTest { void TestStringIterator(); void TestElementIterator(); + void TestParseErrors(); + private: UBool toPatternAux(UChar32 start, UChar32 end); From dabce0b5dc10dcb05a8850869ef06659e478e386 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 13 Aug 2025 21:03:33 +0200 Subject: [PATCH 27/56] ICU-23179 Test the exact behaviour of UnicodeSet::toPattern --- icu4c/source/test/intltest/usettest.cpp | 55 +++++++++++++++++++++++++ icu4c/source/test/intltest/usettest.h | 1 + 2 files changed, 56 insertions(+) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 76ab11424110..9e0e66fac3b1 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -110,6 +110,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestRangeIterator); TESTCASE_AUTO(TestStringIterator); 
TESTCASE_AUTO(TestElementIterator); + TESTCASE_AUTO(TestToPatternOutput); TESTCASE_AUTO(TestParseErrors); TESTCASE_AUTO_END; } @@ -4336,6 +4337,60 @@ void UnicodeSetTest::TestElementIterator() { // in a header-only unit test file. } +void UnicodeSetTest::TestToPatternOutput() { + for (const auto &[expression, expected] : + std::vector>{ + // For a UnicodeSet which is not a property-query nor a named-element and without any + // Restriction among its Terms (that is, whose Union consists solely a sequence of Elements + // UnescapedHyphenMinus), toPattern merges and sorts ranges, and introduces a complement to + // minimize the result. + {u"[c-za-b]", u"[a-z]"}, + {u"[ c - z a - b ]", u"[a-z]"}, + {uR"([ ^ \u0000-b d-\U0010FFFF ])", u"[c]"}, + {uR"([ \u0000-b d-\U0010FFFF ])", u"[^c]"}, + {u"[ - - ]", uR"([\-])"}, + {u"[ - _ - ]", uR"([\-_])"}, + {u"[ - + - ]", uR"([+\-])"}, + {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"}, + {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"}, + // A property-query or named-element is kept as-is: + {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"}, + {uR"(\p{P})", uR"(\p{P})"}, + {uR"(\p{gc=P})", uR"(\p{gc=P})"}, + {uR"([: general category = punctuation :])", uR"([: general category = punctuation :])"}, + {uR"([: ^general category = punctuation :])", uR"([: ^general category = punctuation :])"}, + {uR"(\P{ gc = punctuation })", uR"(\P{ gc = punctuation })"}, + {uR"(\N{ latin small letter a })", uR"(\N{ latin small letter a })"}, + // If there is any Restriction among the terms, its syntax is mostly as-is (spaces are + // still eliminated), with the exception that an initial UnescapedHyphenMinus gets escaped. + // This is applied recursively, so innermost ranges-only UnicodeSets get normalized. 
+ {u"[ c-z a-b [c-f g-z] ]", u"[c-za-b[c-z]]"}, + {u"[- + c-z a-b [c-f g-z] -]", uR"([\-+c-za-b[c-z]-])"}, + {uR"([ c-z a-b \p{ General_Category = Punctuation } ])", + uR"([c-za-b\p{ General_Category = Punctuation }])"}, + {u"[^[c]]", uR"([^[c]])"}, + {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"}, + // Spaces are eliminated within a string-literal even when the syntax is preserved. + {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"}, + // Escapes are removed even when the syntax is preserved. + {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])", + u"[{Zeichenkette}[]Zeichenmenge]"}, + // A named-element is currently a nested set, so it is preserved and causes the syntax to be + // preserved. + {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"}, + }) { + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, errorCode); + UnicodeString actual; + if (U_FAILURE(errorCode)) { + errln(u"Failed to parse " + expression + u": " + u_errorName(errorCode)); + } else if (set.toPattern(actual) != expected) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expected + ", got " + + actual); + } + } +} + void UnicodeSetTest::TestParseErrors() { for (const auto expression : std::vector{ // Java error message: "Char expected after operator". 
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index 4c5b55a329bb..692aa8b9e84d 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -110,6 +110,7 @@ class UnicodeSetTest: public IntlTest { void TestStringIterator(); void TestElementIterator(); + void TestToPatternOutput(); void TestParseErrors(); private: From 6bd042524cf39ec264b6b1eca5e6e26ef73e0cba Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 14 Aug 2025 15:51:47 +0200 Subject: [PATCH 28/56] ICU-23179 Test various edge cases with $ in the absence of variables --- icu4c/source/test/intltest/usettest.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 9e0e66fac3b1..89a15dcc489d 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -4353,6 +4353,12 @@ void UnicodeSetTest::TestToPatternOutput() { {u"[ - + - ]", uR"([+\-])"}, {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"}, {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"}, + {u"[$d-za-c]", uR"([\$a-z])"}, + {u"[a-c$d-z]", uR"([\$a-z])"}, + {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"}, + {u"[!-$z]", uR"([!-\$z])"}, + {u"[-a-cd-z$-]", uR"([\$\-a-z])"}, + {u"[-$-]", uR"([\$\-])"}, // A property-query or named-element is kept as-is: {uR"(\p{ General_Category = Punctuation })", uR"(\p{ General_Category = Punctuation })"}, {uR"(\p{P})", uR"(\p{P})"}, @@ -4370,6 +4376,7 @@ void UnicodeSetTest::TestToPatternOutput() { uR"([c-za-b\p{ General_Category = Punctuation }])"}, {u"[^[c]]", uR"([^[c]])"}, {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"}, + {u"[$[]]", uR"([\$[]])"}, // Spaces are eliminated within a string-literal even when the syntax is preserved. 
{u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"}, // Escapes are removed even when the syntax is preserved. @@ -4378,6 +4385,10 @@ void UnicodeSetTest::TestToPatternOutput() { // A named-element is currently a nested set, so it is preserved and causes the syntax to be // preserved. {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([\N{LATIN CAPITAL LETTER Z}eichenmenge])"}, + // An anchor also causes the syntax to be preserved. + {u"[ d-z a-c $ ]", u"[d-za-c$]"}, + {u"[ - a-c d-z $ ]", uR"([\-a-cd-z$])"}, + {u"[$$$]", uR"([\$\$$])"}, }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode); @@ -4416,6 +4427,7 @@ void UnicodeSetTest::TestParseErrors() { u"[{aa]", // "Unquoted '$'". u"[a-$]", + u"[!-$]", // "Invalid range". u"[a-a]", // TODO(egg): Exclude in PDUTS61. u"[z-a]", From d6fc731e0ef8bce057cf77e30011fa3640c69be9 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 18 Aug 2025 14:21:59 +0200 Subject: [PATCH 29/56] meow --- icu4c/source/test/intltest/usettest.cpp | 135 +++++++++++++----------- 1 file changed, 74 insertions(+), 61 deletions(-) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 06de9e315aac..4d52c95d0e4a 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -1823,27 +1823,35 @@ void UnicodeSetTest::TestSymbolTable() { logln(UnicodeString("Ok, got ") + us.toPattern(a, true)); } } - for (const auto &[variables, expression, expectedErrorCode, expectedPattern] : - std::vector>, - std::u16string_view, UErrorCode, std::u16string_view>>{ - // You should not do this, but it works. - {{{u"privateUseOrUnassigned", u"[[:Co:][:Cn:]"}, {u"close", u"]"}}, - u"$privateUseOrUnassigned$close", - U_ZERO_ERROR, - u"[[:Co:][:Cn:]]"}, - // This works and it is fine. - {{{u"privateUse", u"[[:Co:]]"}}, u"$privateUse", U_ZERO_ERROR, u"[[:Co:]]"}, - // This should work! But it does not. 
Note the doubled brackets on the one that works above. - // We are not yet inside the variable when we call lookahead(), so we try to parse - // $privateUse rather than [:Co:]. - {{{u"privateUse", u"[:Co:]"}}, u"[$privateUse]", U_ILLEGAL_ARGUMENT_ERROR, u"[]"}, - // This should not work, and it does not (we try to parse [$sad$surprised] as a - // property-query). - {{{u"sad", u":C"}, {u"surprised", u"o:"}}, - u"[$sad$surprised]", - U_ILLEGAL_ARGUMENT_ERROR, - u"[]"}, - }) { + struct TestCase { + struct Variable { + std::u16string_view name; + std::u16string_view value; + }; + std::vector variables; + std::u16string_view expression; + UErrorCode expectedErrorCode; + std::u16string_view expectedPattern; + }; + for (const auto &[variables, expression, expectedErrorCode, expectedPattern] : std::vector{ + // You should not do this, but it works. + {{{u"privateUseOrUnassigned", u"[[:Co:][:Cn:]"}, {u"close", u"]"}}, + u"$privateUseOrUnassigned$close", + U_ZERO_ERROR, + u"[[:Co:][:Cn:]]"}, + // This works and it is fine. + {{{u"privateUse", u"[[:Co:]]"}}, u"$privateUse", U_ZERO_ERROR, u"[[:Co:]]"}, + // This should work! But it does not. Note the doubled brackets on the one that works above. + // We are not yet inside the variable when we call lookahead(), so we try to parse + // $privateUse rather than [:Co:]. + {{{u"privateUse", u"[:Co:]"}}, u"[$privateUse]", U_ILLEGAL_ARGUMENT_ERROR, u"[]"}, + // This should not work, and it does not (we try to parse [$sad$surprised] as a + // property-query). 
+ {{{u"sad", u":C"}, {u"surprised", u"o:"}}, + u"[$sad$surprised]", + U_ILLEGAL_ARGUMENT_ERROR, + u"[]"}, + }) { UErrorCode errorCode = U_ZERO_ERROR; TokenSymbolTable symbols(errorCode); if (U_FAILURE(errorCode)) { @@ -1899,20 +1907,26 @@ void UnicodeSetTest::TestLookupSymbolTable() { symbols.add(u'0', UnicodeSet(u"[ a-z ]", errorCode)); symbols.add(u'1', UnicodeSet(u"[ b-c ]", errorCode)); symbols.add(u'2', UnicodeSet(u"[: Co :]", errorCode)); + struct TestCase { + std::u16string_view expression; + UErrorCode expectedErrorCode; + std::u16string_view expectedPattern; + std::u16string_view expectedRegeneratedPattern; + }; for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : - std::vector>{ - {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"}, - {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]"}, - {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]"}, - // Substitution of lookupMatcher symbols takes place after de-escaping. - {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]"}, - // It does not take place in string literals. - {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]"}, - {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]"}, - {uR"([ 21 ])", U_ZERO_ERROR, u"[[: Co :][bc]]", - u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"}, - {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]"}, - }) { + std::vector{ + {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"}, + {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]"}, + {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]"}, + // Substitution of lookupMatcher symbols takes place after unescaping. + {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]"}, + // It does not take place in string literals. 
+ {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]"}, + {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]"}, + {uR"([ 21 ])", U_ZERO_ERROR, u"[[: Co :][bc]]", + u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"}, + {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]"}, + }) { UnicodeString actual; UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); @@ -1942,33 +1956,32 @@ void UnicodeSetTest::TestLookupSymbolTable() { symbols.add(u'}', UnicodeSet(u"[{rightCurlyBracket}]", errorCode)); symbols.add(u'$', UnicodeSet(u"[{dollarSign}]", errorCode)); for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : - std::vector< - std::tuple>{ - {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"}, - {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"}, - // The hyphen no longer works as set difference. - {u"[0-1]", U_ZERO_ERROR, u"[[a-z][{hyphenMinus}][bc]]", u"[a-z{hyphenMinus}]"}, - {u"[!-0]", U_ZERO_ERROR, u"[![{hyphenMinus}][a-z]]", u"[!a-z{hyphenMinus}]"}, - // String literals no longer work. - {uR"([!-/{0}])", U_ZERO_ERROR, - u"[![{hyphenMinus}]/[{leftCurlyBracket}][a-z][{rightCurlyBracket}]]", - u"[!/a-z{hyphenMinus}{leftCurlyBracket}{rightCurlyBracket}]"}, - // The ampersand no longer works as set difference. - {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :][{ampersand}][bc]]", - u"[bc-󰀀-󿿽􀀀-􏿽{ampersand}]"}, - // Complementing still works. - {uR"([^ \u0000 ])", U_ZERO_ERROR, uR"([\u0001-\U0010FFFF])", - uR"([\u0001-\U0010FFFF])"}, - // ^ elsewhere becomes a symbol rather than a syntax error. - {uR"([\u0000 ^ -])", U_ZERO_ERROR, uR"([\u0000[{circumflexAccent}][{hyphenMinus}]])", - uR"([\u0000{circumflexAccent}{hyphenMinus}])"}, - // Opening brackets still work. - {uR"([^ [ [^] ] ])", U_ZERO_ERROR, uR"([^[[\u0000-\U0010FFFF]]])", uR"([])"}, - // The only way to access the [ symbol is via escaping. 
- {uR"([ \[ ])", U_ZERO_ERROR, uR"([[{leftSquareBracket}]])", uR"([{leftSquareBracket}])"}, - // Anchors are gone. - {uR"([$])", U_ZERO_ERROR, uR"([[{dollarSign}]])", uR"([{dollarSign}])"}, - }) { + std::vector{ + {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"}, + {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"}, + // The hyphen no longer works as set difference. + {u"[0-1]", U_ZERO_ERROR, u"[[a-z][{hyphenMinus}][bc]]", u"[a-z{hyphenMinus}]"}, + {u"[!-0]", U_ZERO_ERROR, u"[![{hyphenMinus}][a-z]]", u"[!a-z{hyphenMinus}]"}, + // String literals no longer work. + {uR"([!-/{0}])", U_ZERO_ERROR, + u"[![{hyphenMinus}]/[{leftCurlyBracket}][a-z][{rightCurlyBracket}]]", + u"[!/a-z{hyphenMinus}{leftCurlyBracket}{rightCurlyBracket}]"}, + // The ampersand no longer works as set difference. + {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :][{ampersand}][bc]]", + u"[bc-󰀀-󿿽􀀀-􏿽{ampersand}]"}, + // Complementing still works. + {uR"([^ \u0000 ])", U_ZERO_ERROR, uR"([\u0001-\U0010FFFF])", + uR"([\u0001-\U0010FFFF])"}, + // ^ elsewhere becomes a symbol rather than a syntax error. + {uR"([\u0000 ^ -])", U_ZERO_ERROR, uR"([\u0000[{circumflexAccent}][{hyphenMinus}]])", + uR"([\u0000{circumflexAccent}{hyphenMinus}])"}, + // Opening brackets still work. + {uR"([^ [ [^] ] ])", U_ZERO_ERROR, uR"([^[[\u0000-\U0010FFFF]]])", uR"([])"}, + // The only way to access the [ symbol is via escaping. + {uR"([ \[ ])", U_ZERO_ERROR, uR"([[{leftSquareBracket}]])", uR"([{leftSquareBracket}])"}, + // Anchors are gone. 
+ {uR"([$])", U_ZERO_ERROR, uR"([[{dollarSign}]])", uR"([{dollarSign}])"}, + }) { UnicodeString actual; UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); From 83bf69b486f0a629ea4d06845e3aab5555c400de Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 18 Aug 2025 14:23:32 +0200 Subject: [PATCH 30/56] ICU-23179 Test UnicodeSet with lookupMatcher --- icu4c/source/test/intltest/usettest.cpp | 216 +++++++++++++++++++++++- icu4c/source/test/intltest/usettest.h | 1 + 2 files changed, 216 insertions(+), 1 deletion(-) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 016d3f85e63d..4d52c95d0e4a 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -93,6 +94,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestEscapePattern); TESTCASE_AUTO(TestInvalidCodePoint); TESTCASE_AUTO(TestSymbolTable); + TESTCASE_AUTO(TestLookupSymbolTable); TESTCASE_AUTO(TestSurrogate); TESTCASE_AUTO(TestPosixClasses); TESTCASE_AUTO(TestIteration); @@ -1753,10 +1755,20 @@ void UnicodeSetTest::TestSymbolTable() { // Multiple test cases can be set up here. Each test case // is terminated by null: // var, value, var, value,..., input pat., exp. 
output pat., null - const char* DATA[] = { + const char *DATA[] = { "us", "a-z", "[0-1$us]", "[0-1a-z]", nullptr, "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", nullptr, "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", nullptr, + // Things that probably should not work, but currently do: + "open", "[", "$open a-z]", "[a-z]", nullptr, + "open", "[", "close", "]", "hyphenMinus", "-", + "[ $open a $hyphenMinus z] $hyphenMinus [ c-z $close $hyphenMinus ]", + "[[a-z]-[c-z]-]", nullptr, + "string", "{", "end", "}", "[ $string Zeichenkette $end ]", "[{Zeichenkette}]", nullptr, + "privateUse", "[[:Co:]]", "$privateUse", "[[:Co:]]", nullptr, + "smiling", ":-]", "laughing", ":-D", + "[ {$smiling} $laughing $smiling", + R"([\-\:-D{\:\-\]}])", nullptr, nullptr }; @@ -1811,6 +1823,208 @@ void UnicodeSetTest::TestSymbolTable() { logln(UnicodeString("Ok, got ") + us.toPattern(a, true)); } } + struct TestCase { + struct Variable { + std::u16string_view name; + std::u16string_view value; + }; + std::vector variables; + std::u16string_view expression; + UErrorCode expectedErrorCode; + std::u16string_view expectedPattern; + }; + for (const auto &[variables, expression, expectedErrorCode, expectedPattern] : std::vector{ + // You should not do this, but it works. + {{{u"privateUseOrUnassigned", u"[[:Co:][:Cn:]"}, {u"close", u"]"}}, + u"$privateUseOrUnassigned$close", + U_ZERO_ERROR, + u"[[:Co:][:Cn:]]"}, + // This works and it is fine. + {{{u"privateUse", u"[[:Co:]]"}}, u"$privateUse", U_ZERO_ERROR, u"[[:Co:]]"}, + // This should work! But it does not. Note the doubled brackets on the one that works above. + // We are not yet inside the variable when we call lookahead(), so we try to parse + // $privateUse rather than [:Co:]. + {{{u"privateUse", u"[:Co:]"}}, u"[$privateUse]", U_ILLEGAL_ARGUMENT_ERROR, u"[]"}, + // This should not work, and it does not (we try to parse [$sad$surprised] as a + // property-query). 
+ {{{u"sad", u":C"}, {u"surprised", u"o:"}}, + u"[$sad$surprised]", + U_ILLEGAL_ARGUMENT_ERROR, + u"[]"}, + }) { + UErrorCode errorCode = U_ZERO_ERROR; + TokenSymbolTable symbols(errorCode); + if (U_FAILURE(errorCode)) { + errln("FAIL: Couldn’t construct symbol table"); + continue; + } + for (const auto &[name, value] : variables) { + symbols.add(name, value, errorCode); + if (U_FAILURE(errorCode)) { + errln("FAIL: Couldn’t add variable " + name); + continue; + } + } + const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); + if (errorCode != expectedErrorCode) { + errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " + + u_errorName(errorCode)); + } + UnicodeString actual; + if (set.toPattern(actual) != expectedPattern) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern + + ", got " + actual); + } + } +} + +void UnicodeSetTest::TestLookupSymbolTable() { + UErrorCode errorCode = U_ZERO_ERROR; + class TestSymbolTable : public SymbolTable { + public: + const UnicodeString *lookup(const UnicodeString &) const override { + return nullptr; + } + + const UnicodeFunctor *lookupMatcher(UChar32 c) const override { + return symbols_.find(c) != symbols_.end() ? 
&symbols_.at(c) + : nullptr; + } + + virtual UnicodeString parseReference(const UnicodeString &, ParsePosition &, + int32_t) const override { + return u""; + } + + void add(UChar32 c, UnicodeSet set) { + symbols_[c] = set; + } + + private: + std::unordered_map symbols_; + }; + TestSymbolTable symbols; + symbols.add(u'0', UnicodeSet(u"[ a-z ]", errorCode)); + symbols.add(u'1', UnicodeSet(u"[ b-c ]", errorCode)); + symbols.add(u'2', UnicodeSet(u"[: Co :]", errorCode)); + struct TestCase { + std::u16string_view expression; + UErrorCode expectedErrorCode; + std::u16string_view expectedPattern; + std::u16string_view expectedRegeneratedPattern; + }; + for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : + std::vector{ + {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"}, + {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]"}, + {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]"}, + // Substitution of lookupMatcher symbols takes place after unescaping. + {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]"}, + // It does not take place in string literals. 
+ {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]"}, + {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]"}, + {uR"([ 21 ])", U_ZERO_ERROR, u"[[: Co :][bc]]", + u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"}, + {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]"}, + }) { + UnicodeString actual; + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); + if (errorCode != expectedErrorCode) { + errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " + + u_errorName(errorCode)); + } + if (set.toPattern(actual) != expectedPattern) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern + + ", got " + actual); + } + if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) { + errln(u"UnicodeSet(R\"(" + expression + + u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern + + ", got " + actual); + } + } + // Test what happens when we define syntax characters as symbols. It is an extraordinarily bad idea + // to rely on this behaviour, but since it has been around since ICU 2.8, we probably should not + // change it unknowingly. + symbols.add(u'-', UnicodeSet(u"[{hyphenMinus}]", errorCode)); + symbols.add(u'&', UnicodeSet(u"[{ampersand}]", errorCode)); + // This one is never used, except if escaped. 
+ symbols.add(u'[', UnicodeSet(u"[{leftSquareBracket}]", errorCode)); + symbols.add(u'^', UnicodeSet(u"[{circumflexAccent}]", errorCode)); + symbols.add(u'{', UnicodeSet(u"[{leftCurlyBracket}]", errorCode)); + symbols.add(u'}', UnicodeSet(u"[{rightCurlyBracket}]", errorCode)); + symbols.add(u'$', UnicodeSet(u"[{dollarSign}]", errorCode)); + for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : + std::vector{ + {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"}, + {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"}, + // The hyphen no longer works as set difference. + {u"[0-1]", U_ZERO_ERROR, u"[[a-z][{hyphenMinus}][bc]]", u"[a-z{hyphenMinus}]"}, + {u"[!-0]", U_ZERO_ERROR, u"[![{hyphenMinus}][a-z]]", u"[!a-z{hyphenMinus}]"}, + // String literals no longer work. + {uR"([!-/{0}])", U_ZERO_ERROR, + u"[![{hyphenMinus}]/[{leftCurlyBracket}][a-z][{rightCurlyBracket}]]", + u"[!/a-z{hyphenMinus}{leftCurlyBracket}{rightCurlyBracket}]"}, + // The ampersand no longer works as set difference. + {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :][{ampersand}][bc]]", + u"[bc-󰀀-󿿽􀀀-􏿽{ampersand}]"}, + // Complementing still works. + {uR"([^ \u0000 ])", U_ZERO_ERROR, uR"([\u0001-\U0010FFFF])", + uR"([\u0001-\U0010FFFF])"}, + // ^ elsewhere becomes a symbol rather than a syntax error. + {uR"([\u0000 ^ -])", U_ZERO_ERROR, uR"([\u0000[{circumflexAccent}][{hyphenMinus}]])", + uR"([\u0000{circumflexAccent}{hyphenMinus}])"}, + // Opening brackets still work. + {uR"([^ [ [^] ] ])", U_ZERO_ERROR, uR"([^[[\u0000-\U0010FFFF]]])", uR"([])"}, + // The only way to access the [ symbol is via escaping. + {uR"([ \[ ])", U_ZERO_ERROR, uR"([[{leftSquareBracket}]])", uR"([{leftSquareBracket}])"}, + // Anchors are gone. 
+ {uR"([$])", U_ZERO_ERROR, uR"([[{dollarSign}]])", uR"([{dollarSign}])"}, + }) { + UnicodeString actual; + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); + if (errorCode != expectedErrorCode) { + errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " + + u_errorName(errorCode)); + } + if (set.toPattern(actual) != expectedPattern) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern + + ", got " + actual); + } + if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) { + errln(u"UnicodeSet(R\"(" + expression + + u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern + + ", got " + actual); + } + } + // If ] is defined as a symbol, everything breaks except a lone symbol or property-query, and the + // constructor returns an error but not an empty set. Don’t do that. + symbols.add(u']', UnicodeSet(u"[{rightSquareBracket}]", errorCode)); + for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : + std::vector< + std::tuple>{ + {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"}, + {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"}, + }) { + UnicodeString actual; + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); + if (errorCode != expectedErrorCode) { + errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " + + u_errorName(errorCode)); + } + if (set.toPattern(actual) != expectedPattern) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern + + ", got " + actual); + } + if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) { + errln(u"UnicodeSet(R\"(" + expression + + 
u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern + + ", got " + actual); + } + } } void UnicodeSetTest::TestSurrogate() { diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index 2ac22ba72e62..32abf828a30a 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -84,6 +84,7 @@ class UnicodeSetTest: public IntlTest { void TestInvalidCodePoint(); void TestSymbolTable(); + void TestLookupSymbolTable(); void TestSurrogate(); From ed395a63ccd64c7cfe65f21da4495385a0ba8d02 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 18 Aug 2025 14:27:08 +0200 Subject: [PATCH 31/56] meow --- icu4c/source/test/intltest/usettest.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 4d52c95d0e4a..a5c249d4da37 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -2003,11 +2003,10 @@ void UnicodeSetTest::TestLookupSymbolTable() { // constructor returns an error but not an empty set. Don’t do that. 
symbols.add(u']', UnicodeSet(u"[{rightSquareBracket}]", errorCode)); for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : - std::vector< - std::tuple>{ - {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"}, - {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"}, - }) { + std::vector{ + {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"}, + {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"}, + }) { UnicodeString actual; UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); From 3a4ab4575839f94200b9b2287a73d69065282933 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 18 Aug 2025 14:23:32 +0200 Subject: [PATCH 32/56] ICU-23179 Test UnicodeSet with lookupMatcher --- icu4c/source/test/intltest/usettest.cpp | 215 +++++++++++++++++++++++- icu4c/source/test/intltest/usettest.h | 1 + 2 files changed, 215 insertions(+), 1 deletion(-) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 016d3f85e63d..a5c249d4da37 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -93,6 +94,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, TESTCASE_AUTO(TestEscapePattern); TESTCASE_AUTO(TestInvalidCodePoint); TESTCASE_AUTO(TestSymbolTable); + TESTCASE_AUTO(TestLookupSymbolTable); TESTCASE_AUTO(TestSurrogate); TESTCASE_AUTO(TestPosixClasses); TESTCASE_AUTO(TestIteration); @@ -1753,10 +1755,20 @@ void UnicodeSetTest::TestSymbolTable() { // Multiple test cases can be set up here. Each test case // is terminated by null: // var, value, var, value,..., input pat., exp. 
output pat., null - const char* DATA[] = { + const char *DATA[] = { "us", "a-z", "[0-1$us]", "[0-1a-z]", nullptr, "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", nullptr, "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", nullptr, + // Things that probably should not work, but currently do: + "open", "[", "$open a-z]", "[a-z]", nullptr, + "open", "[", "close", "]", "hyphenMinus", "-", + "[ $open a $hyphenMinus z] $hyphenMinus [ c-z $close $hyphenMinus ]", + "[[a-z]-[c-z]-]", nullptr, + "string", "{", "end", "}", "[ $string Zeichenkette $end ]", "[{Zeichenkette}]", nullptr, + "privateUse", "[[:Co:]]", "$privateUse", "[[:Co:]]", nullptr, + "smiling", ":-]", "laughing", ":-D", + "[ {$smiling} $laughing $smiling", + R"([\-\:-D{\:\-\]}])", nullptr, nullptr }; @@ -1811,6 +1823,207 @@ void UnicodeSetTest::TestSymbolTable() { logln(UnicodeString("Ok, got ") + us.toPattern(a, true)); } } + struct TestCase { + struct Variable { + std::u16string_view name; + std::u16string_view value; + }; + std::vector variables; + std::u16string_view expression; + UErrorCode expectedErrorCode; + std::u16string_view expectedPattern; + }; + for (const auto &[variables, expression, expectedErrorCode, expectedPattern] : std::vector{ + // You should not do this, but it works. + {{{u"privateUseOrUnassigned", u"[[:Co:][:Cn:]"}, {u"close", u"]"}}, + u"$privateUseOrUnassigned$close", + U_ZERO_ERROR, + u"[[:Co:][:Cn:]]"}, + // This works and it is fine. + {{{u"privateUse", u"[[:Co:]]"}}, u"$privateUse", U_ZERO_ERROR, u"[[:Co:]]"}, + // This should work! But it does not. Note the doubled brackets on the one that works above. + // We are not yet inside the variable when we call lookahead(), so we try to parse + // $privateUse rather than [:Co:]. + {{{u"privateUse", u"[:Co:]"}}, u"[$privateUse]", U_ILLEGAL_ARGUMENT_ERROR, u"[]"}, + // This should not work, and it does not (we try to parse [$sad$surprised] as a + // property-query). 
+ {{{u"sad", u":C"}, {u"surprised", u"o:"}}, + u"[$sad$surprised]", + U_ILLEGAL_ARGUMENT_ERROR, + u"[]"}, + }) { + UErrorCode errorCode = U_ZERO_ERROR; + TokenSymbolTable symbols(errorCode); + if (U_FAILURE(errorCode)) { + errln("FAIL: Couldn’t construct symbol table"); + continue; + } + for (const auto &[name, value] : variables) { + symbols.add(name, value, errorCode); + if (U_FAILURE(errorCode)) { + errln("FAIL: Couldn’t add variable " + name); + continue; + } + } + const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); + if (errorCode != expectedErrorCode) { + errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " + + u_errorName(errorCode)); + } + UnicodeString actual; + if (set.toPattern(actual) != expectedPattern) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern + + ", got " + actual); + } + } +} + +void UnicodeSetTest::TestLookupSymbolTable() { + UErrorCode errorCode = U_ZERO_ERROR; + class TestSymbolTable : public SymbolTable { + public: + const UnicodeString *lookup(const UnicodeString &) const override { + return nullptr; + } + + const UnicodeFunctor *lookupMatcher(UChar32 c) const override { + return symbols_.find(c) != symbols_.end() ? 
&symbols_.at(c) + : nullptr; + } + + virtual UnicodeString parseReference(const UnicodeString &, ParsePosition &, + int32_t) const override { + return u""; + } + + void add(UChar32 c, UnicodeSet set) { + symbols_[c] = set; + } + + private: + std::unordered_map symbols_; + }; + TestSymbolTable symbols; + symbols.add(u'0', UnicodeSet(u"[ a-z ]", errorCode)); + symbols.add(u'1', UnicodeSet(u"[ b-c ]", errorCode)); + symbols.add(u'2', UnicodeSet(u"[: Co :]", errorCode)); + struct TestCase { + std::u16string_view expression; + UErrorCode expectedErrorCode; + std::u16string_view expectedPattern; + std::u16string_view expectedRegeneratedPattern; + }; + for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : + std::vector{ + {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"}, + {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]"}, + {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]"}, + // Substitution of lookupMatcher symbols takes place after unescaping. + {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]"}, + // It does not take place in string literals. 
+ {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]"}, + {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]"}, + {uR"([ 21 ])", U_ZERO_ERROR, u"[[: Co :][bc]]", + u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"}, + {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]"}, + }) { + UnicodeString actual; + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); + if (errorCode != expectedErrorCode) { + errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " + + u_errorName(errorCode)); + } + if (set.toPattern(actual) != expectedPattern) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern + + ", got " + actual); + } + if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) { + errln(u"UnicodeSet(R\"(" + expression + + u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern + + ", got " + actual); + } + } + // Test what happens when we define syntax characters as symbols. It is an extraordinarily bad idea + // to rely on this behaviour, but since it has been around since ICU 2.8, we probably should not + // change it unknowingly. + symbols.add(u'-', UnicodeSet(u"[{hyphenMinus}]", errorCode)); + symbols.add(u'&', UnicodeSet(u"[{ampersand}]", errorCode)); + // This one is never used, except if escaped. 
+ symbols.add(u'[', UnicodeSet(u"[{leftSquareBracket}]", errorCode)); + symbols.add(u'^', UnicodeSet(u"[{circumflexAccent}]", errorCode)); + symbols.add(u'{', UnicodeSet(u"[{leftCurlyBracket}]", errorCode)); + symbols.add(u'}', UnicodeSet(u"[{rightCurlyBracket}]", errorCode)); + symbols.add(u'$', UnicodeSet(u"[{dollarSign}]", errorCode)); + for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : + std::vector{ + {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"}, + {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"}, + // The hyphen no longer works as set difference. + {u"[0-1]", U_ZERO_ERROR, u"[[a-z][{hyphenMinus}][bc]]", u"[a-z{hyphenMinus}]"}, + {u"[!-0]", U_ZERO_ERROR, u"[![{hyphenMinus}][a-z]]", u"[!a-z{hyphenMinus}]"}, + // String literals no longer work. + {uR"([!-/{0}])", U_ZERO_ERROR, + u"[![{hyphenMinus}]/[{leftCurlyBracket}][a-z][{rightCurlyBracket}]]", + u"[!/a-z{hyphenMinus}{leftCurlyBracket}{rightCurlyBracket}]"}, + // The ampersand no longer works as set difference. + {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :][{ampersand}][bc]]", + u"[bc-󰀀-󿿽􀀀-􏿽{ampersand}]"}, + // Complementing still works. + {uR"([^ \u0000 ])", U_ZERO_ERROR, uR"([\u0001-\U0010FFFF])", + uR"([\u0001-\U0010FFFF])"}, + // ^ elsewhere becomes a symbol rather than a syntax error. + {uR"([\u0000 ^ -])", U_ZERO_ERROR, uR"([\u0000[{circumflexAccent}][{hyphenMinus}]])", + uR"([\u0000{circumflexAccent}{hyphenMinus}])"}, + // Opening brackets still work. + {uR"([^ [ [^] ] ])", U_ZERO_ERROR, uR"([^[[\u0000-\U0010FFFF]]])", uR"([])"}, + // The only way to access the [ symbol is via escaping. + {uR"([ \[ ])", U_ZERO_ERROR, uR"([[{leftSquareBracket}]])", uR"([{leftSquareBracket}])"}, + // Anchors are gone. 
+ {uR"([$])", U_ZERO_ERROR, uR"([[{dollarSign}]])", uR"([{dollarSign}])"}, + }) { + UnicodeString actual; + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); + if (errorCode != expectedErrorCode) { + errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " + + u_errorName(errorCode)); + } + if (set.toPattern(actual) != expectedPattern) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern + + ", got " + actual); + } + if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) { + errln(u"UnicodeSet(R\"(" + expression + + u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern + + ", got " + actual); + } + } + // If ] is defined as a symbol, everything breaks except a lone symbol or property-query, and the + // constructor returns an error but not an empty set. Don’t do that. + symbols.add(u']', UnicodeSet(u"[{rightSquareBracket}]", errorCode)); + for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : + std::vector{ + {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"}, + {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"}, + }) { + UnicodeString actual; + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); + if (errorCode != expectedErrorCode) { + errln(u"Parsing " + expression + u": Expected " + u_errorName(expectedErrorCode) + ", got " + + u_errorName(errorCode)); + } + if (set.toPattern(actual) != expectedPattern) { + errln(u"UnicodeSet(R\"(" + expression + u")\").toPattern() expected " + expectedPattern + + ", got " + actual); + } + if (UnicodeSet(set).complement().complement().toPattern(actual) != expectedRegeneratedPattern) { + errln(u"UnicodeSet(R\"(" + expression + + u")\").complement().complement().toPattern() 
expected " + expectedRegeneratedPattern + + ", got " + actual); + } + } } void UnicodeSetTest::TestSurrogate() { diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index 2ac22ba72e62..32abf828a30a 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -84,6 +84,7 @@ class UnicodeSetTest: public IntlTest { void TestInvalidCodePoint(); void TestSymbolTable(); + void TestLookupSymbolTable(); void TestSurrogate(); From d5e73a8a62a7525777b60160e2982b7643936166 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 18 Aug 2025 18:11:33 +0200 Subject: [PATCH 33/56] ICU-23179 Test the exact sequence of lookups --- icu4c/source/test/intltest/usettest.cpp | 136 +++++++++++++++++++----- 1 file changed, 110 insertions(+), 26 deletions(-) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index a5c249d4da37..5d1cf77247b7 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -29,9 +30,11 @@ #include "unicode/symtable.h" #include "unicode/utf8.h" #include "unicode/utf16.h" +#include "unicode/utfiterator.h" #include "unicode/uversion.h" #include "cmemory.h" #include "hash.h" +#include #define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \ if (U_FAILURE(status)) { \ @@ -1880,53 +1883,119 @@ void UnicodeSetTest::TestSymbolTable() { void UnicodeSetTest::TestLookupSymbolTable() { UErrorCode errorCode = U_ZERO_ERROR; + struct TestCase { + struct Variable { + std::u16string_view name; + std::u16string_view value; + }; + std::u16string_view expression; + UErrorCode expectedErrorCode; + std::u16string_view expectedPattern; + std::u16string_view expectedRegeneratedPattern; + // Hyrum’s law at work: Some users (RBBI) depend on the sequencing of `lookup` and + // `lookupMatcher` calls, so we test that. 
+ std::vector> expectedLookups; + // Variables for `lookup`. + std::vector variables; + }; class TestSymbolTable : public SymbolTable { public: - const UnicodeString *lookup(const UnicodeString &) const override { - return nullptr; + const UnicodeString *lookup(const UnicodeString &name) const override { + auto it = variables_.find(name); + lookupTrace_.push_back(name); + return it == variables_.end() ? nullptr : &it->second; } const UnicodeFunctor *lookupMatcher(UChar32 c) const override { + lookupTrace_.push_back(c); return symbols_.find(c) != symbols_.end() ? &symbols_.at(c) : nullptr; } - virtual UnicodeString parseReference(const UnicodeString &, ParsePosition &, - int32_t) const override { - return u""; + virtual UnicodeString parseReference(const UnicodeString &text, ParsePosition &pos, + int32_t limit) const override { + const auto limitedText = std::u16string_view(text).substr(pos.getIndex(), limit); + for (auto codeUnits : header::utfStringCodePoints(limitedText)) { + if (!u_isIDPart(codeUnits.codePoint())) { + pos.setIndex(pos.getIndex() + (codeUnits.begin() - limitedText.begin())); + // TODO(egg): In C++20, this could use the two-iterator constructor of + // std::u16string_view. 
+ return limitedText.substr(0, codeUnits.begin() - limitedText.begin()); + } + } + pos.setIndex(limit); + return limitedText; } void add(UChar32 c, UnicodeSet set) { symbols_[c] = set; } + void setVariables(const std::vector& variables) { + for (const auto &[name, value] : variables) { + variables_[name] = value; + } + } + + const std::vector>& getLookupTrace() const { + return lookupTrace_; + } + + void clearLookupTrace() { + lookupTrace_.clear(); + } + private: std::unordered_map symbols_; + std::map variables_; + mutable std::vector> lookupTrace_; }; TestSymbolTable symbols; symbols.add(u'0', UnicodeSet(u"[ a-z ]", errorCode)); symbols.add(u'1', UnicodeSet(u"[ b-c ]", errorCode)); symbols.add(u'2', UnicodeSet(u"[: Co :]", errorCode)); - struct TestCase { - std::u16string_view expression; - UErrorCode expectedErrorCode; - std::u16string_view expectedPattern; - std::u16string_view expectedRegeneratedPattern; - }; - for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : - std::vector{ - {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"}, - {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]"}, - {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]"}, + for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern, + expectedLookups, variables] : std::vector{ + {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]", {u'0'}}, + {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]", {u'0', u'-', u'1', u']'}}, + {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'-', u'0'}}, + // A call to lookupMatcher with the first character of the content of a variable happens + // immediately after a corresponding call to lookup, although we may lookup the variable + // several times before we call lookupMatcher. 
+ {u"[0-$one]", + U_ZERO_ERROR, + u"[[a-z]-[bc]]", + u"[ad-z]", + {u'0', u'-', u"one", u"one", u'1', u']'}, + {{u"zero", u"0"}, {u"one", u"1"}}}, + {u"[$zero-$one]", + U_ZERO_ERROR, + u"[[a-z]-[bc]]", + u"[ad-z]", + {u"zero", u"zero", u"zero", u"zero", u'0', u'-', u"one", u"one", u'1', u']'}, + {{u"zero", u"0"}, {u"one", u"1"}}}, + // If the variable expands to multiple symbols, only the first one is sequenced right after + // the variable lookup. + {u"[$ten]", + U_ZERO_ERROR, + u"[[bc][a-z]]", + u"[a-z]", + {u"ten", u"ten", u"ten", u"ten", u'1', u'0', u']'}, + {{u"ten", u"10"}}}, // Substitution of lookupMatcher symbols takes place after unescaping. - {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]"}, + {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'-', u'0'}}, // It does not take place in string literals. - {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]"}, - {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]"}, - {uR"([ 21 ])", U_ZERO_ERROR, u"[[: Co :][bc]]", - u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"}, - {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]"}, + {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]", {u'!', u'-', u'/', u'{', u']'}}, + {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]", {u'2', u'&', u'1', u']'}}, + {uR"([ 21 ])", + U_ZERO_ERROR, + u"[[: Co :][bc]]", + u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]", + {u'2', u'1', u']'}}, + {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]", {u'a', u'-', u'b', u'1', u']'}}, }) { + symbols.setVariables(variables); + symbols.clearLookupTrace(); UnicodeString actual; UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); @@ -1943,6 +2012,21 @@ void UnicodeSetTest::TestLookupSymbolTable() { u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern + ", got " + actual); } + if (symbols.getLookupTrace() != expectedLookups) { + UnicodeString expected; + 
UnicodeString actual; + for (const auto &l : expectedLookups) { + expected += std::holds_alternative(l) + ? (u"u'" + UnicodeString(std::get(l)) + u"', ") + : u"u\"" + std::get(l) + u"\", "; + } + for (const auto &l : symbols.getLookupTrace()) { + actual += std::holds_alternative(l) + ? (u"u'" + UnicodeString(std::get(l)) + u"', ") + : u"u\"" + std::get(l) + u"\", "; + } + errln(u"Unexpected sequence of lookups:\nExpected : " + expected + "\nActual : " + actual); + } } // Test what happens when we define syntax characters as symbols. It is an extraordinarily bad idea // to rely on this behaviour, but since it has been around since ICU 2.8, we probably should not @@ -1955,8 +2039,8 @@ void UnicodeSetTest::TestLookupSymbolTable() { symbols.add(u'{', UnicodeSet(u"[{leftCurlyBracket}]", errorCode)); symbols.add(u'}', UnicodeSet(u"[{rightCurlyBracket}]", errorCode)); symbols.add(u'$', UnicodeSet(u"[{dollarSign}]", errorCode)); - for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : - std::vector{ + for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern, + expectedLookups, variables] : std::vector{ {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"}, {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"}, // The hyphen no longer works as set difference. @@ -2002,8 +2086,8 @@ void UnicodeSetTest::TestLookupSymbolTable() { // If ] is defined as a symbol, everything breaks except a lone symbol or property-query, and the // constructor returns an error but not an empty set. Don’t do that. 
symbols.add(u']', UnicodeSet(u"[{rightSquareBracket}]", errorCode)); - for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : - std::vector{ + for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern, + expectedLookups, variables] : std::vector{ {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"}, {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"}, }) { From ef59acb8ef05a30e4b4e3396b59d53f413ca58e0 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 18 Aug 2025 18:11:33 +0200 Subject: [PATCH 34/56] ICU-23179 Test the exact sequence of lookups --- icu4c/source/test/intltest/usettest.cpp | 140 +++++++++++++++++++----- 1 file changed, 114 insertions(+), 26 deletions(-) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index a5c249d4da37..6b5fba510286 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -29,9 +30,11 @@ #include "unicode/symtable.h" #include "unicode/utf8.h" #include "unicode/utf16.h" +#include "unicode/utfiterator.h" #include "unicode/uversion.h" #include "cmemory.h" #include "hash.h" +#include #define TEST_ASSERT_SUCCESS(status) UPRV_BLOCK_MACRO_BEGIN { \ if (U_FAILURE(status)) { \ @@ -1880,53 +1883,122 @@ void UnicodeSetTest::TestSymbolTable() { void UnicodeSetTest::TestLookupSymbolTable() { UErrorCode errorCode = U_ZERO_ERROR; + // We let `variables` be empty by default in the test cases below. 
+#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" + struct TestCase { + struct Variable { + std::u16string_view name; + std::u16string_view value; + }; + std::u16string_view expression; + UErrorCode expectedErrorCode; + std::u16string_view expectedPattern; + std::u16string_view expectedRegeneratedPattern; + // Hyrum’s law at work: Some users (RBBI) depend on the sequencing of `lookup` and + // `lookupMatcher` calls, so we test that. + std::vector> expectedLookups; + // Variables for `lookup`. + std::vector variables; + }; class TestSymbolTable : public SymbolTable { public: - const UnicodeString *lookup(const UnicodeString &) const override { - return nullptr; + const UnicodeString *lookup(const UnicodeString &name) const override { + auto it = variables_.find(name); + lookupTrace_.push_back(name); + return it == variables_.end() ? nullptr : &it->second; } const UnicodeFunctor *lookupMatcher(UChar32 c) const override { + lookupTrace_.push_back(c); return symbols_.find(c) != symbols_.end() ? &symbols_.at(c) : nullptr; } - virtual UnicodeString parseReference(const UnicodeString &, ParsePosition &, - int32_t) const override { - return u""; + virtual UnicodeString parseReference(const UnicodeString &text, ParsePosition &pos, + int32_t limit) const override { + const auto limitedText = std::u16string_view(text).substr(pos.getIndex(), limit); + for (auto codeUnits : header::utfStringCodePoints(limitedText)) { + if (!u_isIDPart(codeUnits.codePoint())) { + pos.setIndex(pos.getIndex() + (codeUnits.begin() - limitedText.begin())); + // TODO(egg): In C++20, this could use the two-iterator constructor of + // std::u16string_view. 
+ return limitedText.substr(0, codeUnits.begin() - limitedText.begin()); + } + } + pos.setIndex(limit); + return limitedText; } void add(UChar32 c, UnicodeSet set) { symbols_[c] = set; } + void setVariables(const std::vector& variables) { + for (const auto &[name, value] : variables) { + variables_[name] = value; + } + } + + const std::vector>& getLookupTrace() const { + return lookupTrace_; + } + + void clearLookupTrace() { + lookupTrace_.clear(); + } + private: std::unordered_map symbols_; + std::map variables_; + mutable std::vector> lookupTrace_; }; TestSymbolTable symbols; symbols.add(u'0', UnicodeSet(u"[ a-z ]", errorCode)); symbols.add(u'1', UnicodeSet(u"[ b-c ]", errorCode)); symbols.add(u'2', UnicodeSet(u"[: Co :]", errorCode)); - struct TestCase { - std::u16string_view expression; - UErrorCode expectedErrorCode; - std::u16string_view expectedPattern; - std::u16string_view expectedRegeneratedPattern; - }; - for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : - std::vector{ - {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"}, - {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]"}, - {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]"}, + for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern, + expectedLookups, variables] : std::vector{ + {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]", {u'0'}}, + {u"[0-1]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]", {u'0', u'-', u'1', u']'}}, + {u"[!-0]", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'-', u'0'}}, + // A call to lookupMatcher with the first character of the content of a variable happens + // immediately after a corresponding call to lookup, although we may lookup the variable + // several times before we call lookupMatcher. 
+ {u"[0-$one]", + U_ZERO_ERROR, + u"[[a-z]-[bc]]", + u"[ad-z]", + {u'0', u'-', u"one", u"one", u'1', u']'}, + {{u"zero", u"0"}, {u"one", u"1"}}}, + {u"[$zero-$one]", + U_ZERO_ERROR, + u"[[a-z]-[bc]]", + u"[ad-z]", + {u"zero", u"zero", u"zero", u"zero", u'0', u'-', u"one", u"one", u'1', u']'}, + {{u"zero", u"0"}, {u"one", u"1"}}}, + // If the variable expands to multiple symbols, only the first one is sequenced right after + // the variable lookup. + {u"[$ten]", + U_ZERO_ERROR, + u"[[bc][a-z]]", + u"[a-z]", + {u"ten", u"ten", u"ten", u"ten", u'1', u'0', u']'}, + {{u"ten", u"10"}}}, // Substitution of lookupMatcher symbols takes place after unescaping. - {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]"}, + {uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'-', u'0'}}, // It does not take place in string literals. - {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]"}, - {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]"}, - {uR"([ 21 ])", U_ZERO_ERROR, u"[[: Co :][bc]]", - u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"}, - {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]"}, + {uR"([!-/{0}])", U_ZERO_ERROR, u"[!-0]", u"[!-0]", {u'!', u'-', u'/', u'{', u']'}}, + {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :]&[bc]]", u"[]", {u'2', u'&', u'1', u']'}}, + {uR"([ 21 ])", + U_ZERO_ERROR, + u"[[: Co :][bc]]", + u"[bc\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]", + {u'2', u'1', u']'}}, + {u"[ a-b 1 ]", U_ZERO_ERROR, u"[a-b[bc]]", u"[a-c]", {u'a', u'-', u'b', u'1', u']'}}, }) { + symbols.setVariables(variables); + symbols.clearLookupTrace(); UnicodeString actual; UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, USET_IGNORE_SPACE, &symbols, errorCode); @@ -1943,6 +2015,21 @@ void UnicodeSetTest::TestLookupSymbolTable() { u")\").complement().complement().toPattern() expected " + expectedRegeneratedPattern + ", got " + actual); } + if (symbols.getLookupTrace() != expectedLookups) { + UnicodeString expected; + 
UnicodeString actual; + for (const auto &l : expectedLookups) { + expected += std::holds_alternative(l) + ? (u"u'" + UnicodeString(std::get(l)) + u"', ") + : u"u\"" + std::get(l) + u"\", "; + } + for (const auto &l : symbols.getLookupTrace()) { + actual += std::holds_alternative(l) + ? (u"u'" + UnicodeString(std::get(l)) + u"', ") + : u"u\"" + std::get(l) + u"\", "; + } + errln(u"Unexpected sequence of lookups:\nExpected : " + expected + "\nActual : " + actual); + } } // Test what happens when we define syntax characters as symbols. It is an extraordinarily bad idea // to rely on this behaviour, but since it has been around since ICU 2.8, we probably should not @@ -1955,8 +2042,8 @@ void UnicodeSetTest::TestLookupSymbolTable() { symbols.add(u'{', UnicodeSet(u"[{leftCurlyBracket}]", errorCode)); symbols.add(u'}', UnicodeSet(u"[{rightCurlyBracket}]", errorCode)); symbols.add(u'$', UnicodeSet(u"[{dollarSign}]", errorCode)); - for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : - std::vector{ + for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern, + expectedLookups, variables] : std::vector{ {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"}, {u"0", U_ZERO_ERROR, u"[a-z]", u"[a-z]"}, // The hyphen no longer works as set difference. @@ -2002,8 +2089,8 @@ void UnicodeSetTest::TestLookupSymbolTable() { // If ] is defined as a symbol, everything breaks except a lone symbol or property-query, and the // constructor returns an error but not an empty set. Don’t do that. 
symbols.add(u']', UnicodeSet(u"[{rightSquareBracket}]", errorCode)); - for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern] : - std::vector{ + for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern, + expectedLookups, variables] : std::vector{ {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"}, {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"}, }) { @@ -2024,6 +2111,7 @@ void UnicodeSetTest::TestLookupSymbolTable() { ", got " + actual); } } +#pragma GCC diagnostic pop } void UnicodeSetTest::TestSurrogate() { From 770c9aa9f40a25235b988180ae543e1d15123a53 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 20 Aug 2025 16:06:34 +0200 Subject: [PATCH 35/56] Ignore warnings --- icu4c/source/test/intltest/usettest.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 5d1cf77247b7..6b5fba510286 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -1883,6 +1883,9 @@ void UnicodeSetTest::TestSymbolTable() { void UnicodeSetTest::TestLookupSymbolTable() { UErrorCode errorCode = U_ZERO_ERROR; + // We let `variables` be empty by default in the test cases below. 
+#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" struct TestCase { struct Variable { std::u16string_view name; @@ -2108,6 +2111,7 @@ void UnicodeSetTest::TestLookupSymbolTable() { ", got " + actual); } } +#pragma GCC diagnostic pop } void UnicodeSetTest::TestSurrogate() { From 110a54d78b0a0bd1d2528556a0606bf7aec6285f Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 21 Aug 2025 17:15:25 +0200 Subject: [PATCH 36/56] Abstract away the getPos/next/setPos/lookupMatcher dance --- icu4c/source/common/unicode/uniset.h | 26 +- icu4c/source/common/uniset_closure.cpp | 2 +- icu4c/source/common/uniset_props.cpp | 430 ++++++++++++++----------- 3 files changed, 255 insertions(+), 203 deletions(-) diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index 2d73df2fcdac..d805fd9e8156 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -1697,6 +1697,7 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { UErrorCode& status); void applyPattern(const UnicodeString &pattern, + const ParsePosition& parsePosition, RuleCharacterIterator &chars, const SymbolTable *symbols, UnicodeString &rebuiltPat, @@ -1709,18 +1710,16 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { // applied). They add to *this the elements of the set that the parsed construct represents. // https://www.unicode.org/reports/tr61/tr61-1.html#Set-Operations. 
- void parseUnicodeSet(const UnicodeString &pattern, - RuleCharacterIterator &chars, - const SymbolTable *symbols, + class Lexer; + + void parseUnicodeSet(Lexer &lexer, UnicodeString &rebuiltPat, uint32_t options, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, UErrorCode &ec); - void parseUnion(const UnicodeString &pattern, - RuleCharacterIterator &chars, - const SymbolTable *symbols, + void parseUnion(Lexer &lexer, UnicodeString &rebuiltPat, uint32_t options, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), @@ -1728,30 +1727,23 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { bool &containsRestrictions, UErrorCode &ec); - void parseTerm(const UnicodeString &pattern, - RuleCharacterIterator &chars, - const SymbolTable *symbols, + void parseTerm(Lexer &lexer, UnicodeString &rebuiltPat, uint32_t options, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, - bool &containsRestriction, + bool &containsRestrictions, UErrorCode &ec); - void parseRestriction(const UnicodeString &pattern, - RuleCharacterIterator &chars, - const SymbolTable *symbols, + void parseRestriction(Lexer &lexer, UnicodeString &rebuiltPat, uint32_t options, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, UErrorCode &ec); - void parseElements(const UnicodeString &pattern, - RuleCharacterIterator &chars, - const SymbolTable *symbols, + void parseElements(Lexer &lexer, UnicodeString &rebuiltPat, - uint32_t options, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, UErrorCode &ec); diff --git a/icu4c/source/common/uniset_closure.cpp b/icu4c/source/common/uniset_closure.cpp index 05e9b0a37e04..2cd3e01ee324 100644 --- a/icu4c/source/common/uniset_closure.cpp +++ b/icu4c/source/common/uniset_closure.cpp @@ -101,7 +101,7 @@ UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, // _applyPattern calls add() etc., which set pat to empty. 
UnicodeString rebuiltPat; RuleCharacterIterator chars(pattern, symbols, pos); - applyPattern(pattern, chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status); + applyPattern(pattern, pos, chars, symbols, rebuiltPat, options, &UnicodeSet::closeOver, status); if (U_FAILURE(status)) return *this; if (chars.inVariable()) { // syntaxError(chars, "Extra chars in variable value"); diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 8c4b13f18e71..46401a273b4e 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -44,6 +44,7 @@ #include "umutex.h" #include "uassert.h" #include "hash.h" +#include U_NAMESPACE_USE @@ -196,7 +197,7 @@ UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern, // _applyPattern calls add() etc., which set pat to empty. UnicodeString rebuiltPat; RuleCharacterIterator chars(pattern, symbols, pos); - applyPattern(pattern, chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, status); + applyPattern(pattern, pos, chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, status); if (U_FAILURE(status)) return; if (chars.inVariable()) { // syntaxError(chars, "Extra chars in variable value"); @@ -220,42 +221,164 @@ UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { // Implementation: Pattern parsing //---------------------------------------------------------------- -namespace { +class UnicodeSet::Lexer { + public: + Lexer(const UnicodeString &pattern, + const ParsePosition &parsePosition, + RuleCharacterIterator &chars, + uint32_t unicodeSetOptions, + const SymbolTable *const symbols) + : pattern_(pattern), parsePosition_(parsePosition), chars_(chars), + charsOptions_(RuleCharacterIterator::PARSE_VARIABLES | RuleCharacterIterator::PARSE_ESCAPES | + ((unicodeSetOptions & USET_IGNORE_SPACE) != 0 + ? 
RuleCharacterIterator::SKIP_WHITESPACE + : 0)), + symbols_(symbols) {} + + class Lookahead { + public: + bool isUnescaped(UChar32 codePoint) const { + return !escaped_ && codePoint_ == codePoint; + } -/** - * A small all-inline class to manage a UnicodeSet pointer. Add - * operator->() etc. as needed. - */ -class UnicodeSetPointer { - UnicodeSet* p; -public: - inline UnicodeSetPointer() : p(nullptr) {} - inline ~UnicodeSetPointer() { delete p; } - inline UnicodeSet* pointer() { return p; } - inline UBool allocate() { - if (p == nullptr) { - p = new UnicodeSet(); + bool isUnescapedNotStandIn(UChar32 codePoint) { + return isUnescaped(codePoint) && standIn() == nullptr; + } + + void moveAfter() { + lexer_.chars_.setPos(after_); + lexer_.ahead_.reset(); + } + + bool acceptUnescapedNotStandIn(UChar32 codePoint) { + if (isUnescapedNotStandIn(codePoint)) { + moveAfter(); + return true; + } + return false; + } + + bool acceptUnescaped(UChar32 codePoint) { + if (isUnescaped(codePoint)) { + moveAfter(); + return true; + } + return false; + } + + UChar32 codePoint(UErrorCode &errorCode) const { + if (!U_FAILURE(errorCode)) { + errorCode = errorCode; + } + return codePoint_; + } + + bool escaped() const { + return escaped_; } - return p != nullptr; + + const UnicodeSet *standIn() { + if (!standIn_.has_value()) { + if (lexer_.symbols_ == nullptr) { + standIn_ = nullptr; + } else { + standIn_ = + dynamic_cast(lexer_.symbols_->lookupMatcher(codePoint_)); + } + } + return *standIn_; + }; + + // Some parts of the grammar need two tokens of lookahead. The second lookahead is not cached. 
+ Lookahead oneMore() { + return oneMore(lexer_.charsOptions_); + } + + Lookahead oneMore(int32_t charsOptions) { + RuleCharacterIterator::Pos before; + lexer_.chars_.getPos(before); + lexer_.chars_.setPos(after_); + auto const result = Lookahead(lexer_, lexer_.chars_, charsOptions); + lexer_.chars_.setPos(before); + return result; + } + + Lookahead(Lexer &lexer, RuleCharacterIterator &chars, int32_t charsOptions) + : lexer_(lexer) { + RuleCharacterIterator::Pos before; + chars.getPos(before); + codePoint_ = chars.next(charsOptions, escaped_, errorCode_); + chars.getPos(after_); + chars.setPos(before); + } + + private: + Lexer &lexer_; + RuleCharacterIterator::Pos after_; + UErrorCode errorCode_; + UChar32 codePoint_; + UBool escaped_; + // `std::nullopt` if we have not yet called `lookupMatcher`, otherwise the result of + // `lookupMatcher` (which may be `nullptr`). + std::optional standIn_; + + friend class Lexer; + }; + + UnicodeString getPositionForDebugging() const { + return pattern_.tempSubString(0, parsePosition_.getIndex()) + u"☞" + + pattern_.tempSubString(parsePosition_.getIndex(), 60); } -}; -constexpr int32_t MAX_DEPTH = 100; + Lookahead &lookahead() { + if (!ahead_.has_value()) { + ahead_.emplace(*this, chars_, charsOptions_); + } + return *ahead_; + } -constexpr uint32_t charsOptions(const uint32_t unicodeSetOptions) { - int32_t opts = RuleCharacterIterator::PARSE_VARIABLES | RuleCharacterIterator::PARSE_ESCAPES; - if ((unicodeSetOptions & USET_IGNORE_SPACE) != 0) { - opts |= RuleCharacterIterator::SKIP_WHITESPACE; + bool resemblesPropertyPattern() { + Lookahead first = + Lookahead(*this, chars_, charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES); + if (first.codePoint_ != u'[' && first.codePoint_ != u'\\') { + return false; + } + Lookahead second = first.oneMore(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | + RuleCharacterIterator::SKIP_WHITESPACE)); + return (first.codePoint_ == u'[' && second.codePoint_ == ':') || + 
(first.codePoint_ == u'\\' && + (second.codePoint_ == u'p' || second.codePoint_ == u'P' || second.codePoint_ == u'N')); } - return opts; -} -const UnicodeSet *getMatcherSymbol(const SymbolTable *const symbols, const UChar32 c) { - if (symbols == nullptr) { - return nullptr; + // For use in functions that take the `RuleCharacterIterator` directly; clears the lookahead cache so + // that any advancement of the `RuleCharacterIterator` is taken into account by subsequent calls to + // `lookahead`. The resulting `RuleCharacterIterator` must not be used once `lookahead` has been + // called. + RuleCharacterIterator &getCharacterIterator() { + ahead_.reset(); + return chars_; } - return dynamic_cast(symbols->lookupMatcher(c)); -} + + int32_t charsOptions() { + return charsOptions_; + } + + bool atEnd() const { + return chars_.atEnd(); + } + + private: + const UnicodeString &pattern_; + const ParsePosition &parsePosition_; + RuleCharacterIterator &chars_; + const int32_t charsOptions_; + const SymbolTable *const symbols_; + std::optional ahead_; +}; + +namespace { + +constexpr int32_t MAX_DEPTH = 100; #if 0 #define U_UNICODESET_TRACE(...) 
\ @@ -314,24 +437,16 @@ const UnicodeSet *getMatcherSymbol(const SymbolTable *const symbols, const UChar return; \ } \ } while (false) -#define U_UNICODESET_RETURN_WITH_PARSE_ERROR(expected, actual, chars, ec) \ +#define U_UNICODESET_RETURN_WITH_PARSE_ERROR(expected, actual, lexer, ec) \ do { \ - constexpr std::string_view functionName = __func__; \ - static_assert(functionName.substr(0, 5) == "parse");\ + constexpr std::string_view functionName = __func__; \ + static_assert(functionName.substr(0, 5) == "parse"); \ std::string actualUTF8; \ - UnicodeString ahead; \ - std::string aheadUTF8; \ - std::string behindUTF8; \ - (chars).lookahead(ahead); \ - printf("*** Expected %s, got '%s' %s☜%s\n", (expected), \ + std::string contextUTF8; \ + printf("*** Expected %s, got '%s' %s\n", (expected), \ UnicodeString(actual).toUTF8String(actualUTF8).c_str(), \ - pattern.tempSubString(0, pattern.length() - ahead.length()) \ - .toUTF8String(behindUTF8) \ - .c_str(), \ - pattern.tempSubString(pattern.length() - ahead.length(), 60) \ - .toUTF8String(aheadUTF8) \ - .c_str()); \ - printf("--- in %s l. %d\n", __func__ + 5, __LINE__); \ + lexer.getPositionForDebugging().toUTF8String(contextUTF8).c_str()); \ + printf("--- in %s l. %d\n", __func__ + 5, __LINE__); \ (ec) = U_MALFORMED_SET; \ return; \ } while (false) @@ -342,6 +457,7 @@ const UnicodeSet *getMatcherSymbol(const SymbolTable *const symbols, const UChar * Parse the pattern from the given RuleCharacterIterator. The * iterator is advanced over the parsed pattern. * @param pattern The pattern, only used by debug traces. + * @param parsePosition The ParsePosition underlying chars, only used by debug traces. * @param chars iterator over the pattern characters. 
Upon return * it will be advanced to the first character after the parsed * pattern, or the end of the iteration if all characters are @@ -355,6 +471,7 @@ const UnicodeSet *getMatcherSymbol(const SymbolTable *const symbols, const UChar */ void UnicodeSet::applyPattern(const UnicodeString &pattern, + const ParsePosition &parsePosition, RuleCharacterIterator &chars, const SymbolTable *symbols, UnicodeString &rebuiltPat, @@ -362,22 +479,22 @@ void UnicodeSet::applyPattern(const UnicodeString &pattern, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), UErrorCode &ec) { if (U_FAILURE(ec)) return; - parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, /*depth=*/0, ec); + Lexer lexer(pattern, parsePosition, chars, options, symbols); + parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, /*depth=*/0, ec); } -void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern, - RuleCharacterIterator &chars, - const SymbolTable* symbols, +void UnicodeSet::parseUnicodeSet(Lexer &lexer, UnicodeString& rebuiltPat, uint32_t options, UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), - int32_t depth, UErrorCode &ec) { + int32_t depth, + UErrorCode &ec) { clear(); U_UNICODESET_TRACE(); if (depth > MAX_DEPTH) { U_UNICODESET_RETURN_WITH_PARSE_ERROR(("depth <= " + std::to_string(MAX_DEPTH)).c_str(), - ("depth = " + std::to_string(depth)).c_str(), chars, ec); + ("depth = " + std::to_string(depth)).c_str(), lexer, ec); } bool isComplement = false; @@ -388,17 +505,16 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern, bool preserveSyntaxInPattern = false; // A pattern that preserves the original syntax but strips spaces, normalizes escaping, etc. 
UnicodeString prettyPrintedPattern; - if (resemblesPropertyPattern(chars, charsOptions(options))) { + if (lexer.resemblesPropertyPattern()) { // UnicodeSet ::= property-query | named-element U_UNICODESET_TRACE("property-query | named-element"); - chars.skipIgnored(charsOptions(options)); + lexer.getCharacterIterator().skipIgnored(lexer.charsOptions()); UnicodeSet propertyQuery; - propertyQuery.applyPropertyPattern(chars, prettyPrintedPattern, ec); + propertyQuery.applyPropertyPattern(lexer.getCharacterIterator(), prettyPrintedPattern, ec); U_UNICODESET_RETURN_IF_ERROR(ec); addAll(propertyQuery); preserveSyntaxInPattern = true; } else { - UBool escaped = false; // TODO(egg): In PD UTS 61, add ^ to set-operator, remove [^. // UnicodeSet ::= [ Union ] // | Complement ::= [ ^ Union ] @@ -407,37 +523,29 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern, // Where a MatcherSymbol may be a character or an escape. // Strings that would match MatcherSymbol effectively get removed from // all other terminals of the grammar, except [. 
- UChar32 c = chars.next(charsOptions(options), escaped, ec); - U_UNICODESET_RETURN_IF_ERROR(ec); - if (!escaped && c == u'[') { + if (lexer.lookahead().acceptUnescaped(u'[')) { prettyPrintedPattern.append(u'['); - RuleCharacterIterator::Pos afterBracket; - chars.getPos(afterBracket); - c = chars.next(charsOptions(options), escaped, ec); - U_UNICODESET_RETURN_IF_ERROR(ec); - if (!escaped && c == u'^') { + if (lexer.lookahead().acceptUnescaped(u'^')) { prettyPrintedPattern.append(u'^'); isComplement = true; - } else { - chars.setPos(afterBracket); } - parseUnion(pattern, chars, symbols, prettyPrintedPattern, options, caseClosure, depth, + parseUnion(lexer, prettyPrintedPattern, options, caseClosure, depth, /*containsRestrictions=*/preserveSyntaxInPattern, ec); U_UNICODESET_RETURN_IF_ERROR(ec); - c = chars.next(charsOptions(options), escaped, ec); - U_UNICODESET_RETURN_IF_ERROR(ec); - if (escaped || c != u']') { - U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", c, chars, ec); + if (!lexer.lookahead().acceptUnescaped(u']')) { + U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", lexer.lookahead().codePoint(ec), lexer, ec); } prettyPrintedPattern.append(u']'); } else { - const UnicodeSet *set = getMatcherSymbol(symbols, c); + const UnicodeSet *set = lexer.lookahead().standIn(); if (set != nullptr) { *this = *set; this->_toPattern(rebuiltPat, /*escapeUnprintable=*/false); + lexer.lookahead().moveAfter(); return; } - U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)", c, chars, ec); + U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)", + lexer.lookahead().codePoint(ec), lexer, ec); } } @@ -460,9 +568,7 @@ void UnicodeSet::parseUnicodeSet(const UnicodeString &pattern, } } -void UnicodeSet::parseUnion(const UnicodeString &pattern, - RuleCharacterIterator &chars, - const SymbolTable *symbols, +void UnicodeSet::parseUnion(Lexer &lexer, UnicodeString &rebuiltPat, uint32_t options, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), @@ -470,64 +576,47 @@ void 
UnicodeSet::parseUnion(const UnicodeString &pattern, bool &containsRestrictions, UErrorCode &ec) { U_UNICODESET_TRACE(); - UBool escaped = false; - RuleCharacterIterator::Pos position; - chars.getPos(position); // Union ::= Terms // | UnescapedHyphenMinus Terms // | Terms UnescapedHyphenMinus // | UnescapedHyphenMinus Terms UnescapedHyphenMinus // Terms ::= "" // | Terms Term - UChar32 c = chars.next(charsOptions(options), escaped, ec); - U_UNICODESET_RETURN_IF_ERROR(ec); - if (!escaped && c == u'-' && getMatcherSymbol(symbols, c)) { + if (lexer.lookahead().acceptUnescapedNotStandIn(u'-')) { add(u'-'); // When we otherwise preserve the syntax, we escape an initial UnescapedHyphenMinus, but not a // final one, for consistency with older ICU behaviour. rebuiltPat.append(u"\\-"); - } else { - chars.setPos(position); } - while (!chars.atEnd()) { - chars.getPos(position); - c = chars.next(charsOptions(options), escaped, ec); - U_UNICODESET_RETURN_IF_ERROR(ec); - if (getMatcherSymbol(symbols, c) == nullptr) { - if (!escaped && c == u'-') { - // We can be here on the first iteration: [--] is allowed by the - // grammar and by the old parser. - rebuiltPat.append(u'-'); - add(u'-'); + while (!lexer.atEnd()) { + if (lexer.lookahead().acceptUnescapedNotStandIn(u'-')) { + // We can be here on the first iteration: [--] is allowed by the + // grammar and by the old parser. + rebuiltPat.append(u'-'); + add(u'-'); + return; + } else if (lexer.lookahead().isUnescapedNotStandIn(u'$')) { + Lexer::Lookahead afterDollar = lexer.lookahead().oneMore(); + if (afterDollar.isUnescaped(u']')) { + // ICU extensions: A $ is allowed as a literal-element. + // A Term at the end of a Union consisting of a single $ is an anchor. + rebuiltPat.append(u'$'); + // Consume the dollar. 
+ lexer.lookahead().moveAfter(); + add(U_ETHER); + containsRestrictions = true; return; - } else if (!escaped && c == u'$') { - RuleCharacterIterator::Pos afterDollar; - chars.getPos(afterDollar); - c = chars.next(charsOptions(options), escaped, ec); - if (!escaped && c == u']') { - // ICU extensions: A $ is allowed as a literal-element. - // A Term at the end of a Union consisting of a single $ is an anchor. - rebuiltPat.append(u'$'); - chars.setPos(afterDollar); - add(U_ETHER); - containsRestrictions = true; - return; - } } } - chars.setPos(position); - if (!escaped && c == ']' && getMatcherSymbol(symbols, c) == nullptr) { + if (lexer.lookahead().isUnescapedNotStandIn(u']')) { return; } - parseTerm(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, containsRestrictions, - ec); + parseTerm(lexer, rebuiltPat, options, caseClosure, depth, containsRestrictions, ec); U_UNICODESET_RETURN_IF_ERROR(ec); } } -void UnicodeSet::parseTerm(const UnicodeString &pattern, - RuleCharacterIterator &chars, - const SymbolTable *symbols, +void UnicodeSet::parseTerm(Lexer &lexer, UnicodeString &rebuiltPat, uint32_t options, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), @@ -535,40 +624,32 @@ void UnicodeSet::parseTerm(const UnicodeString &pattern, bool &containsRestriction, UErrorCode &ec) { U_UNICODESET_TRACE(); - UBool escaped = false; - RuleCharacterIterator::Pos termStart; - chars.getPos(termStart); // Term ::= Elements // | Restriction - const UChar32 ahead = chars.next(charsOptions(options), escaped, ec); - chars.setPos(termStart); - if (getMatcherSymbol(symbols, ahead) != nullptr || !escaped && ahead == '[' || - resemblesPropertyPattern(chars, charsOptions(options))) { + if (lexer.lookahead().standIn() != nullptr || lexer.lookahead().isUnescaped('[') || + lexer.resemblesPropertyPattern()) { containsRestriction = true; - parseRestriction(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec); + parseRestriction(lexer, rebuiltPat, 
options, caseClosure, depth, ec); U_UNICODESET_RETURN_IF_ERROR(ec); } else { - parseElements(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth, ec); + parseElements(lexer, rebuiltPat, caseClosure, depth, ec); U_UNICODESET_RETURN_IF_ERROR(ec); } } -void UnicodeSet::parseRestriction(const UnicodeString &pattern, - RuleCharacterIterator &chars, - const SymbolTable *symbols, +void UnicodeSet::parseRestriction(Lexer &lexer, UnicodeString &rebuiltPat, uint32_t options, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), - int32_t depth, UErrorCode &ec) { + int32_t depth, + UErrorCode &ec) { U_UNICODESET_TRACE(); - UBool escaped = false; // Restriction ::= UnicodeSet // | Intersection ::= Restriction & UnicodeSet // | Difference ::= Restriction - UnicodeSet // Start by parsing the first UnicodeSet. UnicodeSet leftHandSide; - leftHandSide.parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, depth + 1, - ec); + leftHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec); addAll(leftHandSide); U_UNICODESET_RETURN_IF_ERROR(ec); // Now keep looking for an operator that would continue the Restriction. @@ -576,55 +657,41 @@ void UnicodeSet::parseRestriction(const UnicodeString &pattern, // return. for (;;) { RuleCharacterIterator::Pos beforeOperator; - chars.getPos(beforeOperator); - const UChar32 op = chars.next(charsOptions(options), escaped, ec); - U_UNICODESET_RETURN_IF_ERROR(ec); - if (getMatcherSymbol(symbols, op)) { + if (lexer.lookahead().standIn() != nullptr) { // Not an operator, end of the Restriction. 
- chars.setPos(beforeOperator); return; } - if (!escaped && op == u'&') { + if (lexer.lookahead().acceptUnescaped(u'&')) { // Intersection ::= Restriction & UnicodeSet rebuiltPat.append(u'&'); UnicodeSet rightHandSide; - rightHandSide.parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, - depth + 1, ec); + rightHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec); U_UNICODESET_RETURN_IF_ERROR(ec); retainAll(rightHandSide); - } else if (!escaped && op == u'-') { + } else if (lexer.lookahead().isUnescaped(u'-')) { // Here the grammar requires two tokens of lookahead to figure out whether the - the operator // of a Difference or an UnescapedHyphenMinus in the enclosing Union. - RuleCharacterIterator::Pos afterOperator; - chars.getPos(afterOperator); - const UChar32 ahead = chars.next(charsOptions(options), escaped, ec); - U_UNICODESET_RETURN_IF_ERROR(ec); - if (!escaped && ahead == u']') { + if (lexer.lookahead().oneMore().isUnescaped(u']')) { // The operator is actually an UnescapedHyphenMinus; terminate the Restriction before it. - chars.setPos(beforeOperator); return; } - chars.setPos(afterOperator); + // Consume the hyphen-minus. + lexer.lookahead().moveAfter(); // Difference ::= Restriction - UnicodeSet rebuiltPat.append(u'-'); UnicodeSet rightHandSide; - rightHandSide.parseUnicodeSet(pattern, chars, symbols, rebuiltPat, options, caseClosure, - depth + 1, ec); + rightHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec); U_UNICODESET_RETURN_IF_ERROR(ec); removeAll(rightHandSide); } else { // Not an operator, end of the Restriction. 
- chars.setPos(beforeOperator); return; } } } -void UnicodeSet::parseElements(const UnicodeString &pattern, - RuleCharacterIterator &chars, - const SymbolTable *symbols, +void UnicodeSet::parseElements(Lexer &lexer, UnicodeString &rebuiltPat, - uint32_t options, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, UErrorCode &ec) { @@ -636,34 +703,33 @@ void UnicodeSet::parseElements(const UnicodeString &pattern, // | escaped-element // Element ::= RangeElement // | string-literal - UBool escaped = false; - const UChar32 first = chars.next(charsOptions(options), escaped, ec); + const UChar32 first = lexer.lookahead().codePoint(ec); U_UNICODESET_RETURN_IF_ERROR(ec); - if (!escaped) { + if (!lexer.lookahead().escaped()) { switch (first) { case u'-': case u'&': case u'[': case u']': case u'^': - U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement | string-literal", first, chars, ec); - // Unescaped '$' + U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement | string-literal", first, lexer, ec); case u'{': { + lexer.lookahead().moveAfter(); rebuiltPat.append(u'{'); UnicodeString string; - UChar32 c; - while (!chars.atEnd()) { - c = chars.next(charsOptions(options), escaped, ec); - U_UNICODESET_RETURN_IF_ERROR(ec); - if (!escaped && c == u'}') { + while (!lexer.atEnd()) { + if (lexer.lookahead().acceptUnescaped('}')) { rebuiltPat.append(u'}'); add(string); return; } + const UChar32 c = lexer.lookahead().codePoint(ec); + U_UNICODESET_RETURN_IF_ERROR(ec); + lexer.lookahead().moveAfter(); _appendToPat(rebuiltPat, c, /*escapeUnprintable=*/false); string.append(c); } - U_UNICODESET_RETURN_WITH_PARSE_ERROR("}", RuleCharacterIterator::DONE, chars, ec); + U_UNICODESET_RETURN_WITH_PARSE_ERROR("}", RuleCharacterIterator::DONE, lexer, ec); } case u'}': case u'$': @@ -672,35 +738,32 @@ void UnicodeSet::parseElements(const UnicodeString &pattern, break; } } + lexer.lookahead().moveAfter(); _appendToPat(rebuiltPat, first, /*escapeUnprintable=*/false); 
RuleCharacterIterator::Pos beforeOperator; - chars.getPos(beforeOperator); - const UChar32 op = chars.next(charsOptions(options), escaped, ec); - U_UNICODESET_RETURN_IF_ERROR(ec); - if (escaped || op != u'-' || getMatcherSymbol(symbols, op) != nullptr) { + if (!lexer.lookahead().isUnescapedNotStandIn(u'-')) { // No operator, // Elements ::= Element - chars.setPos(beforeOperator); add(first); return; } // Here the grammar requires two tokens of lookahead to figure out whether the - the operator // of a Range or an UnescapedHyphenMinus in the enclosing Union. - const UChar32 ahead = chars.next(charsOptions(options), escaped, ec); - U_UNICODESET_RETURN_IF_ERROR(ec); - if (!escaped && ahead == u']') { + if (lexer.lookahead().oneMore().isUnescaped(u']')) { // The operator is actually an UnescapedHyphenMinus; terminate the Elements before it. - chars.setPos(beforeOperator); add(first); return; } + // Consume the hyphen-minus. + lexer.lookahead().moveAfter(); // Elements ::= Range ::= RangeElement - RangeElement rebuiltPat.append(u'-'); - const UChar32 last = ahead; - if (getMatcherSymbol(symbols, last) != nullptr) { - U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, chars, ec); + const UChar32 last = lexer.lookahead().codePoint(ec); + U_UNICODESET_RETURN_IF_ERROR(ec); + if (lexer.lookahead().standIn() != nullptr) { + U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, lexer, ec); } - if (!escaped) { + if (!lexer.lookahead().escaped()) { switch (last) { case u'-': case u'&': @@ -708,17 +771,13 @@ void UnicodeSet::parseElements(const UnicodeString &pattern, case u']': case u'^': case u'{': - U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, chars, ec); + U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, lexer, ec); case u'$': { // Disallowed by UTS #61, but historically accepted by ICU except at the end of a Union. // This is an extension. 
- RuleCharacterIterator::Pos afterDollar; - chars.getPos(afterDollar); - UChar32 c = chars.next(charsOptions(options), escaped, ec); - chars.setPos(afterDollar); - if (!escaped && c == u']') { - U_UNICODESET_RETURN_WITH_PARSE_ERROR("Term after Range ending in unescaped $", c, chars, - ec); + if (lexer.lookahead().oneMore().isUnescaped(u']')) { + U_UNICODESET_RETURN_WITH_PARSE_ERROR("Term after Range ending in unescaped $", u']', + lexer, ec); } break; } @@ -728,10 +787,11 @@ void UnicodeSet::parseElements(const UnicodeString &pattern, break; } } + lexer.lookahead().moveAfter(); _appendToPat(rebuiltPat, last, /*escapeUnprintable=*/false); if (last <= first) { - U_UNICODESET_RETURN_WITH_PARSE_ERROR("first < last in Range", - UnicodeString(last) + u"-" + UnicodeString(first), chars, ec); + U_UNICODESET_RETURN_WITH_PARSE_ERROR( + "first < last in Range", UnicodeString(last) + u"-" + UnicodeString(first), lexer, ec); } add(first, last); return; From 34bc05d7bbe6751a49816c8949538969a5c83d95 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 21 Aug 2025 17:16:52 +0200 Subject: [PATCH 37/56] Drop some traces --- icu4c/source/common/uniset_props.cpp | 50 ---------------------------- 1 file changed, 50 deletions(-) diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 46401a273b4e..0652194441fa 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -380,50 +380,6 @@ namespace { constexpr int32_t MAX_DEPTH = 100; -#if 0 -#define U_UNICODESET_TRACE(...) 
\ - struct UnicodeSetParserTrace { \ - char const *const symbol_; \ - int const depth_; \ - const UnicodeSet *const that_; \ - UnicodeSetParserTrace(char const *symbol, int depth, const UnicodeSet *that) \ - : symbol_(symbol), depth_(depth), that_(that) {} \ - ~UnicodeSetParserTrace() { \ - UnicodeString ahead; \ - std::string aheadUTF8; \ - printf("%s%s\n", std::string(depth_ * 4, ' ').c_str(), symbol_); \ - printf("%s\n", (UnicodeSet(*that_) \ - .complement() \ - .complement() \ - .toPattern(ahead) \ - .toUTF8String(aheadUTF8) \ - .c_str(),"")); \ - } \ - }; \ - UnicodeSetParserTrace unicodeSetParserTrace( \ - std::string_view("" __VA_ARGS__).empty() ? __func__ + 5 : ("" __VA_ARGS__), depth, this); \ - do { \ - char const *symbol = ("" __VA_ARGS__); \ - if (std::string_view(symbol).empty()) { \ - symbol = __func__ + 5; \ - } \ - UnicodeString ahead; \ - std::string aheadUTF8; \ - printf("%s%s > %s\n", std::string(depth * 4, ' ').c_str(), symbol, \ - (chars).lookahead(ahead, 60).toUTF8String(aheadUTF8).c_str()); \ - printf("%s\n", (UnicodeSet(*this) \ - .complement() \ - .complement() \ - .toPattern(ahead) \ - .toUTF8String(aheadUTF8) \ - .c_str(),"")); \ - } while (false) -#else -#define U_UNICODESET_TRACE(...) 
\ - do { \ - } while (false) -#endif - #define U_UNICODESET_RETURN_IF_ERROR(ec) \ do { \ constexpr std::string_view functionName = __func__;\ @@ -490,7 +446,6 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer, int32_t depth, UErrorCode &ec) { clear(); - U_UNICODESET_TRACE(); if (depth > MAX_DEPTH) { U_UNICODESET_RETURN_WITH_PARSE_ERROR(("depth <= " + std::to_string(MAX_DEPTH)).c_str(), @@ -507,7 +462,6 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer, UnicodeString prettyPrintedPattern; if (lexer.resemblesPropertyPattern()) { // UnicodeSet ::= property-query | named-element - U_UNICODESET_TRACE("property-query | named-element"); lexer.getCharacterIterator().skipIgnored(lexer.charsOptions()); UnicodeSet propertyQuery; propertyQuery.applyPropertyPattern(lexer.getCharacterIterator(), prettyPrintedPattern, ec); @@ -575,7 +529,6 @@ void UnicodeSet::parseUnion(Lexer &lexer, int32_t depth, bool &containsRestrictions, UErrorCode &ec) { - U_UNICODESET_TRACE(); // Union ::= Terms // | UnescapedHyphenMinus Terms // | Terms UnescapedHyphenMinus @@ -623,7 +576,6 @@ void UnicodeSet::parseTerm(Lexer &lexer, int32_t depth, bool &containsRestriction, UErrorCode &ec) { - U_UNICODESET_TRACE(); // Term ::= Elements // | Restriction if (lexer.lookahead().standIn() != nullptr || lexer.lookahead().isUnescaped('[') || @@ -643,7 +595,6 @@ void UnicodeSet::parseRestriction(Lexer &lexer, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, UErrorCode &ec) { - U_UNICODESET_TRACE(); // Restriction ::= UnicodeSet // | Intersection ::= Restriction & UnicodeSet // | Difference ::= Restriction - UnicodeSet @@ -695,7 +646,6 @@ void UnicodeSet::parseElements(Lexer &lexer, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, UErrorCode &ec) { - U_UNICODESET_TRACE(); // Elements ::= Element // | Range // Range ::= RangeElement - RangeElement From 5c44163384ac9fc889dc70383864a4a864a741f5 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 21 Aug 2025 17:19:19 
+0200 Subject: [PATCH 38/56] ifdef out the remaining traces --- icu4c/source/common/uniset_props.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 0652194441fa..7ec0ddb58944 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -380,6 +380,10 @@ namespace { constexpr int32_t MAX_DEPTH = 100; +#define U_DEBUGGING_UNICODESET_PARSING 0 + +#if U_DEBUGGING_UNICODESET_PARSING + #define U_UNICODESET_RETURN_IF_ERROR(ec) \ do { \ constexpr std::string_view functionName = __func__;\ @@ -407,6 +411,22 @@ constexpr int32_t MAX_DEPTH = 100; return; \ } while (false) +#else + +#define U_UNICODESET_RETURN_IF_ERROR(ec) \ + do { \ + if (U_FAILURE(ec)) { \ + return; \ + } \ + } while (false) +#define U_UNICODESET_RETURN_WITH_PARSE_ERROR(expected, actual, lexer, ec) \ + do { \ + (ec) = U_MALFORMED_SET; \ + return; \ + } while (false) + +#endif + } // namespace /** From da4b123626e7f158b47f7b60296140e30ecf758c Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 21 Aug 2025 17:28:11 +0200 Subject: [PATCH 39/56] Remove the old code --- icu4c/source/common/uniset_props.cpp | 359 --------------------------- 1 file changed, 359 deletions(-) diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 7ec0ddb58944..3d9774473c76 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -767,365 +767,6 @@ void UnicodeSet::parseElements(Lexer &lexer, return; } - #if 0 - while (mode != 2 && !chars.atEnd()) { - U_ASSERT((lastItem == 0 && op == 0) || - (lastItem == 1 && (op == 0 || op == u'-')) || - (lastItem == 2 && (op == 0 || op == u'-' || op == u'&'))); - - UChar32 c = 0; - UBool literal = false; - UnicodeSet* nested = nullptr; // alias - do not delete - - // -------- Check for property pattern - - // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed - int8_t 
setMode = 0; - if (resemblesPropertyPattern(chars, opts)) { - setMode = 2; - } - - // -------- Parse '[' of opening delimiter OR nested set. - // If there is a nested set, use `setMode' to define how - // the set should be parsed. If the '[' is part of the - // opening delimiter for this pattern, parse special - // strings "[", "[^", "[-", and "[^-". Check for stand-in - // characters representing a nested set in the symbol - // table. - - else { - // Prepare to backup if necessary - chars.getPos(backup); - c = chars.next(opts, literal, ec); - if (U_FAILURE(ec)) return; - - if (c == u'[' && !literal) { - if (mode == 1) { - chars.setPos(backup); // backup - setMode = 1; - } else { - // Handle opening '[' delimiter - mode = 1; - patLocal.append(u'['); - chars.getPos(backup); // prepare to backup - c = chars.next(opts, literal, ec); - if (U_FAILURE(ec)) return; - if (c == u'^' && !literal) { - invert = true; - patLocal.append(u'^'); - chars.getPos(backup); // prepare to backup - c = chars.next(opts, literal, ec); - if (U_FAILURE(ec)) return; - } - // Fall through to handle special leading '-'; - // otherwise restart loop for nested [], \p{}, etc. - if (c == u'-') { - literal = true; - // Fall through to handle literal '-' below - } else { - chars.setPos(backup); // backup - continue; - } - } - } else if (symbols != nullptr) { - const UnicodeFunctor *m = symbols->lookupMatcher(c); - if (m != nullptr) { - const UnicodeSet *ms = dynamic_cast(m); - if (ms == nullptr) { - ec = U_MALFORMED_SET; - return; - } - // casting away const, but `nested' won't be modified - // (important not to modify stored set) - nested = const_cast(ms); - setMode = 3; - } - } - } - - // -------- Handle a nested set. This either is inline in - // the pattern or represented by a stand-in that has - // previously been parsed and was looked up in the symbol - // table. 
- - if (setMode != 0) { - if (lastItem == 1) { - if (op != 0) { - // syntaxError(chars, "Char expected after operator"); - ec = U_MALFORMED_SET; - return; - } - add(lastChar, lastChar); - _appendToPat(patLocal, lastChar, false); - lastItem = 0; - op = 0; - } - - if (op == u'-' || op == u'&') { - patLocal.append(op); - } - - if (nested == nullptr) { - // lazy allocation - if (!scratch.allocate()) { - ec = U_MEMORY_ALLOCATION_ERROR; - return; - } - nested = scratch.pointer(); - } - switch (setMode) { - case 1: - nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec); - break; - case 2: - chars.skipIgnored(opts); - nested->applyPropertyPattern(chars, patLocal, ec); - if (U_FAILURE(ec)) return; - break; - case 3: // `nested' already parsed - nested->_toPattern(patLocal, false); - break; - } - - usePat = true; - - if (mode == 0) { - // Entire pattern is a category; leave parse loop - *this = *nested; - mode = 2; - break; - } - - switch (op) { - case u'-': - removeAll(*nested); - break; - case u'&': - retainAll(*nested); - break; - case 0: - addAll(*nested); - break; - } - - op = 0; - lastItem = 2; - - continue; - } - - if (mode == 0) { - // syntaxError(chars, "Missing '['"); - ec = U_MALFORMED_SET; - return; - } - - // -------- Parse special (syntax) characters. If the - // current character is not special, or if it is escaped, - // then fall through and handle it below. 
- - if (!literal) { - switch (c) { - case u']': - if (lastItem == 1) { - add(lastChar, lastChar); - _appendToPat(patLocal, lastChar, false); - } - // Treat final trailing '-' as a literal - if (op == u'-') { - add(op, op); - patLocal.append(op); - } else if (op == u'&') { - // syntaxError(chars, "Trailing '&'"); - ec = U_MALFORMED_SET; - return; - } - patLocal.append(u']'); - mode = 2; - continue; - case u'-': - if (op == 0) { - if (lastItem != 0) { - op = static_cast(c); - continue; - } else { - // Treat final trailing '-' as a literal - add(c, c); - c = chars.next(opts, literal, ec); - if (U_FAILURE(ec)) return; - if (c == u']' && !literal) { - patLocal.append(u"-]", 2); - mode = 2; - continue; - } - } - } - // syntaxError(chars, "'-' not after char or set"); - ec = U_MALFORMED_SET; - return; - case u'&': - if (lastItem == 2 && op == 0) { - op = static_cast(c); - continue; - } - // syntaxError(chars, "'&' not after set"); - ec = U_MALFORMED_SET; - return; - case u'^': - // syntaxError(chars, "'^' not after '['"); - ec = U_MALFORMED_SET; - return; - case u'{': - if (op != 0) { - // syntaxError(chars, "Missing operand after operator"); - ec = U_MALFORMED_SET; - return; - } - if (lastItem == 1) { - add(lastChar, lastChar); - _appendToPat(patLocal, lastChar, false); - } - lastItem = 0; - buf.truncate(0); - { - UBool ok = false; - while (!chars.atEnd()) { - c = chars.next(opts, literal, ec); - if (U_FAILURE(ec)) return; - if (c == u'}' && !literal) { - ok = true; - break; - } - buf.append(c); - } - if (!ok) { - // syntaxError(chars, "Invalid multicharacter string"); - ec = U_MALFORMED_SET; - return; - } - } - // We have new string. 
Add it to set and continue; - // we don't need to drop through to the further - // processing - add(buf); - patLocal.append(u'{'); - _appendToPat(patLocal, buf, false); - patLocal.append(u'}'); - continue; - case SymbolTable::SYMBOL_REF: - // symbols nosymbols - // [a-$] error error (ambiguous) - // [a$] anchor anchor - // [a-$x] var "x"* literal '$' - // [a-$.] error literal '$' - // *We won't get here in the case of var "x" - { - chars.getPos(backup); - c = chars.next(opts, literal, ec); - if (U_FAILURE(ec)) return; - UBool anchor = (c == u']' && !literal); - if (symbols == nullptr && !anchor) { - c = SymbolTable::SYMBOL_REF; - chars.setPos(backup); - break; // literal '$' - } - if (anchor && op == 0) { - if (lastItem == 1) { - add(lastChar, lastChar); - _appendToPat(patLocal, lastChar, false); - } - add(U_ETHER); - usePat = true; - patLocal.append(static_cast(SymbolTable::SYMBOL_REF)); - patLocal.append(u']'); - mode = 2; - continue; - } - // syntaxError(chars, "Unquoted '$'"); - ec = U_MALFORMED_SET; - return; - } - default: - break; - } - } - - // -------- Parse literal characters. This includes both - // escaped chars ("\u4E01") and non-syntax characters - // ("a"). - - switch (lastItem) { - case 0: - lastItem = 1; - lastChar = c; - break; - case 1: - if (op == u'-') { - if (lastChar >= c) { - // Don't allow redundant (a-a) or empty (b-a) ranges; - // these are most likely typos. 
- // syntaxError(chars, "Invalid range"); - ec = U_MALFORMED_SET; - return; - } - add(lastChar, c); - _appendToPat(patLocal, lastChar, false); - patLocal.append(op); - _appendToPat(patLocal, c, false); - lastItem = 0; - op = 0; - } else { - add(lastChar, lastChar); - _appendToPat(patLocal, lastChar, false); - lastChar = c; - } - break; - case 2: - if (op != 0) { - // syntaxError(chars, "Set expected after operator"); - ec = U_MALFORMED_SET; - return; - } - lastChar = c; - lastItem = 1; - break; - } - } - - if (mode != 2) { - // syntaxError(chars, "Missing ']'"); - ec = U_MALFORMED_SET; - return; - } - - chars.skipIgnored(opts); - - /** - * Handle global flags (invert, case insensitivity). If this - * pattern should be compiled case-insensitive, then we need - * to close over case BEFORE COMPLEMENTING. This makes - * patterns like /[^abc]/i work. - */ - if ((options & USET_CASE_MASK) != 0) { - (this->*caseClosure)(options); - } - if (invert) { - complement().removeAllStrings(); // code point complement - } - - // Use the rebuilt pattern (patLocal) only if necessary. Prefer the - // generated pattern. - if (usePat) { - rebuiltPat.append(patLocal); - } else { - _generatePattern(rebuiltPat, false); - } - if (isBogus() && U_SUCCESS(ec)) { - // We likely ran out of memory. AHHH! 
- ec = U_MEMORY_ALLOCATION_ERROR; - } -#endif - //---------------------------------------------------------------- // Property set implementation //---------------------------------------------------------------- From ff092dcabea44a42717f91dc0ef73067c2520593 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 21 Aug 2025 17:51:03 +0200 Subject: [PATCH 40/56] Unused variables --- icu4c/source/common/uniset_props.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 3d9774473c76..6706cb996a6a 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -627,7 +627,6 @@ void UnicodeSet::parseRestriction(Lexer &lexer, // The loop terminates because when chars.atEnd(), op == DONE, so we go into the else branch and // return. for (;;) { - RuleCharacterIterator::Pos beforeOperator; if (lexer.lookahead().standIn() != nullptr) { // Not an operator, end of the Restriction. return; @@ -710,7 +709,6 @@ void UnicodeSet::parseElements(Lexer &lexer, } lexer.lookahead().moveAfter(); _appendToPat(rebuiltPat, first, /*escapeUnprintable=*/false); - RuleCharacterIterator::Pos beforeOperator; if (!lexer.lookahead().isUnescapedNotStandIn(u'-')) { // No operator, // Elements ::= Element From f0bd37b67e6760e052f7fb6921d53d9b4447d0d4 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 26 Aug 2025 20:07:09 +0200 Subject: [PATCH 41/56] Some work towards a proper lexer --- icu4c/source/common/uniset_props.cpp | 377 +++++++++++++++++++-------- 1 file changed, 274 insertions(+), 103 deletions(-) diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 6706cb996a6a..c3961ca2c3ae 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -235,35 +235,31 @@ class UnicodeSet::Lexer { : 0)), symbols_(symbols) {} - class Lookahead { + class LexicalElement { public: - bool isUnescaped(UChar32 codePoint) 
const { - return !escaped_ && codePoint_ == codePoint; + bool isPropertyQuery() const { + return U_SUCCESS(errorCode_) && category_ == PROPERTY_QUERY; } - bool isUnescapedNotStandIn(UChar32 codePoint) { - return isUnescaped(codePoint) && standIn() == nullptr; + bool isNamedElement() const { + return U_SUCCESS(errorCode_) && category_ == PROPERTY_QUERY; } - void moveAfter() { - lexer_.chars_.setPos(after_); - lexer_.ahead_.reset(); + bool isSetOperator(const char16_t op) const { + return U_SUCCESS(errorCode_) && category_ == SET_OPERATOR && string_[0] == op; } - bool acceptUnescapedNotStandIn(UChar32 codePoint) { - if (isUnescapedNotStandIn(codePoint)) { - moveAfter(); - return true; - } - return false; + bool isStringLiteral() const { + return U_SUCCESS(errorCode_) && category_ == STRING_LITERAL; + } + + bool isBracketedElement() const { + return U_SUCCESS(errorCode_) && category_ == BRACKETED_ELEMENT; } - bool acceptUnescaped(UChar32 codePoint) { - if (isUnescaped(codePoint)) { - moveAfter(); - return true; + std::optional element() const { + if (U_SUCCESS(errorCode_) && category_) { } - return false; } UChar32 codePoint(UErrorCode &errorCode) const { @@ -277,50 +273,37 @@ class UnicodeSet::Lexer { return escaped_; } - const UnicodeSet *standIn() { - if (!standIn_.has_value()) { - if (lexer_.symbols_ == nullptr) { - standIn_ = nullptr; - } else { - standIn_ = - dynamic_cast(lexer_.symbols_->lookupMatcher(codePoint_)); - } + const UnicodeSet *standIn() const { + if (U_FAILURE(errorCode_) || category_ != STAND_IN) { + return nullptr; } - return *standIn_; + return standIn_; }; - // Some parts of the grammar need two tokens of lookahead. The second lookahead is not cached. 
- Lookahead oneMore() { - return oneMore(lexer_.charsOptions_); - } - - Lookahead oneMore(int32_t charsOptions) { - RuleCharacterIterator::Pos before; - lexer_.chars_.getPos(before); - lexer_.chars_.setPos(after_); - auto const result = Lookahead(lexer_, lexer_.chars_, charsOptions); - lexer_.chars_.setPos(before); - return result; - } - - Lookahead(Lexer &lexer, RuleCharacterIterator &chars, int32_t charsOptions) - : lexer_(lexer) { - RuleCharacterIterator::Pos before; - chars.getPos(before); - codePoint_ = chars.next(charsOptions, escaped_, errorCode_); - chars.getPos(after_); - chars.setPos(before); - } - private: - Lexer &lexer_; + // See https://unicode.org/reports/tr61#Lexical-Elements. + enum Category { + SET_OPERATOR, + LITERAL_ELEMENT, + ESCAPED_ELEMENT, + NAMED_ELEMENT, + BRACKETED_ELEMENT, + STRING_LITERAL, + PROPERTY_QUERY, + // ICU extension: A literal-element, escaped-element, or set-operator or (but not + // bracketed-element) which is mapped to a set. This may also be an unescaped '{', in which + // case bracketed-element and string-literal are inaccessible. + STAND_IN, + }; + LexicalElement(Category category, UnicodeString string, RuleCharacterIterator::Pos after, + UErrorCode errorCode, const UnicodeSet *standIn) + : category_(category), string_(std::move(string)), after_(after), errorCode_(errorCode), + standIn_(standIn) {} + Category category_; + UnicodeString string_; RuleCharacterIterator::Pos after_; UErrorCode errorCode_; - UChar32 codePoint_; - UBool escaped_; - // `std::nullopt` if we have not yet called `lookupMatcher`, otherwise the result of - // `lookupMatcher` (which may be `nullptr`). 
- std::optional standIn_; + const UnicodeSet *standIn_; friend class Lexer; }; @@ -330,32 +313,40 @@ class UnicodeSet::Lexer { pattern_.tempSubString(parsePosition_.getIndex(), 60); } - Lookahead &lookahead() { + const bool acceptSetOperator(char16_t op) { + if (lookahead().isSetOperator(op)) { + advance(); + return true; + } + return false; + } + + const LexicalElement &lookahead() { if (!ahead_.has_value()) { - ahead_.emplace(*this, chars_, charsOptions_); + const RuleCharacterIterator::Pos before = getPos(); + ahead_.emplace(nextToken()); + chars_.setPos(before); } return *ahead_; } - bool resemblesPropertyPattern() { - Lookahead first = - Lookahead(*this, chars_, charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES); - if (first.codePoint_ != u'[' && first.codePoint_ != u'\\') { - return false; + const LexicalElement &lookahead2() { + if (!ahead2_.has_value()) { + const RuleCharacterIterator::Pos before = getPos(); + chars_.setPos(lookahead().after_); + ahead_.emplace(nextToken()); + chars_.setPos(before); } - Lookahead second = first.oneMore(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | - RuleCharacterIterator::SKIP_WHITESPACE)); - return (first.codePoint_ == u'[' && second.codePoint_ == ':') || - (first.codePoint_ == u'\\' && - (second.codePoint_ == u'p' || second.codePoint_ == u'P' || second.codePoint_ == u'N')); + return *ahead_; } // For use in functions that take the `RuleCharacterIterator` directly; clears the lookahead cache so // that any advancement of the `RuleCharacterIterator` is taken into account by subsequent calls to - // `lookahead`. The resulting `RuleCharacterIterator` must not be used once `lookahead` has been - // called. + // `lookahead`. The resulting `RuleCharacterIterator` must not be used once `lookahead` or + // `lookahead2` has been called again. 
RuleCharacterIterator &getCharacterIterator() { ahead_.reset(); + ahead2_.reset(); return chars_; } @@ -367,13 +358,192 @@ class UnicodeSet::Lexer { return chars_.atEnd(); } + void advance() { + chars_.setPos(lookahead().after_); + ahead_ = ahead2_; + ahead2_.reset(); + } + private: + // A version of getPos that returns its position instead of taking it as at out parameter, so we + // can have const positions. + RuleCharacterIterator::Pos getPos() const { + RuleCharacterIterator::Pos result; + chars_.getPos(result); + return result; + } + + LexicalElement nextToken() { + UErrorCode errorCode = U_ZERO_ERROR; + const RuleCharacterIterator::Pos before = getPos(); + // First try to get the next character without parsing escapes. + UBool unusedEscaped; + const UChar32 first = + chars_.next(charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES, unusedEscaped, errorCode); + // '[', named-element, and property-query cannot be disabled by stand-in. + if (first == u'[' || first == u'\\') { + // This could be a property-query or named-element. 
+ const UChar32 second = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | + RuleCharacterIterator::SKIP_WHITESPACE), + unusedEscaped, errorCode); + if ((first == u'[' && second == u':') || + (first == u'\\' && (second == u'p' || second == u'P' || second == u'N'))) { + if (second == u'N') { + const UChar32 third = + chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | + RuleCharacterIterator::SKIP_WHITESPACE), + unusedEscaped, errorCode); + if (third == u'{') { + while (!chars_.atEnd()) { + UChar32 last = + chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | + RuleCharacterIterator::SKIP_WHITESPACE), + unusedEscaped, errorCode); + if (last == u'}') { + return LexicalElement(LexicalElement::NAMED_ELEMENT, {}, getPos(), + errorCode, + /*standIn=*/nullptr); + } + } + } + return LexicalElement(LexicalElement::NAMED_ELEMENT, {}, getPos(), U_MALFORMED_SET, + /*standIn=*/nullptr); + } + // Do not skip whitespace so we can recognize unspaced :]. Lex escapes and + // named-element: while ICU does not support string-valued properties and thus has no + // use for escapes, we still want to lex through escapes to allow downstream + // implementations (mostly unicodetools) to implement string-valued properties. 
+ if (first == u'\\') { + const UChar32 third = + chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | + RuleCharacterIterator::SKIP_WHITESPACE), + unusedEscaped, errorCode); + if (third != u'{') { + return LexicalElement(LexicalElement::PROPERTY_QUERY, {}, getPos(), + U_MALFORMED_SET, + /*standIn=*/nullptr); + } + } + RuleCharacterIterator::Pos beforePenultimate = getPos(); + UChar32 penultimateUnescaped = + chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | + RuleCharacterIterator::SKIP_WHITESPACE), + unusedEscaped, errorCode); + + while (!chars_.atEnd()) { + const RuleCharacterIterator::Pos beforeLast = getPos(); + UChar32 lastUnescaped = + chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | + RuleCharacterIterator::SKIP_WHITESPACE), + unusedEscaped, errorCode); + if (penultimateUnescaped == u'\\') { + if (lastUnescaped == 'N') { + const UChar32 namedElementOpening = + chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | + RuleCharacterIterator::SKIP_WHITESPACE), + unusedEscaped, errorCode); + bool namedElementOK = false; + if (namedElementOpening == u'{') { + while (!chars_.atEnd()) { + UChar32 namedElementLast = chars_.next( + charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | + RuleCharacterIterator::SKIP_WHITESPACE), + unusedEscaped, errorCode); + if (namedElementLast == u'}') { + namedElementOK = true; + } + } + } + if (!namedElementOK) { + return LexicalElement(LexicalElement::NAMED_ELEMENT, {}, getPos(), + U_MALFORMED_SET, + /*standIn=*/nullptr); + } + } else { + // There must be an escaped-element starting at beforePenultimate. Go + // back there and advance through it. + chars_.setPos(beforePenultimate); + chars_.next(charsOptions_ & ~RuleCharacterIterator::SKIP_WHITESPACE, + unusedEscaped, errorCode); + } + // Neither a named-element nor an escaped-element can be part of a closing :]. 
+ lastUnescaped = -1; + } else if ((first == u'[' && penultimateUnescaped == u':' && + lastUnescaped == u']') || + (first == u'\\' && lastUnescaped == u'}')) { + return LexicalElement(LexicalElement::PROPERTY_QUERY, {}, getPos(), errorCode, + /*standIn=*/nullptr); + } + beforePenultimate = beforeLast; + penultimateUnescaped = lastUnescaped; + } + return; + } + } + if (first == u'[') { + return LexicalElement(LexicalElement::SET_OPERATOR, UnicodeString(u'['), getPos(), errorCode, + /*standIn=*/nullptr); + } + + if (first == u'\\') { + // Now try to parse the escape. + chars_.setPos(before); + UChar32 codePoint = chars_.next(charsOptions_, unusedEscaped, errorCode); + const UnicodeSet *const standIn = + dynamic_cast(symbols_->lookupMatcher(codePoint)); + return LexicalElement(standIn == nullptr ? LexicalElement::ESCAPED_ELEMENT + : LexicalElement::STAND_IN, + standIn == nullptr ? UnicodeString(codePoint) : UnicodeString(), + getPos(), + errorCode, standIn); + } + if (const UnicodeSet *const standIn = + dynamic_cast(symbols_->lookupMatcher(first)); + standIn != nullptr) { + return LexicalElement(LexicalElement::STAND_IN, {}, getPos(), errorCode, standIn); + } + + switch (first) { + case u'&': + case u'-': + case u'[': + case u']': + case u'^': + case u'$': + // We make $ a set-operator to handle the ICU extensions involving $. + return LexicalElement(LexicalElement::SET_OPERATOR, UnicodeString(first), getPos(), + errorCode, + /*standIn=*/nullptr); + case u'{': { + UnicodeString string; + UBool escaped; + UChar32 next; + while (!chars_.atEnd()) { + next = chars_.next(charsOptions_, escaped, errorCode); + if (!escaped && next == u'}') { + return LexicalElement(string.length() == 1 ? 
LexicalElement::BRACKETED_ELEMENT + : LexicalElement::STRING_LITERAL, + std::move(string), getPos(), errorCode, + /*standIn=*/nullptr); + } + string.append(next); + } + return LexicalElement(LexicalElement::STRING_LITERAL, {}, getPos(), U_MALFORMED_SET, + /*standIn=*/nullptr); + } + default: + return LexicalElement(LexicalElement::LITERAL_ELEMENT, UnicodeString(first), getPos(), + errorCode, nullptr); + } + } + const UnicodeString &pattern_; const ParsePosition &parsePosition_; RuleCharacterIterator &chars_; const int32_t charsOptions_; const SymbolTable *const symbols_; - std::optional ahead_; + std::optional ahead_; + std::optional ahead2_; }; namespace { @@ -480,8 +650,11 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer, bool preserveSyntaxInPattern = false; // A pattern that preserves the original syntax but strips spaces, normalizes escaping, etc. UnicodeString prettyPrintedPattern; - if (lexer.resemblesPropertyPattern()) { + if (lexer.lookahead().isPropertyQuery() || lexer.lookahead().isNamedElement()) { // UnicodeSet ::= property-query | named-element + // NOTE(egg): For now, we throw away the work that the lexer did to find out where the + // property-query or named-element ended in order to retain the existing buggy behaviour of + // variables containing property queries. lexer.getCharacterIterator().skipIgnored(lexer.charsOptions()); UnicodeSet propertyQuery; propertyQuery.applyPropertyPattern(lexer.getCharacterIterator(), prettyPrintedPattern, ec); @@ -493,31 +666,30 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer, // UnicodeSet ::= [ Union ] // | Complement ::= [ ^ Union ] // Extension: - // | MatcherSymbol - // Where a MatcherSymbol may be a character or an escape. - // Strings that would match MatcherSymbol effectively get removed from + // | stand-in + // Where a stand-in may be a character or an escape. + // Strings that would match stand-in effectively get removed from // all other terminals of the grammar, except [. 
- if (lexer.lookahead().acceptUnescaped(u'[')) { + if (lexer.acceptSetOperator(u'[')) { prettyPrintedPattern.append(u'['); - if (lexer.lookahead().acceptUnescaped(u'^')) { + if (lexer.acceptSetOperator(u'^')) { prettyPrintedPattern.append(u'^'); isComplement = true; } parseUnion(lexer, prettyPrintedPattern, options, caseClosure, depth, /*containsRestrictions=*/preserveSyntaxInPattern, ec); U_UNICODESET_RETURN_IF_ERROR(ec); - if (!lexer.lookahead().acceptUnescaped(u']')) { + if (!lexer.acceptSetOperator(u']')) { U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", lexer.lookahead().codePoint(ec), lexer, ec); } prettyPrintedPattern.append(u']'); - } else { - const UnicodeSet *set = lexer.lookahead().standIn(); - if (set != nullptr) { - *this = *set; - this->_toPattern(rebuiltPat, /*escapeUnprintable=*/false); - lexer.lookahead().moveAfter(); + } else if (const UnicodeSet *const standIn = lexer.lookahead().standIn(); + standIn != nullptr) { + *this = *standIn; + this->_toPattern(rebuiltPat, /*escapeUnprintable=*/false); + lexer.advance(); return; - } + } else { U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)", lexer.lookahead().codePoint(ec), lexer, ec); } @@ -555,33 +727,32 @@ void UnicodeSet::parseUnion(Lexer &lexer, // | UnescapedHyphenMinus Terms UnescapedHyphenMinus // Terms ::= "" // | Terms Term - if (lexer.lookahead().acceptUnescapedNotStandIn(u'-')) { + if (lexer.acceptSetOperator(u'-')) { add(u'-'); // When we otherwise preserve the syntax, we escape an initial UnescapedHyphenMinus, but not a // final one, for consistency with older ICU behaviour. rebuiltPat.append(u"\\-"); } while (!lexer.atEnd()) { - if (lexer.lookahead().acceptUnescapedNotStandIn(u'-')) { + if (lexer.acceptSetOperator(u'-')) { // We can be here on the first iteration: [--] is allowed by the // grammar and by the old parser. 
rebuiltPat.append(u'-'); add(u'-'); return; - } else if (lexer.lookahead().isUnescapedNotStandIn(u'$')) { - Lexer::Lookahead afterDollar = lexer.lookahead().oneMore(); - if (afterDollar.isUnescaped(u']')) { + } else if (lexer.lookahead().isSetOperator(u'$')) { + if (lexer.lookahead2().isSetOperator(u']')) { // ICU extensions: A $ is allowed as a literal-element. // A Term at the end of a Union consisting of a single $ is an anchor. rebuiltPat.append(u'$'); // Consume the dollar. - lexer.lookahead().moveAfter(); + lexer.advance(); add(U_ETHER); containsRestrictions = true; return; } } - if (lexer.lookahead().isUnescapedNotStandIn(u']')) { + if (lexer.lookahead().isSetOperator(u']')) { return; } parseTerm(lexer, rebuiltPat, options, caseClosure, depth, containsRestrictions, ec); @@ -598,8 +769,8 @@ void UnicodeSet::parseTerm(Lexer &lexer, UErrorCode &ec) { // Term ::= Elements // | Restriction - if (lexer.lookahead().standIn() != nullptr || lexer.lookahead().isUnescaped('[') || - lexer.resemblesPropertyPattern()) { + if (lexer.lookahead().standIn() != nullptr || lexer.lookahead().isSetOperator('[') || + lexer.lookahead().isPropertyQuery() || lexer.lookahead().isNamedElement()) { containsRestriction = true; parseRestriction(lexer, rebuiltPat, options, caseClosure, depth, ec); U_UNICODESET_RETURN_IF_ERROR(ec); @@ -627,26 +798,22 @@ void UnicodeSet::parseRestriction(Lexer &lexer, // The loop terminates because when chars.atEnd(), op == DONE, so we go into the else branch and // return. for (;;) { - if (lexer.lookahead().standIn() != nullptr) { - // Not an operator, end of the Restriction. 
- return; - } - if (lexer.lookahead().acceptUnescaped(u'&')) { + if (lexer.acceptSetOperator(u'&')) { // Intersection ::= Restriction & UnicodeSet rebuiltPat.append(u'&'); UnicodeSet rightHandSide; rightHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec); U_UNICODESET_RETURN_IF_ERROR(ec); retainAll(rightHandSide); - } else if (lexer.lookahead().isUnescaped(u'-')) { + } else if (lexer.lookahead().isSetOperator(u'-')) { // Here the grammar requires two tokens of lookahead to figure out whether the - the operator // of a Difference or an UnescapedHyphenMinus in the enclosing Union. - if (lexer.lookahead().oneMore().isUnescaped(u']')) { + if (lexer.lookahead2().isSetOperator(u']')) { // The operator is actually an UnescapedHyphenMinus; terminate the Restriction before it. return; } // Consume the hyphen-minus. - lexer.lookahead().moveAfter(); + lexer.advance(); // Difference ::= Restriction - UnicodeSet rebuiltPat.append(u'-'); UnicodeSet rightHandSide; @@ -672,6 +839,10 @@ void UnicodeSet::parseElements(Lexer &lexer, // | escaped-element // Element ::= RangeElement // | string-literal + // | bracketed-element + if (lexer.lookahead().isBracketedElement() || lexer.lookahead().isStringLiteral()) { + add(lexer.lookahead().) 
+ } const UChar32 first = lexer.lookahead().codePoint(ec); U_UNICODESET_RETURN_IF_ERROR(ec); if (!lexer.lookahead().escaped()) { From b78c0ce1364c3c9d727b722082c3970a25efabb0 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 27 Aug 2025 13:45:46 +0200 Subject: [PATCH 42/56] A proper lexer --- icu4c/source/common/uniset_props.cpp | 315 ++++++++++++++++----------- 1 file changed, 184 insertions(+), 131 deletions(-) diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index c3961ca2c3ae..63356dcc2b11 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -18,6 +18,9 @@ * Character property dependent functions moved here from uniset.cpp */ +#include +#include + #include "unicode/utypes.h" #include "unicode/uniset.h" #include "unicode/parsepos.h" @@ -44,7 +47,6 @@ #include "umutex.h" #include "uassert.h" #include "hash.h" -#include U_NAMESPACE_USE @@ -242,7 +244,7 @@ class UnicodeSet::Lexer { } bool isNamedElement() const { - return U_SUCCESS(errorCode_) && category_ == PROPERTY_QUERY; + return U_SUCCESS(errorCode_) && category_ == NAMED_ELEMENT; } bool isSetOperator(const char16_t op) const { @@ -258,19 +260,20 @@ class UnicodeSet::Lexer { } std::optional element() const { - if (U_SUCCESS(errorCode_) && category_) { + if (U_SUCCESS(errorCode_) && + (category_ == LITERAL_ELEMENT || category_ == ESCAPED_ELEMENT || + category_ == BRACKETED_ELEMENT || category_ == STRING_LITERAL)) { + return string_; } + return std::nullopt; } - UChar32 codePoint(UErrorCode &errorCode) const { - if (!U_FAILURE(errorCode)) { - errorCode = errorCode; + std::optional codePoint() const { + if (U_SUCCESS(errorCode_) && (category_ == LITERAL_ELEMENT || category_ == ESCAPED_ELEMENT || + category_ == BRACKETED_ELEMENT)) { + return string_.char32At(0); } - return codePoint_; - } - - bool escaped() const { - return escaped_; + return std::nullopt; } const UnicodeSet *standIn() const { @@ -278,11 +281,29 @@ class 
UnicodeSet::Lexer { return nullptr; } return standIn_; - }; + } + + const UErrorCode& errorCode() const{ + return errorCode_; + } + + UnicodeString debugString() const { + UnicodeString result; + if (U_FAILURE(errorCode_)) { + result.append(u"Ill-formed token ("); + result.append(UnicodeString::fromUTF8(u_errorName(errorCode_))); + result.append(u"), possibly "); + } + result.append(category_names_[category_]); + result.append(u" '"); + result.append(sourceText_); + result.append(u"'"); + return result; + } private: // See https://unicode.org/reports/tr61#Lexical-Elements. - enum Category { + enum Category : std::uint8_t { SET_OPERATOR, LITERAL_ELEMENT, ESCAPED_ELEMENT, @@ -295,15 +316,26 @@ class UnicodeSet::Lexer { // case bracketed-element and string-literal are inaccessible. STAND_IN, }; + static constexpr std::array category_names_{{ + u"set-operator", + u"literal-element", + u"escaped-element", + u"named-element", + u"bracketed-element", + u"string-literal", + u"property-query", + u"stand-in", + }}; LexicalElement(Category category, UnicodeString string, RuleCharacterIterator::Pos after, - UErrorCode errorCode, const UnicodeSet *standIn) + UErrorCode errorCode, const UnicodeSet *standIn, std::u16string_view sourceText) : category_(category), string_(std::move(string)), after_(after), errorCode_(errorCode), - standIn_(standIn) {} + standIn_(standIn), sourceText_(sourceText) {} Category category_; UnicodeString string_; RuleCharacterIterator::Pos after_; UErrorCode errorCode_; const UnicodeSet *standIn_; + std::u16string_view sourceText_; friend class Lexer; }; @@ -332,21 +364,27 @@ class UnicodeSet::Lexer { const LexicalElement &lookahead2() { if (!ahead2_.has_value()) { + // Note that if someone has called `getCharacterIterator` and played with the result, + // `before` may not actually be before `ahead_`, but we do not actually depend on this here, + // since we start from ahead_.after_. 
const RuleCharacterIterator::Pos before = getPos(); chars_.setPos(lookahead().after_); - ahead_.emplace(nextToken()); + ahead2_.emplace(nextToken()); chars_.setPos(before); } - return *ahead_; + return *ahead2_; } - // For use in functions that take the `RuleCharacterIterator` directly; clears the lookahead cache so - // that any advancement of the `RuleCharacterIterator` is taken into account by subsequent calls to - // `lookahead`. The resulting `RuleCharacterIterator` must not be used once `lookahead` or - // `lookahead2` has been called again. + // For use in older functions that take the `RuleCharacterIterator` directly. + // Any advancement of the resulting `RuleCharacterIterator` has no effect on the result of subsequent + // calls to `lookahead`, `lookahead2`, `advance`, or `acceptSetOperator`. + // Once `advance` or `acceptSetOperator` has been called, the result of a call to + // `getCharacterIterator` preceding the call to `advance` or `acceptSetOperator` must no longer be + // used. RuleCharacterIterator &getCharacterIterator() { - ahead_.reset(); - ahead2_.reset(); + // Make sure we compute a correct `ahead_.after_` so we do not depend on the current value of + // `getPos()` for lexing. + lookahead(); return chars_; } @@ -359,6 +397,10 @@ class UnicodeSet::Lexer { } void advance() { + // If someone called `getCharacterIterator`, we are now changing the character iterator under + // their feet; further, we may not have an `ahead_`, so if they keep playing with it we would be + // working on incorrect values of `getPos`. This is why the result of `getCharacterIterator` + // must no longer be used. chars_.setPos(lookahead().after_); ahead_ = ahead2_; ahead2_.reset(); @@ -375,6 +417,7 @@ class UnicodeSet::Lexer { LexicalElement nextToken() { UErrorCode errorCode = U_ZERO_ERROR; + const int32_t start = parsePosition_.getIndex(); const RuleCharacterIterator::Pos before = getPos(); // First try to get the next character without parsing escapes. 
UBool unusedEscaped; @@ -382,6 +425,7 @@ class UnicodeSet::Lexer { chars_.next(charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES, unusedEscaped, errorCode); // '[', named-element, and property-query cannot be disabled by stand-in. if (first == u'[' || first == u'\\') { + const RuleCharacterIterator::Pos afterFirst = getPos(); // This could be a property-query or named-element. const UChar32 second = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | RuleCharacterIterator::SKIP_WHITESPACE), @@ -402,12 +446,16 @@ class UnicodeSet::Lexer { if (last == u'}') { return LexicalElement(LexicalElement::NAMED_ELEMENT, {}, getPos(), errorCode, - /*standIn=*/nullptr); + /*standIn=*/nullptr, + std::u16string_view(pattern_).substr( + start, parsePosition_.getIndex() - start)); } } } - return LexicalElement(LexicalElement::NAMED_ELEMENT, {}, getPos(), U_MALFORMED_SET, - /*standIn=*/nullptr); + return LexicalElement( + LexicalElement::NAMED_ELEMENT, {}, getPos(), U_ILLEGAL_ARGUMENT_ERROR, + /*standIn=*/nullptr, + std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); } // Do not skip whitespace so we can recognize unspaced :]. 
Lex escapes and // named-element: while ICU does not support string-valued properties and thus has no @@ -420,8 +468,10 @@ class UnicodeSet::Lexer { unusedEscaped, errorCode); if (third != u'{') { return LexicalElement(LexicalElement::PROPERTY_QUERY, {}, getPos(), - U_MALFORMED_SET, - /*standIn=*/nullptr); + U_ILLEGAL_ARGUMENT_ERROR, + /*standIn=*/nullptr, + std::u16string_view(pattern_).substr( + start, parsePosition_.getIndex() - start)); } } RuleCharacterIterator::Pos beforePenultimate = getPos(); @@ -456,8 +506,10 @@ class UnicodeSet::Lexer { } if (!namedElementOK) { return LexicalElement(LexicalElement::NAMED_ELEMENT, {}, getPos(), - U_MALFORMED_SET, - /*standIn=*/nullptr); + U_ILLEGAL_ARGUMENT_ERROR, + /*standIn=*/nullptr, + std::u16string_view(pattern_).substr( + start, parsePosition_.getIndex() - start)); } } else { // There must be an escaped-element starting at beforePenultimate. Go @@ -471,18 +523,27 @@ class UnicodeSet::Lexer { } else if ((first == u'[' && penultimateUnescaped == u':' && lastUnescaped == u']') || (first == u'\\' && lastUnescaped == u'}')) { - return LexicalElement(LexicalElement::PROPERTY_QUERY, {}, getPos(), errorCode, - /*standIn=*/nullptr); + return LexicalElement( + LexicalElement::PROPERTY_QUERY, {}, getPos(), errorCode, + /*standIn=*/nullptr, + std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); } beforePenultimate = beforeLast; penultimateUnescaped = lastUnescaped; } - return; + return LexicalElement( + LexicalElement::PROPERTY_QUERY, {}, getPos(), U_ILLEGAL_ARGUMENT_ERROR, + /*standIn=*/nullptr, + std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); } + // Not a property-query. 
+ chars_.setPos(afterFirst); } if (first == u'[') { - return LexicalElement(LexicalElement::SET_OPERATOR, UnicodeString(u'['), getPos(), errorCode, - /*standIn=*/nullptr); + return LexicalElement( + LexicalElement::SET_OPERATOR, UnicodeString(u'['), getPos(), errorCode, + /*standIn=*/nullptr, + std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); } if (first == u'\\') { @@ -490,50 +551,62 @@ class UnicodeSet::Lexer { chars_.setPos(before); UChar32 codePoint = chars_.next(charsOptions_, unusedEscaped, errorCode); const UnicodeSet *const standIn = - dynamic_cast(symbols_->lookupMatcher(codePoint)); - return LexicalElement(standIn == nullptr ? LexicalElement::ESCAPED_ELEMENT - : LexicalElement::STAND_IN, - standIn == nullptr ? UnicodeString(codePoint) : UnicodeString(), - getPos(), - errorCode, standIn); + symbols_ == nullptr + ? nullptr + : dynamic_cast(symbols_->lookupMatcher(codePoint)); + return LexicalElement( + standIn == nullptr ? LexicalElement::ESCAPED_ELEMENT : LexicalElement::STAND_IN, + standIn == nullptr ? UnicodeString(codePoint) : UnicodeString(), getPos(), errorCode, + standIn, std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); } - if (const UnicodeSet *const standIn = + if (symbols_ != nullptr) { + const UnicodeSet *const standIn = dynamic_cast(symbols_->lookupMatcher(first)); - standIn != nullptr) { - return LexicalElement(LexicalElement::STAND_IN, {}, getPos(), errorCode, standIn); + if (standIn != nullptr) { + return LexicalElement( + LexicalElement::STAND_IN, {}, getPos(), errorCode, standIn, + std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); + } } switch (first) { case u'&': case u'-': - case u'[': case u']': case u'^': case u'$': // We make $ a set-operator to handle the ICU extensions involving $. 
- return LexicalElement(LexicalElement::SET_OPERATOR, UnicodeString(first), getPos(), - errorCode, - /*standIn=*/nullptr); + return LexicalElement( + LexicalElement::SET_OPERATOR, UnicodeString(first), getPos(), errorCode, + /*standIn=*/nullptr, + std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); case u'{': { UnicodeString string; UBool escaped; UChar32 next; + int32_t codePointCount = 0; while (!chars_.atEnd()) { next = chars_.next(charsOptions_, escaped, errorCode); if (!escaped && next == u'}') { - return LexicalElement(string.length() == 1 ? LexicalElement::BRACKETED_ELEMENT - : LexicalElement::STRING_LITERAL, - std::move(string), getPos(), errorCode, - /*standIn=*/nullptr); + return LexicalElement( + codePointCount == 1 ? LexicalElement::BRACKETED_ELEMENT + : LexicalElement::STRING_LITERAL, + std::move(string), getPos(), errorCode, + /*standIn=*/nullptr, + std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); } string.append(next); + codePointCount += 1; } - return LexicalElement(LexicalElement::STRING_LITERAL, {}, getPos(), U_MALFORMED_SET, - /*standIn=*/nullptr); + return LexicalElement( + LexicalElement::STRING_LITERAL, {}, getPos(), U_MALFORMED_SET, + /*standIn=*/nullptr, + std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); } default: - return LexicalElement(LexicalElement::LITERAL_ELEMENT, UnicodeString(first), getPos(), - errorCode, nullptr); + return LexicalElement( + LexicalElement::LITERAL_ELEMENT, UnicodeString(first), getPos(), errorCode, nullptr, + std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); } } @@ -550,7 +623,7 @@ namespace { constexpr int32_t MAX_DEPTH = 100; -#define U_DEBUGGING_UNICODESET_PARSING 0 +#define U_DEBUGGING_UNICODESET_PARSING 1 #if U_DEBUGGING_UNICODESET_PARSING @@ -573,11 +646,15 @@ constexpr int32_t MAX_DEPTH = 100; static_assert(functionName.substr(0, 5) == "parse"); \ std::string actualUTF8; \ std::string 
contextUTF8; \ - printf("*** Expected %s, got '%s' %s\n", (expected), \ + printf("*** Expected %s, got %s %s\n", (expected), \ UnicodeString(actual).toUTF8String(actualUTF8).c_str(), \ lexer.getPositionForDebugging().toUTF8String(contextUTF8).c_str()); \ printf("--- in %s l. %d\n", __func__ + 5, __LINE__); \ - (ec) = U_MALFORMED_SET; \ + if (U_FAILURE(lexer.lookahead().errorCode())) { \ + (ec) = lexer.lookahead().errorCode(); \ + } else { \ + (ec) = U_MALFORMED_SET; \ + } \ return; \ } while (false) @@ -591,7 +668,11 @@ constexpr int32_t MAX_DEPTH = 100; } while (false) #define U_UNICODESET_RETURN_WITH_PARSE_ERROR(expected, actual, lexer, ec) \ do { \ - (ec) = U_MALFORMED_SET; \ + if (U_FAILURE(lexer.lookahead().errorCode())) { \ + (ec) = lexer.lookahead().errorCode(); \ + } else { \ + (ec) = U_MALFORMED_SET; \ + } \ return; \ } while (false) @@ -652,13 +733,16 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer, UnicodeString prettyPrintedPattern; if (lexer.lookahead().isPropertyQuery() || lexer.lookahead().isNamedElement()) { // UnicodeSet ::= property-query | named-element - // NOTE(egg): For now, we throw away the work that the lexer did to find out where the + // NOTE(egg): For now, we ignore the work that the lexer did to find out where the // property-query or named-element ended in order to retain the existing buggy behaviour of // variables containing property queries. lexer.getCharacterIterator().skipIgnored(lexer.charsOptions()); UnicodeSet propertyQuery; propertyQuery.applyPropertyPattern(lexer.getCharacterIterator(), prettyPrintedPattern, ec); U_UNICODESET_RETURN_IF_ERROR(ec); + // But now, we go back to our lexing and advance through the property-query or named-element as + // lexed. If there was no error, the old and the new code should agree on the extent. 
+ lexer.advance(); addAll(propertyQuery); preserveSyntaxInPattern = true; } else { @@ -680,7 +764,7 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer, /*containsRestrictions=*/preserveSyntaxInPattern, ec); U_UNICODESET_RETURN_IF_ERROR(ec); if (!lexer.acceptSetOperator(u']')) { - U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", lexer.lookahead().codePoint(ec), lexer, ec); + U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", lexer.lookahead().debugString(), lexer, ec); } prettyPrintedPattern.append(u']'); } else if (const UnicodeSet *const standIn = lexer.lookahead().standIn(); @@ -691,7 +775,8 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer, return; } else { U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)", - lexer.lookahead().codePoint(ec), lexer, ec); + lexer.lookahead().debugString(), lexer, + ec); } } @@ -841,46 +926,27 @@ void UnicodeSet::parseElements(Lexer &lexer, // | string-literal // | bracketed-element if (lexer.lookahead().isBracketedElement() || lexer.lookahead().isStringLiteral()) { - add(lexer.lookahead().) 
+ add(*lexer.lookahead().element()); + rebuiltPat.append(u'{'); + _appendToPat(rebuiltPat, *lexer.lookahead().element(), /*escapeUnprintable=*/false); + rebuiltPat.append(u'}'); + lexer.advance(); + return; } - const UChar32 first = lexer.lookahead().codePoint(ec); - U_UNICODESET_RETURN_IF_ERROR(ec); - if (!lexer.lookahead().escaped()) { - switch (first) { - case u'-': - case u'&': - case u'[': - case u']': - case u'^': - U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement | string-literal", first, lexer, ec); - case u'{': { - lexer.lookahead().moveAfter(); - rebuiltPat.append(u'{'); - UnicodeString string; - while (!lexer.atEnd()) { - if (lexer.lookahead().acceptUnescaped('}')) { - rebuiltPat.append(u'}'); - add(string); - return; - } - const UChar32 c = lexer.lookahead().codePoint(ec); - U_UNICODESET_RETURN_IF_ERROR(ec); - lexer.lookahead().moveAfter(); - _appendToPat(rebuiltPat, c, /*escapeUnprintable=*/false); - string.append(c); - } - U_UNICODESET_RETURN_WITH_PARSE_ERROR("}", RuleCharacterIterator::DONE, lexer, ec); - } - case u'}': - case u'$': - // Disallowed by UTS #61, but historically accepted by ICU. This is an extension. - default: - break; - } + UChar32 first; + if (lexer.lookahead().isSetOperator(u'$')) { + // Disallowed by UTS #61, but historically accepted by ICU. This is an extension. 
+ first = u'$'; + } else if (lexer.lookahead().codePoint().has_value()) { + first = *lexer.lookahead().codePoint(); + } else { + U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement | string-literal | bracketed-element", + lexer.lookahead().debugString(), + lexer, ec); } - lexer.lookahead().moveAfter(); + lexer.advance(); _appendToPat(rebuiltPat, first, /*escapeUnprintable=*/false); - if (!lexer.lookahead().isUnescapedNotStandIn(u'-')) { + if (!lexer.lookahead().isSetOperator(u'-')) { // No operator, // Elements ::= Element add(first); @@ -888,50 +954,37 @@ void UnicodeSet::parseElements(Lexer &lexer, } // Here the grammar requires two tokens of lookahead to figure out whether the - the operator // of a Range or an UnescapedHyphenMinus in the enclosing Union. - if (lexer.lookahead().oneMore().isUnescaped(u']')) { + if (lexer.lookahead2().isSetOperator(u']')) { // The operator is actually an UnescapedHyphenMinus; terminate the Elements before it. add(first); return; } // Consume the hyphen-minus. - lexer.lookahead().moveAfter(); + lexer.advance(); // Elements ::= Range ::= RangeElement - RangeElement rebuiltPat.append(u'-'); - const UChar32 last = lexer.lookahead().codePoint(ec); - U_UNICODESET_RETURN_IF_ERROR(ec); - if (lexer.lookahead().standIn() != nullptr) { - U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, lexer, ec); - } - if (!lexer.lookahead().escaped()) { - switch (last) { - case u'-': - case u'&': - case u'[': - case u']': - case u'^': - case u'{': - U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", last, lexer, ec); - case u'$': { - // Disallowed by UTS #61, but historically accepted by ICU except at the end of a Union. - // This is an extension. - if (lexer.lookahead().oneMore().isUnescaped(u']')) { - U_UNICODESET_RETURN_WITH_PARSE_ERROR("Term after Range ending in unescaped $", u']', - lexer, ec); - } - break; - } - case u'}': - // Disallowed by UTS #61, but historically accepted by ICU. This is an extension. 
- default: - break; + UChar32 last; + if (lexer.lookahead().isSetOperator(u'$')) { + // Disallowed by UTS #61, but historically accepted by ICU except at the end of a Union. + // This is an extension. + last = u'$'; + if (lexer.lookahead2().isSetOperator(u']')) { + U_UNICODESET_RETURN_WITH_PARSE_ERROR("Term after Range ending in unescaped $", + lexer.lookahead().debugString() + u" followed by " + + lexer.lookahead2().debugString(), + lexer, ec); } + } else if (lexer.lookahead().codePoint().has_value()) { + last = *lexer.lookahead().codePoint(); + } else { + U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", lexer.lookahead().debugString(), lexer, ec); } - lexer.lookahead().moveAfter(); - _appendToPat(rebuiltPat, last, /*escapeUnprintable=*/false); if (last <= first) { U_UNICODESET_RETURN_WITH_PARSE_ERROR( "first < last in Range", UnicodeString(last) + u"-" + UnicodeString(first), lexer, ec); } + lexer.advance(); + _appendToPat(rebuiltPat, last, /*escapeUnprintable=*/false); add(first, last); return; } From d61b09076bcb1cc2b9728d13143a337dddaa76c5 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 27 Aug 2025 13:54:37 +0200 Subject: [PATCH 43/56] =?UTF-8?q?Don=E2=80=99t=20report=20end=20of=20text?= =?UTF-8?q?=20as=20a=20literal-element?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- icu4c/source/common/unicode/uniset.h | 3 --- icu4c/source/common/uniset_props.cpp | 36 +++++++--------------------- 2 files changed, 8 insertions(+), 31 deletions(-) diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index d805fd9e8156..538eb264e974 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -1799,9 +1799,6 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { static UBool resemblesPropertyPattern(const UnicodeString& pattern, int32_t pos); - static UBool resemblesPropertyPattern(RuleCharacterIterator& chars, - int32_t iterOpts); - /** 
* Parse the given property pattern at the given parse position * and set this UnicodeSet to the result. diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 63356dcc2b11..0de278955e74 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -315,8 +315,9 @@ class UnicodeSet::Lexer { // bracketed-element) which is mapped to a set. This may also be an unescaped '{', in which // case bracketed-element and string-literal are inaccessible. STAND_IN, + END_OF_TEXT, }; - static constexpr std::array category_names_{{ + static constexpr std::array category_names_{{ u"set-operator", u"literal-element", u"escaped-element", @@ -325,6 +326,7 @@ class UnicodeSet::Lexer { u"string-literal", u"property-query", u"stand-in", + u"(end of text)", }}; LexicalElement(Category category, UnicodeString string, RuleCharacterIterator::Pos after, UErrorCode errorCode, const UnicodeSet *standIn, std::u16string_view sourceText) @@ -417,6 +419,11 @@ class UnicodeSet::Lexer { LexicalElement nextToken() { UErrorCode errorCode = U_ZERO_ERROR; + chars_.skipIgnored(charsOptions_); + if (chars_.atEnd()) { + return LexicalElement(LexicalElement::END_OF_TEXT, {}, getPos(), errorCode, + /*standIn=*/nullptr, u""); + } const int32_t start = parsePosition_.getIndex(); const RuleCharacterIterator::Pos before = getPos(); // First try to get the next character without parsing escapes. @@ -1337,33 +1344,6 @@ UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern, return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos); } -/** - * Return true if the given iterator appears to point at a - * property pattern. Regardless of the result, return with the - * iterator unchanged. - * @param chars iterator over the pattern characters. Upon return - * it will be unchanged. 
- * @param iterOpts RuleCharacterIterator options - */ -UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars, - int32_t iterOpts) { - // NOTE: literal will always be false, because we don't parse escapes. - UBool result = false, literal; - UErrorCode ec = U_ZERO_ERROR; - iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES; - RuleCharacterIterator::Pos pos; - chars.getPos(pos); - UChar32 c = chars.next(iterOpts, literal, ec); - if (c == u'[' || c == u'\\') { - UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE, - literal, ec); - result = (c == u'[') ? (d == u':') : - (d == u'N' || d == u'p' || d == u'P'); - } - chars.setPos(pos); - return result && U_SUCCESS(ec); -} - /** * Parse the given property pattern at the given parse position. */ From 40460d9dbd5cfbdaa393c9e7f302a7d840e8a2af Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 27 Aug 2025 13:57:57 +0200 Subject: [PATCH 44/56] Turn off traces --- icu4c/source/common/uniset_props.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 0de278955e74..65df7a005384 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -630,7 +630,7 @@ namespace { constexpr int32_t MAX_DEPTH = 100; -#define U_DEBUGGING_UNICODESET_PARSING 1 +#define U_DEBUGGING_UNICODESET_PARSING 0 #if U_DEBUGGING_UNICODESET_PARSING From e39c4d1e4ece506b8719e592c630fba46635545e Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 27 Aug 2025 14:39:21 +0200 Subject: [PATCH 45/56] ICU-23179 Test more edge cases when mapping syntax characters to sets --- icu4c/source/test/intltest/usettest.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 3a97c7a4db01..841e94cc8030 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ 
b/icu4c/source/test/intltest/usettest.cpp @@ -2044,6 +2044,9 @@ void UnicodeSetTest::TestLookupSymbolTable() { symbols.add(u'{', UnicodeSet(u"[{leftCurlyBracket}]", errorCode)); symbols.add(u'}', UnicodeSet(u"[{rightCurlyBracket}]", errorCode)); symbols.add(u'$', UnicodeSet(u"[{dollarSign}]", errorCode)); + symbols.add(u':', UnicodeSet(u"[{colon}]", errorCode)); + symbols.add(u'\\', UnicodeSet(u"[{reverseSolidus}]", errorCode)); + symbols.add(u'p', UnicodeSet(u"[{latinSmallLetterP}]", errorCode)); for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern, expectedLookups, variables] : std::vector{ {u"-", U_ZERO_ERROR, u"[{hyphenMinus}]", u"[{hyphenMinus}]"}, @@ -2051,11 +2054,15 @@ void UnicodeSetTest::TestLookupSymbolTable() { // The hyphen no longer works as set difference. {u"[0-1]", U_ZERO_ERROR, u"[[a-z][{hyphenMinus}][bc]]", u"[a-z{hyphenMinus}]"}, {u"[!-0]", U_ZERO_ERROR, u"[![{hyphenMinus}][a-z]]", u"[!a-z{hyphenMinus}]"}, + // An initial HYPHEN-MINUS is still treated as a literal '-', but a final one is treated + // as a set. + {u"[-1]", U_ZERO_ERROR, uR"([\-[bc]])", uR"([\-bc])"}, + {u"[1-]", U_ZERO_ERROR, u"[[bc][{hyphenMinus}]]", u"[bc{hyphenMinus}]"}, // String literals no longer work. {uR"([!-/{0}])", U_ZERO_ERROR, u"[![{hyphenMinus}]/[{leftCurlyBracket}][a-z][{rightCurlyBracket}]]", u"[!/a-z{hyphenMinus}{leftCurlyBracket}{rightCurlyBracket}]"}, - // The ampersand no longer works as set difference. + // The ampersand no longer works as set intersection. {uR"([ 2 & 1 ])", U_ZERO_ERROR, u"[[: Co :][{ampersand}][bc]]", u"[bc-󰀀-󿿽􀀀-􏿽{ampersand}]"}, // Complementing still works. @@ -2070,6 +2077,9 @@ void UnicodeSetTest::TestLookupSymbolTable() { {uR"([ \[ ])", U_ZERO_ERROR, uR"([[{leftSquareBracket}]])", uR"([{leftSquareBracket}])"}, // Anchors are gone. {uR"([$])", U_ZERO_ERROR, uR"([[{dollarSign}]])", uR"([{dollarSign}])"}, + // Property queries are unaffected. 
+ {u"[:Co:]", U_ZERO_ERROR, u"[:Co:]", u"[\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"}, + {uR"(\p{Co})", U_ZERO_ERROR, uR"(\p{Co})", u"[\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"}, }) { UnicodeString actual; UErrorCode errorCode = U_ZERO_ERROR; @@ -2094,6 +2104,7 @@ void UnicodeSetTest::TestLookupSymbolTable() { for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern, expectedLookups, variables] : std::vector{ {u"]", U_ZERO_ERROR, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"}, + {u"[:Co:]", U_ZERO_ERROR, u"[:Co:]", u"[\uE000-\uF8FF\U000F0000-\U000FFFFD\U00100000-\U0010FFFD]"}, {u"[]", U_MALFORMED_SET, u"[{rightSquareBracket}]", u"[{rightSquareBracket}]"}, }) { UnicodeString actual; From 93d92964cbea164d18a7844c32973dd29f331f8f Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 27 Aug 2025 14:49:10 +0200 Subject: [PATCH 46/56] Deal with the ambiguous - and ^ --- icu4c/source/common/uniset_props.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 65df7a005384..f44d89feffd1 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -355,6 +355,14 @@ class UnicodeSet::Lexer { return false; } + const bool acceptStandInWithSymbol(char16_t op) { + if (lookahead().standIn() != nullptr && lookahead().sourceText_ == std::u16string_view(&op, 1)) { + advance(); + return true; + } + return false; + } + const LexicalElement &lookahead() { if (!ahead_.has_value()) { const RuleCharacterIterator::Pos before = getPos(); @@ -761,9 +769,11 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer, // Where a stand-in may be a character or an escape. // Strings that would match stand-in effectively get removed from // all other terminals of the grammar, except [. 
+ // When mapped by the symbol table, whether ^ and - are treated as set operators depends on where + // in the grammar we are, hence `acceptStandInWithSymbol`. if (lexer.acceptSetOperator(u'[')) { prettyPrintedPattern.append(u'['); - if (lexer.acceptSetOperator(u'^')) { + if (lexer.acceptSetOperator(u'^') || lexer.acceptStandInWithSymbol(u'^')) { prettyPrintedPattern.append(u'^'); isComplement = true; } @@ -819,13 +829,15 @@ void UnicodeSet::parseUnion(Lexer &lexer, // | UnescapedHyphenMinus Terms UnescapedHyphenMinus // Terms ::= "" // | Terms Term - if (lexer.acceptSetOperator(u'-')) { + if (lexer.acceptSetOperator(u'-') || lexer.acceptStandInWithSymbol(u'-')) { add(u'-'); // When we otherwise preserve the syntax, we escape an initial UnescapedHyphenMinus, but not a // final one, for consistency with older ICU behaviour. rebuiltPat.append(u"\\-"); } while (!lexer.atEnd()) { + // Note that while a HYPHEN-MINUS mapped by the symbol table is treated as a literal at the + // beginning of the Union, it is treated as a set elsewhere, including at the end. if (lexer.acceptSetOperator(u'-')) { // We can be here on the first iteration: [--] is allowed by the // grammar and by the old parser. 
From 7940892ea27803038c5ec255174b2bb8fd13f06d Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 27 Aug 2025 14:53:53 +0200 Subject: [PATCH 47/56] Update sequence expectations --- icu4c/source/test/intltest/usettest.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 841e94cc8030..8e4c2832e826 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -1971,13 +1971,13 @@ void UnicodeSetTest::TestLookupSymbolTable() { U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]", - {u'0', u'-', u"one", u"one", u'1', u']'}, + {u'0', u'-', u"one", u'1', u']'}, {{u"zero", u"0"}, {u"one", u"1"}}}, {u"[$zero-$one]", U_ZERO_ERROR, u"[[a-z]-[bc]]", u"[ad-z]", - {u"zero", u"zero", u"zero", u"zero", u'0', u'-', u"one", u"one", u'1', u']'}, + {u"zero", u"zero", u'0', u'-', u"one", u'1', u']'}, {{u"zero", u"0"}, {u"one", u"1"}}}, // If the variable expands to multiple symbols, only the first one is sequenced right after // the variable lookup. @@ -1985,7 +1985,7 @@ void UnicodeSetTest::TestLookupSymbolTable() { U_ZERO_ERROR, u"[[bc][a-z]]", u"[a-z]", - {u"ten", u"ten", u"ten", u"ten", u'1', u'0', u']'}, + {u"ten", u"ten", u'1', u'0', u']'}, {{u"ten", u"10"}}}, // Substitution of lookupMatcher symbols takes place after unescaping. 
{uR"([!-\u0030])", U_MALFORMED_SET, u"[]", u"[]", {u'!', u'-', u'0'}}, From d3cc9eac5503ff58452095049a7eeed4efc9adc8 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 2 Sep 2025 11:06:20 +0200 Subject: [PATCH 48/56] warnings --- icu4c/source/common/unicode/uniset.h | 2 -- icu4c/source/common/uniset_props.cpp | 6 ++---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index 538eb264e974..fd0a89a413fe 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -1744,8 +1744,6 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { void parseElements(Lexer &lexer, UnicodeString &rebuiltPat, - UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), - int32_t depth, UErrorCode &ec); diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index f44d89feffd1..e88d09d867d0 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -347,7 +347,7 @@ class UnicodeSet::Lexer { pattern_.tempSubString(parsePosition_.getIndex(), 60); } - const bool acceptSetOperator(char16_t op) { + bool acceptSetOperator(char16_t op) { if (lookahead().isSetOperator(op)) { advance(); return true; @@ -355,7 +355,7 @@ class UnicodeSet::Lexer { return false; } - const bool acceptStandInWithSymbol(char16_t op) { + bool acceptStandInWithSymbol(char16_t op) { if (lookahead().standIn() != nullptr && lookahead().sourceText_ == std::u16string_view(&op, 1)) { advance(); return true; @@ -933,8 +933,6 @@ void UnicodeSet::parseRestriction(Lexer &lexer, void UnicodeSet::parseElements(Lexer &lexer, UnicodeString &rebuiltPat, - UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), - int32_t depth, UErrorCode &ec) { // Elements ::= Element // | Range From 3cfc4ae9a8f884ab06e81f2de961d5dd2c3bd989 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 2 Sep 2025 13:42:26 +0200 Subject: [PATCH 49/56] 
Clarify some comments --- icu4c/source/common/unicode/uniset.h | 8 +++++--- icu4c/source/common/uniset_props.cpp | 21 ++++++++++++++------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/icu4c/source/common/unicode/uniset.h b/icu4c/source/common/unicode/uniset.h index fd0a89a413fe..feacf399fb02 100644 --- a/icu4c/source/common/unicode/uniset.h +++ b/icu4c/source/common/unicode/uniset.h @@ -1705,10 +1705,12 @@ class U_COMMON_API UnicodeSet final : public UnicodeFilter { UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), UErrorCode &ec); - // Recursive descent parsing with no backtracking. These functions parse the syntactic categories + // Recursive-descent predictive parsing. These functions parse the syntactic categories // matching their name in the base grammar of PD UTR #56 (before the highlighted changes are - // applied). They add to *this the elements of the set that the parsed construct represents. - // https://www.unicode.org/reports/tr61/tr61-1.html#Set-Operations. + // applied). + // See https://www.unicode.org/reports/tr61/tr61-1.html#Set-Operations. + // `parseUnicodeSet` clears `*this` and makes it represent the parsed UnicodeSet; all other functions + // add the set represented by the parsed construct to `*this`. class Lexer; diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index e88d09d867d0..23bf162535c7 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -761,7 +761,6 @@ void UnicodeSet::parseUnicodeSet(Lexer &lexer, addAll(propertyQuery); preserveSyntaxInPattern = true; } else { - // TODO(egg): In PD UTS 61, add ^ to set-operator, remove [^. 
// UnicodeSet ::= [ Union ] // | Complement ::= [ ^ Union ] // Extension: @@ -890,17 +889,25 @@ void UnicodeSet::parseRestriction(Lexer &lexer, UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), int32_t depth, UErrorCode &ec) { - // Restriction ::= UnicodeSet - // | Intersection ::= Restriction & UnicodeSet - // | Difference ::= Restriction - UnicodeSet + // Parse a https://www.unicode.org/reports/tr61/#Restriction: + // Restriction ::= UnicodeSet + // | Intersection + // | Difference + // Intersection ::= Restriction & UnicodeSet + // Difference ::= Restriction - UnicodeSet + // or, rewritten to be LL, + // Restriction ::= UnicodeSet RightHandSide + // RightHandSide ::= "" + // | & UnicodeSet RightHandSide + // | - UnicodeSet RightHandSide // Start by parsing the first UnicodeSet. UnicodeSet leftHandSide; leftHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec); addAll(leftHandSide); U_UNICODESET_RETURN_IF_ERROR(ec); - // Now keep looking for an operator that would continue the Restriction. - // The loop terminates because when chars.atEnd(), op == DONE, so we go into the else branch and - // return. + // Now keep looking for an operator that would continue the RightHandSide. + // The loop terminates because when we run out of source text, the lookahead token will not be a set + // operator, so that we hit the else branch and return. 
for (;;) { if (lexer.acceptSetOperator(u'&')) { // Intersection ::= Restriction & UnicodeSet From 629bc8988006c65ffbe7deae7ebbb903cfb7c562 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 3 Sep 2025 02:25:07 +0200 Subject: [PATCH 50/56] more discursive comments --- icu4c/source/common/uniset_props.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 23bf162535c7..dc8f5a97e05e 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -896,10 +896,12 @@ void UnicodeSet::parseRestriction(Lexer &lexer, // Intersection ::= Restriction & UnicodeSet // Difference ::= Restriction - UnicodeSet // or, rewritten to be LL, - // Restriction ::= UnicodeSet RightHandSide - // RightHandSide ::= "" - // | & UnicodeSet RightHandSide - // | - UnicodeSet RightHandSide + // Restriction ::= UnicodeSet RightHandSides + // RightHandSides ::= "" + // | & UnicodeSet RightHandSides + // | - UnicodeSet RightHandSides + // but note that the tree resulting from this LL version is not an expression tree: the + // operations are left-associative. // Start by parsing the first UnicodeSet. 
UnicodeSet leftHandSide; leftHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec); From cace9d71c0f7028f6fc7918ec3ad4be19a60ffae Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 3 Sep 2025 04:45:14 +0200 Subject: [PATCH 51/56] make it compile --- icu4c/source/common/uniset_props.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index dc8f5a97e05e..162876e6e5bd 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -878,7 +878,7 @@ void UnicodeSet::parseTerm(Lexer &lexer, parseRestriction(lexer, rebuiltPat, options, caseClosure, depth, ec); U_UNICODESET_RETURN_IF_ERROR(ec); } else { - parseElements(lexer, rebuiltPat, caseClosure, depth, ec); + parseElements(lexer, rebuiltPat, ec); U_UNICODESET_RETURN_IF_ERROR(ec); } } From 03235934848cf31d1171632d5989227f1b4f120b Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 8 Sep 2025 13:38:16 +0200 Subject: [PATCH 52/56] libstdc++ dependencies --- icu4c/source/test/depstest/dependencies.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/icu4c/source/test/depstest/dependencies.txt b/icu4c/source/test/depstest/dependencies.txt index deced67cff80..0278f1476894 100644 --- a/icu4c/source/test/depstest/dependencies.txt +++ b/icu4c/source/test/depstest/dependencies.txt @@ -141,6 +141,10 @@ group: cplusplus # "Calls the current terminate handler." std::terminate() + # From std::array::at in libstdc++. Note that we never call std::array::at, only operator[] + # which is noexcept. + std::__throw_out_of_range_fmt(char const*, ...) + group: iostream "std::basic_ios >::clear(std::_Ios_Iostate)" "std::basic_ios >::eof() const" From bcb7ac0c022b99b4a72b83d8f422703399991c69 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 8 Sep 2025 19:10:49 +0200 Subject: [PATCH 53/56] quote? 
--- icu4c/source/test/depstest/dependencies.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/icu4c/source/test/depstest/dependencies.txt b/icu4c/source/test/depstest/dependencies.txt index 0278f1476894..c6f6f0c38ff2 100644 --- a/icu4c/source/test/depstest/dependencies.txt +++ b/icu4c/source/test/depstest/dependencies.txt @@ -143,7 +143,7 @@ group: cplusplus # From std::array::at in libstdc++. Note that we never call std::array::at, only operator[] # which is noexcept. - std::__throw_out_of_range_fmt(char const*, ...) + "std::__throw_out_of_range_fmt(char const*, ...)" group: iostream "std::basic_ios >::clear(std::_Ios_Iostate)" From 66cceeb6b024171270a94b367775b03cf8084972 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 9 Sep 2025 15:13:25 +0200 Subject: [PATCH 54/56] No infinite loops in the lexer --- icu4c/source/common/uniset_props.cpp | 8 +++---- icu4c/source/test/intltest/usettest.cpp | 31 +++++++++++++++++++------ 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 162876e6e5bd..13dcd71ca8ce 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -453,7 +453,7 @@ class UnicodeSet::Lexer { RuleCharacterIterator::SKIP_WHITESPACE), unusedEscaped, errorCode); if (third == u'{') { - while (!chars_.atEnd()) { + while (!chars_.atEnd() && U_SUCCESS(errorCode)) { UChar32 last = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | RuleCharacterIterator::SKIP_WHITESPACE), @@ -495,7 +495,7 @@ class UnicodeSet::Lexer { RuleCharacterIterator::SKIP_WHITESPACE), unusedEscaped, errorCode); - while (!chars_.atEnd()) { + while (!chars_.atEnd() && U_SUCCESS(errorCode)) { const RuleCharacterIterator::Pos beforeLast = getPos(); UChar32 lastUnescaped = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | @@ -509,7 +509,7 @@ class UnicodeSet::Lexer { unusedEscaped, errorCode); bool 
namedElementOK = false; if (namedElementOpening == u'{') { - while (!chars_.atEnd()) { + while (!chars_.atEnd() && U_SUCCESS(errorCode)) { UChar32 namedElementLast = chars_.next( charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | RuleCharacterIterator::SKIP_WHITESPACE), @@ -600,7 +600,7 @@ class UnicodeSet::Lexer { UBool escaped; UChar32 next; int32_t codePointCount = 0; - while (!chars_.atEnd()) { + while (!chars_.atEnd() && U_SUCCESS(errorCode)) { next = chars_.next(charsOptions_, escaped, errorCode); if (!escaped && next == u'}') { return LexicalElement( diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 8e4c2832e826..37417dd8d4f1 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -4715,6 +4715,19 @@ void UnicodeSetTest::TestToPatternOutput() { } void UnicodeSetTest::TestParseErrors() { + for (const auto expression : std::vector{ + uR"([\u])", + uR"([\x{}])", + uR"([\9])", + }) { + UErrorCode errorCode = U_ZERO_ERROR; + const UnicodeSet set(expression, errorCode); + if (errorCode != U_MALFORMED_UNICODE_ESCAPE) { + UnicodeString s; + errln(expression + u": Expected U_MALFORMED_UNICODE_ESCAPE, got " + u_errorName(errorCode) + + ", set is " + UnicodeSet(set).complement().complement().toPattern(s)); + } + } for (const auto expression : std::vector{ // Java error message: "Char expected after operator". u"[a-[b]]", @@ -4758,13 +4771,17 @@ void UnicodeSetTest::TestParseErrors() { } } for (const auto expression : std::vector{ - // Java error message: "Invalid property pattern". - u"[:]", - uR"(\p)" - u"[:^]", - uR"(\P)", - uR"(\N)", - }) { + // Java error message: "Invalid property pattern". 
+ u"[:]", + uR"(\p)" + u"[:^]", + uR"(\P)", + uR"(\N)", + uR"([\p{Some_Property=\u}])", + uR"([:Some_Property=\u:])", + uR"(\p{Some_Property=\N{SOME CHARACTER}})", + uR"([\N{}])", + }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode); if (errorCode != U_ILLEGAL_ARGUMENT_ERROR) { From f79b35c3ab559a6f57e9fb6574c14629d034dc5a Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 9 Sep 2025 15:57:08 +0200 Subject: [PATCH 55/56] That is well-formed --- icu4c/source/test/intltest/usettest.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 37417dd8d4f1..b34cd17985e7 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -4718,7 +4718,6 @@ void UnicodeSetTest::TestParseErrors() { for (const auto expression : std::vector{ uR"([\u])", uR"([\x{}])", - uR"([\9])", }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode); From 94cc56c97b109404f39b2a213fc24c6895102843 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Thu, 11 Sep 2025 14:31:11 +0200 Subject: [PATCH 56/56] dedent --- icu4c/source/test/intltest/usettest.cpp | 28 ++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index b34cd17985e7..052de84bf153 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -4716,9 +4716,9 @@ void UnicodeSetTest::TestToPatternOutput() { void UnicodeSetTest::TestParseErrors() { for (const auto expression : std::vector{ - uR"([\u])", - uR"([\x{}])", - }) { + uR"([\u])", + uR"([\x{}])", + }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode); if (errorCode != U_MALFORMED_UNICODE_ESCAPE) { @@ -4770,17 +4770,17 @@ void UnicodeSetTest::TestParseErrors() { } } for (const auto expression : std::vector{ - // 
Java error message: "Invalid property pattern". - u"[:]", - uR"(\p)" - u"[:^]", - uR"(\P)", - uR"(\N)", - uR"([\p{Some_Property=\u}])", - uR"([:Some_Property=\u:])", - uR"(\p{Some_Property=\N{SOME CHARACTER}})", - uR"([\N{}])", - }) { + // Java error message: "Invalid property pattern". + u"[:]", + uR"(\p)" + u"[:^]", + uR"(\P)", + uR"(\N)", + uR"([\p{Some_Property=\u}])", + uR"([:Some_Property=\u:])", + uR"(\p{Some_Property=\N{SOME CHARACTER}})", + uR"([\N{}])", + }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode); if (errorCode != U_ILLEGAL_ARGUMENT_ERROR) {