@@ -1962,42 +1962,42 @@ void UnicodeSetTest::TestLookupSymbolTable() {
19621962 for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
19631963 expectedLookups, variables] : std::vector<TestCase>{
19641964 {u" 0" , U_ZERO_ERROR, u" [a-z]" , u" [a-z]" , {u' 0' }},
1965- {u" [0-1]" , U_ZERO_ERROR, u" [[a-z]-[bc]]" , u" [ad-z]" , {u' 0' , u' - ' , u ' 1 ' , u ' ] ' }},
1966- {u" [!-0]" , U_MALFORMED_SET, u" []" , u" []" , {u' !' , u' - ' , u ' 0' }},
1965+ {u" [0-1]" , U_ZERO_ERROR, u" [[a-z]-[bc]]" , u" [ad-z]" , {u' 0' , u' 1 ' }},
1966+ {u" [!-0]" , U_MALFORMED_SET, u" []" , u" []" , {u' !' , u' 0' }},
19671967 // A call to lookupMatcher with the first character of the content of a variable happens
19681968 // immediately after a corresponding call to lookup, although we may look up the variable
19691969 // several times before we call lookupMatcher.
19701970 {u" [0-$one]" ,
19711971 U_ZERO_ERROR,
19721972 u" [[a-z]-[bc]]" ,
19731973 u" [ad-z]" ,
1974- {u' 0' , u' - ' , u " one" , u' 1' , u ' ] ' },
1974+ {u' 0' , u" one" , u' 1' },
19751975 {{u" zero" , u" 0" }, {u" one" , u" 1" }}},
19761976 {u" [$zero-$one]" ,
19771977 U_ZERO_ERROR,
19781978 u" [[a-z]-[bc]]" ,
19791979 u" [ad-z]" ,
1980- {u" zero" , u" zero" , u' 0' , u' - ' , u " one" , u' 1' , u ' ] ' },
1980+ {u" zero" , u" zero" , u' 0' , u" one" , u' 1' },
19811981 {{u" zero" , u" 0" }, {u" one" , u" 1" }}},
19821982 // If the variable expands to multiple symbols, only the first one is sequenced right after
19831983 // the variable lookup.
19841984 {u" [$ten]" ,
19851985 U_ZERO_ERROR,
19861986 u" [[bc][a-z]]" ,
19871987 u" [a-z]" ,
1988- {u" ten" , u" ten" , u' 1' , u' 0' , u ' ] ' },
1988+ {u" ten" , u" ten" , u' 1' , u' 0' },
19891989 {{u" ten" , u" 10" }}},
19901990 // Substitution of lookupMatcher symbols takes place after unescaping.
1991- {uR"( [!-\u0030])" , U_MALFORMED_SET, u" []" , u" []" , {u' !' , u' - ' , u ' 0' }},
1991+ {uR"( [!-\u0030])" , U_MALFORMED_SET, u" []" , u" []" , {u' !' , u' 0' }},
19921992 // It does not take place in string literals.
1993- {uR"( [!-/{0}])" , U_ZERO_ERROR, u" [!-0]" , u" [!-0]" , {u' !' , u' - ' , u ' / ' , u ' { ' , u ' ] ' }},
1994- {uR"( [ 2 & 1 ])" , U_ZERO_ERROR, u" [[: Co :]&[bc]]" , u" []" , {u' 2' , u' & ' , u ' 1 ' , u ' ] ' }},
1993+ {uR"( [!-/{0}])" , U_ZERO_ERROR, u" [!-0]" , u" [!-0]" , {u' !' , u' / ' }},
1994+ {uR"( [ 2 & 1 ])" , U_ZERO_ERROR, u" [[: Co :]&[bc]]" , u" []" , {u' 2' , u' 1 ' }},
19951995 {uR"( [ 21 ])" ,
19961996 U_ZERO_ERROR,
19971997 u" [[: Co :][bc]]" ,
19981998 u" [bc\uE000 -\uF8FF\U000F0000 -\U000FFFFD\U00100000 -\U0010FFFD ]" ,
1999- {u' 2' , u' 1' , u ' ] ' }},
2000- {u" [ a-b 1 ]" , U_ZERO_ERROR, u" [a-b[bc]]" , u" [a-c]" , {u' a' , u' - ' , u ' b' , u' 1' , u ' ] ' }},
1999+ {u' 2' , u' 1' }},
2000+ {u" [ a-b 1 ]" , U_ZERO_ERROR, u" [a-b[bc]]" , u" [a-c]" , {u' a' , u' b' , u' 1' }},
20012001 }) {
20022002 symbols.setVariables (variables);
20032003 symbols.clearLookupTrace ();
@@ -2033,13 +2033,12 @@ void UnicodeSetTest::TestLookupSymbolTable() {
20332033 errln (u" Unexpected sequence of lookups:\n Expected : " + expected + " \n Actual : " + actual);
20342034 }
20352035 }
2036- // Test what happens when we define syntax characters as symbols. It is an extraordinarily bad idea
2037- // to rely on this behaviour, but since it has been around since ICU 2.8, we probably should not
2038- // change it unknowingly.
2036+ // Defining syntax characters as symbols has no effect on syntax.
20392037 symbols.add (u' -' , UnicodeSet (u" [{hyphenMinus}]" , errorCode));
20402038 symbols.add (u' &' , UnicodeSet (u" [{ampersand}]" , errorCode));
20412039 // This one is never used, except if escaped.
20422040 symbols.add (u' [' , UnicodeSet (u" [{leftSquareBracket}]" , errorCode));
2041+ symbols.add (u' ]' , UnicodeSet (u" [{rightSquareBracket}]" , errorCode));
20432042 symbols.add (u' ^' , UnicodeSet (u" [{circumflexAccent}]" , errorCode));
20442043 symbols.add (u' {' , UnicodeSet (u" [{leftCurlyBracket}]" , errorCode));
20452044 symbols.add (u' }' , UnicodeSet (u" [{rightCurlyBracket}]" , errorCode));
@@ -2049,35 +2048,22 @@ void UnicodeSetTest::TestLookupSymbolTable() {
20492048 symbols.add (u' p' , UnicodeSet (u" [{latinSmallLetterP}]" , errorCode));
20502049 for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
20512050 expectedLookups, variables] : std::vector<TestCase>{
2052- {u" -" , U_ZERO_ERROR , u" [{hyphenMinus} ]" , u" [{hyphenMinus} ]" },
2051+ {u" -" , U_MALFORMED_SET , u" []" , u" []" },
20532052 {u" 0" , U_ZERO_ERROR, u" [a-z]" , u" [a-z]" },
2054- // The hyphen no longer works as set difference.
2055- {u" [0-1]" , U_ZERO_ERROR, u" [[a-z][{hyphenMinus}][bc]]" , u" [a-z{hyphenMinus}]" },
2056- {u" [!-0]" , U_ZERO_ERROR, u" [![{hyphenMinus}][a-z]]" , u" [!a-z{hyphenMinus}]" },
2057- // An initial HYPHEN-MINUS is still treated as a literal '-', but a final one is treated
2058- // as a set.
2053+ {u" [0-1]" , U_ZERO_ERROR, u" [[a-z]-[bc]]" , u" [ad-z]" },
2054+ {u" [!-0]" , U_MALFORMED_SET, u" []" , u" []" },
20592055 {u" [-1]" , U_ZERO_ERROR, uR"( [\-[bc]])" , uR"( [\-bc])" },
2060- {u" [1-]" , U_ZERO_ERROR, u" [[bc][{hyphenMinus}]]" , u" [bc{hyphenMinus}]" },
2061- // String literals no longer work.
2062- {uR"( [!-/{0}])" , U_ZERO_ERROR,
2063- u" [![{hyphenMinus}]/[{leftCurlyBracket}][a-z][{rightCurlyBracket}]]" ,
2064- u" [!/a-z{hyphenMinus}{leftCurlyBracket}{rightCurlyBracket}]" },
2065- // The ampersand no longer works as set intersection.
2066- {uR"( [ 2 & 1 ])" , U_ZERO_ERROR, u" [[: Co :][{ampersand}][bc]]" ,
2067- u" [bc---{ampersand}]" },
2068- // Complementing still works.
2056+ {u" [1-]" , U_ZERO_ERROR, u" [[bc]-]" , uR"( [\-bc])" },
2057+ {uR"( [!-/{0}])" , U_ZERO_ERROR, u" [!-0]" , u" [!-0]" },
2058+ {uR"( [ 2 & 1 ])" , U_ZERO_ERROR, u" [[: Co :]&[bc]]" , u" []" },
20692059 {uR"( [^ \u0000 ])" , U_ZERO_ERROR, uR"( [\u0001-\U0010FFFF])" ,
2070- uR"( [\u0001-\U0010FFFF])" },
2071- // ^ elsewhere becomes a symbol rather than a syntax error.
2072- {uR"( [\u0000 ^ -])" , U_ZERO_ERROR, uR"( [\u0000[{circumflexAccent}][{hyphenMinus}]])" ,
2073- uR"( [\u0000{circumflexAccent}{hyphenMinus}])" },
2074- // Opening brackets still work.
2075- {uR"( [^ [ [^] ] ])" , U_ZERO_ERROR, uR"( [^[[\u0000-\U0010FFFF]]])" , uR"( [])" },
2076- // The only way to access the [ symbol is via escaping.
2060+ uR"( [\u0001-\U0010FFFF])" },
2061+ {uR"( [\u0000 ^ -])" , U_MALFORMED_SET, uR"( [\u0000])" , uR"( [\u0000])" },
2062+ {uR"( [^ [ [^] ] ])" , U_ZERO_ERROR, uR"( [^[[\u0000-\U0010FFFF]]])" , u" []" },
2063+ // An escape can access any mapped character, even if the unescaped
2064+ // character would be syntax.
20772065 {uR"( [ \[ ])" , U_ZERO_ERROR, uR"( [[{leftSquareBracket}]])" , uR"( [{leftSquareBracket}])" },
2078- // Anchors are gone.
2079- {uR"( [$])" , U_ZERO_ERROR, uR"( [[{dollarSign}]])" , uR"( [{dollarSign}])" },
2080- // Property queries are unaffected.
2066+ {uR"( [$])" , U_ZERO_ERROR, uR"( [$])" , uR"( [\uFFFF])" },
20812067 {u" [:Co:]" , U_ZERO_ERROR, u" [:Co:]" , u" [\uE000 -\uF8FF\U000F0000 -\U000FFFFD\U00100000 -\U0010FFFD ]" },
20822068 {uR"( \p{Co})" , U_ZERO_ERROR, uR"( \p{Co})" , u" [\uE000 -\uF8FF\U000F0000 -\U000FFFFD\U00100000 -\U0010FFFD ]" },
20832069 }) {
@@ -2098,32 +2084,6 @@ void UnicodeSetTest::TestLookupSymbolTable() {
20982084 " , got " + actual);
20992085 }
21002086 }
2101- // If ] is defined as a symbol, everything breaks except a lone symbol or property-query, and the
2102- // constructor returns an error but not an empty set. Don’t do that.
2103- symbols.add (u' ]' , UnicodeSet (u" [{rightSquareBracket}]" , errorCode));
2104- for (const auto &[expression, expectedErrorCode, expectedPattern, expectedRegeneratedPattern,
2105- expectedLookups, variables] : std::vector<TestCase>{
2106- {u" ]" , U_ZERO_ERROR, u" [{rightSquareBracket}]" , u" [{rightSquareBracket}]" },
2107- {u" [:Co:]" , U_ZERO_ERROR, u" [:Co:]" , u" [\uE000 -\uF8FF\U000F0000 -\U000FFFFD\U00100000 -\U0010FFFD ]" },
2108- {u" []" , U_MALFORMED_SET, u" [{rightSquareBracket}]" , u" [{rightSquareBracket}]" },
2109- }) {
2110- UnicodeString actual;
2111- UErrorCode errorCode = U_ZERO_ERROR;
2112- const UnicodeSet set (expression, USET_IGNORE_SPACE, &symbols, errorCode);
2113- if (errorCode != expectedErrorCode) {
2114- errln (u" Parsing " + expression + u" : Expected " + u_errorName (expectedErrorCode) + " , got " +
2115- u_errorName (errorCode));
2116- }
2117- if (set.toPattern (actual) != expectedPattern) {
2118- errln (u" UnicodeSet(R\" (" + expression + u" )\" ).toPattern() expected " + expectedPattern +
2119- " , got " + actual);
2120- }
2121- if (UnicodeSet (set).complement ().complement ().toPattern (actual) != expectedRegeneratedPattern) {
2122- errln (u" UnicodeSet(R\" (" + expression +
2123- u" )\" ).complement().complement().toPattern() expected " + expectedRegeneratedPattern +
2124- " , got " + actual);
2125- }
2126- }
21272087#pragma GCC diagnostic pop
21282088}
21292089
0 commit comments