diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 570a81f36a32..2a1edfa47cb1 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -37,6 +37,7 @@ #include "util.h" #include "uvector.h" #include "uprops.h" +#include "patternprops.h" #include "propname.h" #include "normalizer2impl.h" #include "uinvchar.h" @@ -538,9 +539,9 @@ class UnicodeSet::Lexer { UChar32 next; int32_t codePointCount = 0; while (!chars_.atEnd() && U_SUCCESS(errorCode)) { - // TODO(egg): Propose making this space-sensitive. const RuleCharacterIterator::Pos beforeNext = getPos(); - next = chars_.next(charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES, + next = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | + RuleCharacterIterator::SKIP_WHITESPACE), unusedEscaped, errorCode); if (next == u'\\') { if (chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | @@ -554,7 +555,17 @@ class UnicodeSet::Lexer { next = chars_.next(charsOptions_, escaped, errorCode); } } else { - escaped = false; +#if U_ICU_VERSION_MAJOR_NUM < 81 + if (U_SUCCESS(errorCode) && PatternProps::isWhiteSpace(next)) { + // Transitional prohibition of unescaped spaces in string literals (in + // ICU 78 and earlier, these were ignored; in ICU 81 they will mean + // themselves). + errorCode = UErrorCode::U_ILLEGAL_ARGUMENT_ERROR; + } +#else +#error Remove this transitional check, see ICU-23307 and ICU-TC minutes of 2026-01-16. +#endif + escaped = false; } if (!escaped && next == u'}') { return LexicalElement( diff --git a/icu4c/source/data/translit/Latin_NumericPinyin.txt b/icu4c/source/data/translit/Latin_NumericPinyin.txt index a78bb564655b..a0eabd8e4980 100644 --- a/icu4c/source/data/translit/Latin_NumericPinyin.txt +++ b/icu4c/source/data/translit/Latin_NumericPinyin.txt @@ -22,7 +22,7 @@ $tone = [\u0304\u0301\u030C\u0300\u0306] ; # Move the tone to the end of a syllable, and convert to number e {($tone) r} → r &Pinyin-NumericPinyin($1); -($tone) ( [i o n u {o n} {n g}]) → $2 &Pinyin-NumericPinyin($1); +($tone) ( [i o n u {on} {ng}]) → $2 &Pinyin-NumericPinyin($1); ($tone) → &Pinyin-NumericPinyin($1); # The following backs up until it finds the right vowel, then deposits the tone $vowel = [aAeEiIoOuU {u\u0308} {U\u0308} vV]; diff --git a/icu4c/source/data/translit/blt_blt_FONIPA.txt b/icu4c/source/data/translit/blt_blt_FONIPA.txt index e9e30d709534..c4fef545e32f 100644 --- a/icu4c/source/data/translit/blt_blt_FONIPA.txt +++ b/icu4c/source/data/translit/blt_blt_FONIPA.txt @@ -42,7 +42,7 @@ $C = [$LO $HI]; $V1 = [ꪵ ꪶ ꪹ ꪻ ꪼ]; # vowels written before consonant $V2 = [ \uAAB0 \uAAB2 \uAAB3 \uAAB4 \uAAB7 \uAAB8 \uAABE]; # vowels written above or below consonant $V3 = [ꪱ ꪮ ꪺ ꪽ]; # vowels written after consonant -$DIGRAPHS = [{ꪹ \uAAB8} {ꪹ \uAAB7} {ꪹ ꪱ}]; +$DIGRAPHS = [{ꪹ\uAAB8} {ꪹ\uAAB7} {ꪹꪱ}]; $V12 = [$V1 $V2 $DIGRAPHS]; $V123 = [$V12 $V3]; $W = [ꪫ]; # labialization marker diff --git a/icu4c/source/data/translit/de_ASCII.txt b/icu4c/source/data/translit/de_ASCII.txt index 39ba00e92752..dbf31c437318 100644 --- a/icu4c/source/data/translit/de_ASCII.txt +++ b/icu4c/source/data/translit/de_ASCII.txt @@ -6,12 +6,12 @@ # Generated from CLDR # -$AE = [Ä {A \u0308}]; -$OE = [Ö {O \u0308}]; -$UE = [Ü {U \u0308}]; -[ä {a \u0308}] → ae; -[ö {o \u0308}] → oe; -[ü {u \u0308}] → ue; +$AE = [Ä {A\u0308}]; +$OE = [Ö {O\u0308}]; +$UE = [Ü {U\u0308}]; +[ä {a\u0308}] → ae; +[ö {o\u0308}] → oe; +[ü {u\u0308}] → ue; {$AE} [:Lowercase:] → Ae; {$OE} [:Lowercase:] → Oe; {$UE} [:Lowercase:] → Ue; diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index cfe544fcd40c..81e16bd42259 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -4633,8 +4633,6 @@ void UnicodeSetTest::TestToPatternOutput() { {u"[ - - ]", uR"([\-])"}, {u"[ - _ - ]", uR"([\-_])"}, {u"[ - + - ]", uR"([+\-])"}, - {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"}, - {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"}, {u"[$d-za-c]", uR"([\$a-z])"}, {u"[a-c$d-z]", uR"([\$a-z])"}, {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"}, @@ -4658,11 +4656,6 @@ void UnicodeSetTest::TestToPatternOutput() { {u"[^[c]]", uR"([^[c]])"}, {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"}, {u"[$[]]", uR"([\$[]])"}, - // Spaces are eliminated within a string-literal even when the syntax is preserved. - {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"}, - // Escapes are removed even when the syntax is preserved. - {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])", - u"[{Zeichenkette}[]Zeichenmenge]"}, // In ICU 78 and earlier, a named-element was a nested set, so it was preserved and // caused the syntax to be preserved. Now it is treated like an escape. {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([Zceg-imn])"}, @@ -4744,6 +4737,13 @@ void UnicodeSetTest::TestParseErrors() { // TODO(egg): Well-formed in Java, ill-formed in ICU4C in ICU 78 and earlier. u"[a-{z}]", u"[{a}-z]", + // Well-formed in ICU 78 and earlier (spaces ignored). + // In ICU 81 and later, the spaces will mean spaces. + // Ill-formed in ICU 79 and 80. + u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", + uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", + u"[ { Z e i c h e n k e t t e } [] Zeichenmenge ]", + uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])", }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode);