unicode-org · eggrobin · Jan 21, 2026 · Jan 9, 2026 · Jan 16, 2026 · Jan 19, 2026
diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
@@ -37,6 +37,7 @@
 #include "util.h"
 #include "uvector.h"
 #include "uprops.h"
+#include "patternprops.h"
 #include "propname.h"
 #include "normalizer2impl.h"
 #include "uinvchar.h"
@@ -538,9 +539,9 @@ class UnicodeSet::Lexer {
             UChar32 next;
             int32_t codePointCount = 0;
             while (!chars_.atEnd() && U_SUCCESS(errorCode)) {
-                // TODO(egg): Propose making this space-sensitive.
                 const RuleCharacterIterator::Pos beforeNext = getPos();
-                next = chars_.next(charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES,
+                next = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
+                                                     RuleCharacterIterator::SKIP_WHITESPACE),
                                    unusedEscaped, errorCode);
                 if (next == u'\\') {
                     if (chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
@@ -554,7 +555,17 @@ class UnicodeSet::Lexer {
                         next = chars_.next(charsOptions_, escaped, errorCode);
                     }
                 } else {
-                  escaped = false;
+#if U_ICU_VERSION_MAJOR_NUM < 81
+                    if (U_SUCCESS(errorCode) && PatternProps::isWhiteSpace(next)) {
+                        // Transitional prohibition of unescaped spaces in string literals (in
+                        // ICU 78 and earlier, these were ignored; in ICU 81 they will mean
+                        // themselves).
+                        errorCode = UErrorCode::U_ILLEGAL_ARGUMENT_ERROR;
+                    }
+#else
+#error Remove this transitional check, see ICU-23307 and ICU-TC minutes of 2026-01-16.
+#endif
+                    escaped = false;
                 }
                 if (!escaped && next == u'}') {
                     return LexicalElement(

diff --git a/icu4c/source/data/translit/Latin_NumericPinyin.txt b/icu4c/source/data/translit/Latin_NumericPinyin.txt
@@ -22,7 +22,7 @@
 $tone = [\u0304\u0301\u030C\u0300\u0306] ;
 # Move the tone to the end of a syllable, and convert to number
 e {($tone) r} → r &Pinyin-NumericPinyin($1);
-($tone) ( [i o n u {o n} {n g}]) → $2 &Pinyin-NumericPinyin($1);
+($tone) ( [i o n u {on} {ng}]) → $2 &Pinyin-NumericPinyin($1);
 ($tone) → &Pinyin-NumericPinyin($1);
 # The following backs up until it finds the right vowel, then deposits the tone
 $vowel = [aAeEiIoOuU {u\u0308} {U\u0308} vV];

diff --git a/icu4c/source/data/translit/blt_blt_FONIPA.txt b/icu4c/source/data/translit/blt_blt_FONIPA.txt
@@ -42,7 +42,7 @@ $C = [$LO $HI];
 $V1 = [ꪵ ꪶ ꪹ ꪻ ꪼ];  # vowels written before consonant
 $V2 = [ \uAAB0 \uAAB2 \uAAB3 \uAAB4 \uAAB7 \uAAB8 \uAABE];  # vowels written above or below consonant
 $V3 = [ꪱ ꪮ ꪺ ꪽ];  # vowels written after consonant
-$DIGRAPHS = [{ꪹ  \uAAB8} {ꪹ  \uAAB7} {ꪹ ꪱ}];
+$DIGRAPHS = [{ꪹ\uAAB8} {ꪹ\uAAB7} {ꪹꪱ}];
 $V12 = [$V1 $V2 $DIGRAPHS];
 $V123 = [$V12 $V3];
 $W = [ꪫ];  # labialization marker

diff --git a/icu4c/source/data/translit/de_ASCII.txt b/icu4c/source/data/translit/de_ASCII.txt
@@ -6,12 +6,12 @@
 # Generated from CLDR
 #
 
-$AE = [Ä {A \u0308}];
-$OE = [Ö {O \u0308}];
-$UE = [Ü {U \u0308}];
-[ä {a \u0308}] → ae;
-[ö {o \u0308}] → oe;
-[ü {u \u0308}] → ue;
+$AE = [Ä {A\u0308}];
+$OE = [Ö {O\u0308}];
+$UE = [Ü {U\u0308}];
+[ä {a\u0308}] → ae;
+[ö {o\u0308}] → oe;
+[ü {u\u0308}] → ue;
 {$AE} [:Lowercase:] → Ae;
 {$OE} [:Lowercase:] → Oe;
 {$UE} [:Lowercase:] → Ue;

diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
@@ -4633,8 +4633,6 @@ void UnicodeSetTest::TestToPatternOutput() {
             {u"[ - - ]", uR"([\-])"},
             {u"[ - _ - ]", uR"([\-_])"},
             {u"[ - + - ]", uR"([+\-])"},
-            {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"},
-            {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"},
             {u"[$d-za-c]", uR"([\$a-z])"},
             {u"[a-c$d-z]", uR"([\$a-z])"},
             {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"},
@@ -4658,11 +4656,6 @@ void UnicodeSetTest::TestToPatternOutput() {
             {u"[^[c]]", uR"([^[c]])"},
             {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
             {u"[$[]]", uR"([\$[]])"},
-            // Spaces are eliminated within a string-literal even when the syntax is preserved.
-            {u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"},
-            // Escapes are removed even when the syntax is preserved.
-            {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
-            u"[{Zeichenkette}[]Zeichenmenge]"},
             // In ICU 78 and earlier, a named-element was a nested set, so it was preserved and
             // caused the syntax to be preserved.  Now it is treated like an escape.
             {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([Zceg-imn])"},
@@ -4744,6 +4737,13 @@ void UnicodeSetTest::TestParseErrors() {
             // TODO(egg): Well-formed in Java, ill-formed in ICU4C in ICU 78 and earlier.
             u"[a-{z}]",
             u"[{a}-z]",
+            // Unescaped spaces in a string literal were ignored in ICU 78 and earlier
+            // (so these were well-formed), are ill-formed in ICU 79 and 80, and will
+            // mean themselves in ICU 81 and later.
+            u"[ { Z e i c h e n k e t t e } Zeichenmenge ]",
+            uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])",
+            u"[ { Z e i c h e n k e t t e } [] Zeichenmenge ]",
+            uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
         }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);