ICU-23307 Transitionally forbid unescaped spaces in string literals

eggrobin · eggrobin · commit 5cf7fe0383cd · 2026-01-21T11:42:53.000+01:00
diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
@@ -37,6 +37,7 @@
 #include "util.h"
 #include "uvector.h"
 #include "uprops.h"
+#include "patternprops.h"
 #include "propname.h"
 #include "normalizer2impl.h"
 #include "uinvchar.h"
@@ -563,7 +564,17 @@ class UnicodeSet::Lexer {
                         next = chars_.next(charsOptions_, escaped, errorCode);
                     }
                 } else {
-                  escaped = false;
+#if U_ICU_VERSION_MAJOR_NUM < 81
+                    if (U_SUCCESS(errorCode) && PatternProps::isWhiteSpace(next)) {
+                        // Transitionally reject unescaped pattern white space in string
+                        // literals: ICU 78 and earlier ignored it; from ICU 81 on it will
+                        // stand for itself.  See ICU-23307.
+                        errorCode = UErrorCode::U_ILLEGAL_ARGUMENT_ERROR;
+                    }
+#else
+#error Remove this transitional check, see ICU-23307 and ICU-TC minutes of 2026-01-16.
+#endif
+                    escaped = false;
                 }
                 if (!escaped && next == u'}') {
                     return LexicalElement(
diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
@@ -4638,8 +4638,6 @@ void UnicodeSetTest::TestToPatternOutput() {
             {u"[ - - ]", uR"([\-])"},
             {u"[ - _ - ]", uR"([\-_])"},
             {u"[ - + - ]", uR"([+\-])"},
-            {u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", uR"([Zceg-imn{\ Z\ e\ i\ c\ h\ e\ n\ k\ e\ t\ t\ e\ }])"},
-            {uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", uR"([Zceg-imn{\ Z\ e\ i\ c\ h\ e\ n\ k\ e\ t\ t\ e\ }])"},
             {u"[$d-za-c]", uR"([\$a-z])"},
             {u"[a-c$d-z]", uR"([\$a-z])"},
             {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"},
@@ -4663,11 +4661,6 @@ void UnicodeSetTest::TestToPatternOutput() {
             {u"[^[c]]", uR"([^[c]])"},
             {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
             {u"[$[]]", uR"([\$[]])"},
-            // Spaces are eliminated within a string-literal even when the syntax is preserved.
-            {u"[ { Z e i c h e n k e t t e } [] Zeichenmenge ]", uR"([{\ Z\ e\ i\ c\ h\ e\ n\ k\ e\ t\ t\ e\ }[]Zeichenmenge])"},
-            // Escapes are removed even when the syntax is preserved.
-            {uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
-            uR"([{\ Z\ e\ i\ c\ h\ e\ n\ k\ e\ t\ t\ e\ }[]Zeichenmenge])"},
             // In ICU 78 and earlier, a named-element was a nested set, so it was preserved and
             // caused the syntax to be preserved.  Now it is treated like an escape.
             {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([Zceg-imn])"},
@@ -4753,6 +4746,13 @@ void UnicodeSetTest::TestParseErrors() {
             // TODO(egg): Well-formed in Java, ill-formed in ICU4C in ICU 78 and earlier.
             u"[a-{z}]",
             u"[{a}-z]",
+            // Well-formed in ICU 78 and earlier (spaces ignored).
+            // In ICU 81 and later, the spaces will mean spaces.
+            // Ill-formed in ICU 79 and 80.
+            u"[ { Z e i c h e n k e t t e } Zeichenmenge ]",
+            uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])",
+            u"[ { Z e i c h e n k e t t e } [] Zeichenmenge ]",
+            uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
         }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);