Skip to content

Commit 5cf7fe0

Browse files
committed
ICU-23307 Transitionally forbid unescaped spaces in string literals
1 parent afa04f1 commit 5cf7fe0

File tree

2 files changed: 19 additions and 8 deletions.

icu4c/source/common/uniset_props.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
 #include "util.h"
 #include "uvector.h"
 #include "uprops.h"
+#include "patternprops.h"
 #include "propname.h"
 #include "normalizer2impl.h"
 #include "uinvchar.h"
@@ -563,7 +564,17 @@ class UnicodeSet::Lexer {
 next = chars_.next(charsOptions_, escaped, errorCode);
 }
 } else {
-escaped = false;
+#if U_ICU_VERSION_MAJOR_NUM < 81
+if (U_SUCCESS(errorCode) && PatternProps::isWhiteSpace(next)) {
+// Transitional prohibition of unescaped spaces in string literals (in
+// ICU 78 and earlier, these were ignored; in ICU 81 they will mean
+// themselves).
+errorCode = UErrorCode::U_ILLEGAL_ARGUMENT_ERROR;
+}
+#else
+#error Remove this transitional check, see ICU-23307 and ICU-TC minutes of 2026-01-16.
+#endif
+escaped = false;
 }
 if (!escaped && next == u'}') {
 return LexicalElement(

icu4c/source/test/intltest/usettest.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4638,8 +4638,6 @@ void UnicodeSetTest::TestToPatternOutput() {
 {u"[ - - ]", uR"([\-])"},
 {u"[ - _ - ]", uR"([\-_])"},
 {u"[ - + - ]", uR"([+\-])"},
-{u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", uR"([Zceg-imn{\ Z\ e\ i\ c\ h\ e\ n\ k\ e\ t\ t\ e\ }])"},
-{uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", uR"([Zceg-imn{\ Z\ e\ i\ c\ h\ e\ n\ k\ e\ t\ t\ e\ }])"},
 {u"[$d-za-c]", uR"([\$a-z])"},
 {u"[a-c$d-z]", uR"([\$a-z])"},
 {uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"},
@@ -4663,11 +4661,6 @@ void UnicodeSetTest::TestToPatternOutput() {
 {u"[^[c]]", uR"([^[c]])"},
 {uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
 {u"[$[]]", uR"([\$[]])"},
-// Spaces are eliminated within a string-literal even when the syntax is preserved.
-{u"[ { Z e i c h e n k e t t e } [] Zeichenmenge ]", uR"([{\ Z\ e\ i\ c\ h\ e\ n\ k\ e\ t\ t\ e\ }[]Zeichenmenge])"},
-// Escapes are removed even when the syntax is preserved.
-{uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
-uR"([{\ Z\ e\ i\ c\ h\ e\ n\ k\ e\ t\ t\ e\ }[]Zeichenmenge])"},
 // In ICU 78 and earlier, a named-element was a nested set, so it was preserved and
 // caused the syntax to be preserved. Now it is treated like an escape.
 {uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([Zceg-imn])"},
@@ -4753,6 +4746,13 @@ void UnicodeSetTest::TestParseErrors() {
 // TODO(egg): Well-formed in Java, ill-formed in ICU4C in ICU 78 and earlier.
 u"[a-{z}]",
 u"[{a}-z]",
+// Well-formed in ICU 78 and earlier (spaces ignored).
+// In ICU 81 and later, the spaces will mean spaces.
+// Ill-formed in ICU 79 and 80.
+u"[ { Z e i c h e n k e t t e } Zeichenmenge ]",
+uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])",
+u"[ { Z e i c h e n k e t t e } [] Zeichenmenge ]",
+uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
 }) {
 UErrorCode errorCode = U_ZERO_ERROR;
 const UnicodeSet set(expression, errorCode);

0 commit comments

Comments
 (0)