Skip to content

Commit 0b09ebb

Browse files
committed
ICU-23307 Transitionally forbid unescaped spaces in string literals
1 parent 4e9c96d commit 0b09ebb

File tree

2 files changed

19 additions & 8 deletions

2 files changed

19 additions & 8 deletions

icu4c/source/common/uniset_props.cpp

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@
3737
#include "util.h"
3838
#include "uvector.h"
3939
#include "uprops.h"
40+
#include "patternprops.h"
4041
#include "propname.h"
4142
#include "normalizer2impl.h"
4243
#include "uinvchar.h"
@@ -554,7 +555,17 @@ class UnicodeSet::Lexer {
554555
next = chars_.next(charsOptions_, escaped, errorCode);
555556
}
556557
} else {
557-
escaped = false;
558+
#if U_ICU_VERSION_MAJOR_NUM < 81
559+
if (U_SUCCESS(errorCode) && PatternProps::isWhiteSpace(next)) {
560+
// Transitional prohibition of unescaped spaces in string literals (in
561+
// ICU 78 and earlier, these were ignored; in ICU 81 they will mean
562+
// themselves).
563+
errorCode = UErrorCode::U_ILLEGAL_ARGUMENT_ERROR;
564+
}
565+
#else
566+
#error Remove this transitional check, see ICU-23307 and ICU-TC minutes of 2026-01-16.
567+
#endif
568+
escaped = false;
558569
}
559570
if (!escaped && next == u'}') {
560571
return LexicalElement(

icu4c/source/test/intltest/usettest.cpp

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -4633,8 +4633,6 @@ void UnicodeSetTest::TestToPatternOutput() {
46334633
{u"[ - - ]", uR"([\-])"},
46344634
{u"[ - _ - ]", uR"([\-_])"},
46354635
{u"[ - + - ]", uR"([+\-])"},
4636-
{u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", uR"([Zceg-imn{\ Z\ e\ i\ c\ h\ e\ n\ k\ e\ t\ t\ e\ }])"},
4637-
{uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", uR"([Zceg-imn{\ Z\ e\ i\ c\ h\ e\ n\ k\ e\ t\ t\ e\ }])"},
46384636
{u"[$d-za-c]", uR"([\$a-z])"},
46394637
{u"[a-c$d-z]", uR"([\$a-z])"},
46404638
{uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"},
@@ -4658,11 +4656,6 @@ void UnicodeSetTest::TestToPatternOutput() {
46584656
{u"[^[c]]", uR"([^[c]])"},
46594657
{uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
46604658
{u"[$[]]", uR"([\$[]])"},
4661-
// Spaces are eliminated within a string-literal even when the syntax is preserved.
4662-
{u"[ { Z e i c h e n k e t t e } [] Zeichenmenge ]", uR"([{\ Z\ e\ i\ c\ h\ e\ n\ k\ e\ t\ t\ e\ }[]Zeichenmenge])"},
4663-
// Escapes are removed even when the syntax is preserved.
4664-
{uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
4665-
uR"([{\ Z\ e\ i\ c\ h\ e\ n\ k\ e\ t\ t\ e\ }[]Zeichenmenge])"},
46664659
// In ICU 78 and earlier, a named-element was a nested set, so it was preserved and
46674660
// caused the syntax to be preserved. Now it is treated like an escape.
46684661
{uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([Zceg-imn])"},
@@ -4744,6 +4737,13 @@ void UnicodeSetTest::TestParseErrors() {
47444737
// TODO(egg): Well-formed in Java, ill-formed in ICU4C in ICU 78 and earlier.
47454738
u"[a-{z}]",
47464739
u"[{a}-z]",
4740+
// Well-formed in ICU 78 and earlier (spaces ignored).
4741+
// In ICU 81 and later, the spaces will mean spaces.
4742+
// Ill-formed in ICU 79 and 80.
4743+
u"[ { Z e i c h e n k e t t e } Zeichenmenge ]",
4744+
uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])",
4745+
u"[ { Z e i c h e n k e t t e } [] Zeichenmenge ]",
4746+
uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
47474747
}) {
47484748
UErrorCode errorCode = U_ZERO_ERROR;
47494749
const UnicodeSet set(expression, errorCode);

0 commit comments

Comments
 (0)