Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions icu4c/source/common/uniset_props.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include "util.h"
#include "uvector.h"
#include "uprops.h"
#include "patternprops.h"
#include "propname.h"
#include "normalizer2impl.h"
#include "uinvchar.h"
Expand Down Expand Up @@ -538,9 +539,9 @@ class UnicodeSet::Lexer {
UChar32 next;
int32_t codePointCount = 0;
while (!chars_.atEnd() && U_SUCCESS(errorCode)) {
// TODO(egg): Propose making this space-sensitive.
const RuleCharacterIterator::Pos beforeNext = getPos();
next = chars_.next(charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES,
next = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
RuleCharacterIterator::SKIP_WHITESPACE),
unusedEscaped, errorCode);
if (next == u'\\') {
if (chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
Expand All @@ -554,7 +555,17 @@ class UnicodeSet::Lexer {
next = chars_.next(charsOptions_, escaped, errorCode);
}
} else {
escaped = false;
#if U_ICU_VERSION_MAJOR_NUM < 81
if (U_SUCCESS(errorCode) && PatternProps::isWhiteSpace(next)) {
// Transitional prohibition of unescaped white space in string literals:
// ICU 78 and earlier ignored it, and in ICU 81 it will stand for itself;
// until then it is reported as an error.
errorCode = UErrorCode::U_ILLEGAL_ARGUMENT_ERROR;
}
#else
#error Remove this transitional check, see ICU-23307 and ICU-TC minutes of 2026-01-16.
#endif
escaped = false;
}
if (!escaped && next == u'}') {
return LexicalElement(
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/data/translit/Latin_NumericPinyin.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
$tone = [\u0304\u0301\u030C\u0300\u0306] ;
# Move the tone to the end of a syllable, and convert to number
e {($tone) r} → r &Pinyin-NumericPinyin($1);
($tone) ( [i o n u {o n} {n g}]) → $2 &Pinyin-NumericPinyin($1);
($tone) ( [i o n u {on} {ng}]) → $2 &Pinyin-NumericPinyin($1);
($tone) → &Pinyin-NumericPinyin($1);
# The following backs up until it finds the right vowel, then deposits the tone
$vowel = [aAeEiIoOuU {u\u0308} {U\u0308} vV];
Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/data/translit/blt_blt_FONIPA.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ $C = [$LO $HI];
$V1 = [ꪵ ꪶ ꪹ ꪻ ꪼ]; # vowels written before consonant
$V2 = [ \uAAB0 \uAAB2 \uAAB3 \uAAB4 \uAAB7 \uAAB8 \uAABE]; # vowels written above or below consonant
$V3 = [ꪱ ꪮ ꪺ ꪽ]; # vowels written after consonant
$DIGRAPHS = [{ꪹ \uAAB8} {ꪹ \uAAB7} {ꪹ ꪱ}];
$DIGRAPHS = [{ꪹ\uAAB8} {ꪹ\uAAB7} {ꪹꪱ}];
$V12 = [$V1 $V2 $DIGRAPHS];
$V123 = [$V12 $V3];
$W = [ꪫ]; # labialization marker
Expand Down
12 changes: 6 additions & 6 deletions icu4c/source/data/translit/de_ASCII.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
# Generated from CLDR
#

$AE = [Ä {A \u0308}];
$OE = [Ö {O \u0308}];
$UE = [Ü {U \u0308}];
[ä {a \u0308}] → ae;
[ö {o \u0308}] → oe;
[ü {u \u0308}] → ue;
$AE = [Ä {A\u0308}];
$OE = [Ö {O\u0308}];
$UE = [Ü {U\u0308}];
[ä {a\u0308}] → ae;
[ö {o\u0308}] → oe;
[ü {u\u0308}] → ue;
{$AE} [:Lowercase:] → Ae;
{$OE} [:Lowercase:] → Oe;
{$UE} [:Lowercase:] → Ue;
Expand Down
14 changes: 7 additions & 7 deletions icu4c/source/test/intltest/usettest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4633,8 +4633,6 @@ void UnicodeSetTest::TestToPatternOutput() {
{u"[ - - ]", uR"([\-])"},
{u"[ - _ - ]", uR"([\-_])"},
{u"[ - + - ]", uR"([+\-])"},
{u"[ { Z e i c h e n k e t t e } Zeichenmenge ]", u"[Zceg-imn{Zeichenkette}]"},
{uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])", u"[Zceg-imn{Zeichenkette}]"},
{u"[$d-za-c]", uR"([\$a-z])"},
{u"[a-c$d-z]", uR"([\$a-z])"},
{uR"([\uFFFFa-z])", uR"([a-z\uFFFF])"},
Expand All @@ -4658,11 +4656,6 @@ void UnicodeSetTest::TestToPatternOutput() {
{u"[^[c]]", uR"([^[c]])"},
{uR"([ ^ [ \u0000-b d-\U0010FFFF ] ])", uR"([^[^c]])"},
{u"[$[]]", uR"([\$[]])"},
// Spaces are eliminated within a string-literal even when the syntax is preserved.
{u"[ {Z e i c h e n k e t t e } [] Zeichenmenge ]", u"[{Zeichenkette}[]Zeichenmenge]"},
// Escapes are removed even when the syntax is preserved.
{uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
u"[{Zeichenkette}[]Zeichenmenge]"},
// In ICU 78 and earlier, a named-element was a nested set, so it was preserved and
// caused the syntax to be preserved. Now it is treated like an escape.
{uR"([ \N{LATIN CAPITAL LETTER Z}eichenmenge ])", uR"([Zceg-imn])"},
Expand Down Expand Up @@ -4744,6 +4737,13 @@ void UnicodeSetTest::TestParseErrors() {
// TODO(egg): Well-formed in Java, ill-formed in ICU4C in ICU 78 and earlier.
u"[a-{z}]",
u"[{a}-z]",
// Unescaped spaces inside string literals were ignored (well-formed) in
// ICU 78 and earlier, are ill-formed in ICU 79 and 80, and will mean
// literal spaces in ICU 81 and later.
u"[ { Z e i c h e n k e t t e } Zeichenmenge ]",
uR"([ { \x5A e i c h e n k e t t e } \x5Aeichenmenge ])",
u"[ { Z e i c h e n k e t t e } [] Zeichenmenge ]",
uR"([ { \x5A e i c h e n k e t t e } [] \x5Aeichenmenge ])",
}) {
UErrorCode errorCode = U_ZERO_ERROR;
const UnicodeSet set(expression, errorCode);
Expand Down