Skip to content

Commit 5df5359

Browse files
committed
Support name aliases of type correction in UnicodeSet \N
1 parent 1659052 commit 5df5359

File tree

2 files changed

+74
-41
lines changed

2 files changed

+74
-41
lines changed

icu4c/source/common/uniset_props.cpp

Lines changed: 64 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,6 @@ constexpr char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
5959
constexpr char ASCII[] = "ASCII"; // [\u0000-\u007F]
6060
constexpr char ASSIGNED[] = "Assigned"; // [:^Cn:]
6161

62-
// Unicode name property alias
63-
constexpr char16_t NAME_PROP[] = u"na";
64-
6562
} // namespace
6663

6764
// Cached sets ------------------------------------------------------------- ***
@@ -147,6 +144,56 @@ isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
147144
// memory leak checker tools
148145
#define _dbgct(me)
149146

147+
// Strips leading and trailing spaces and turns runs of spaces into single spaces.
148+
// This should be replaced by UAX44-LM1 and UAX44-LM2 skeletonizations as part of ICU-3736.
149+
template<typename CharT>
150+
UBool mungeCharName(std::basic_string_view<CharT> src, char* dst, int32_t dstCapacity) {
151+
int32_t j = 0;
152+
--dstCapacity; /* make room for term. zero */
153+
if constexpr (!std::is_same_v<CharT, char>) {
154+
if (!uprv_isInvariantUString(src.data(), static_cast<int32_t>(src.size()))) {
155+
return false;
156+
}
157+
}
158+
for (CharT uch : src) {
159+
char ch;
160+
if constexpr (std::is_same_v<CharT, char>) {
161+
ch = uch;
162+
} else {
163+
// This would want to be UCHAR_TO_CHAR but that is defined in uinvchar.cpp. This function
164+
// should not last long anyway (famous last words)…
165+
u_UCharsToChars(&uch, &ch, 1);
166+
}
167+
if (ch == ' ' && (j == 0 || (j > 0 && dst[j - 1] == ' '))) {
168+
continue;
169+
}
170+
if (j >= dstCapacity) return false;
171+
dst[j++] = ch;
172+
}
173+
if (j > 0 && dst[j-1] == ' ') --j;
174+
dst[j] = 0;
175+
return true;
176+
}
177+
178+
// Returns the character with the given name or name alias, or U_SENTINEL if no such character
179+
// exists.
180+
template<typename CharT>
181+
UChar32 getCharacterByName(const std::basic_string_view<CharT> name) {
182+
// Must munge name, since u_charFromName() does not do 'loose' matching.
183+
char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
184+
if (!mungeCharName(name, buf, sizeof(buf))) {
185+
return U_SENTINEL;
186+
}
187+
for (const UCharNameChoice nameChoice : std::array{U_EXTENDED_CHAR_NAME, U_CHAR_NAME_ALIAS}) {
188+
UErrorCode ec = U_ZERO_ERROR;
189+
UChar32 ch = u_charFromName(nameChoice, buf, &ec);
190+
if (U_SUCCESS(ec)) {
191+
return ch;
192+
}
193+
}
194+
return U_SENTINEL;
195+
}
196+
150197
} // namespace
151198

152199
//----------------------------------------------------------------
@@ -657,19 +704,14 @@ class UnicodeSet::Lexer {
657704
}
658705
start = parsePosition_.getIndex();
659706
} else if (last == u'}') {
660-
UnicodeSet result;
661-
result.applyPropertyAlias(
662-
UnicodeString(NAME_PROP),
663-
pattern_.tempSubStringBetween(start, parsePosition_.getIndex() - 1),
664-
errorCode);
665-
result.setPattern(
666-
pattern_.tempSubStringBetween(start - 3, parsePosition_.getIndex()));
667-
if ((hex.has_value() && result.charAt(0) != hex) ||
668-
(literal.has_value() && result.charAt(0) != literal)) {
707+
const UChar32 result = getCharacterByName(std::u16string_view(pattern_).substr(
708+
start, parsePosition_.getIndex() - 1 - start));
709+
if (result < 0 || (hex.has_value() && result != hex) ||
710+
(literal.has_value() && result != literal)) {
669711
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
670712
return {};
671713
}
672-
return result.charAt(0);
714+
return result;
673715
}
674716
}
675717
}
@@ -1312,23 +1354,6 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
13121354

13131355
namespace {
13141356

1315-
UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
1316-
/* Note: we use ' ' in compiler code page */
1317-
int32_t j = 0;
1318-
char ch;
1319-
--dstCapacity; /* make room for term. zero */
1320-
while ((ch = *src++) != 0) {
1321-
if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
1322-
continue;
1323-
}
1324-
if (j >= dstCapacity) return false;
1325-
dst[j++] = ch;
1326-
}
1327-
if (j > 0 && dst[j-1] == ' ') --j;
1328-
dst[j] = 0;
1329-
return true;
1330-
}
1331-
13321357
} // namespace
13331358

13341359
//----------------------------------------------------------------
@@ -1452,18 +1477,14 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
14521477
}
14531478
case UCHAR_NAME:
14541479
{
1455-
// Must munge name, since u_charFromName() does not do
1456-
// 'loose' matching.
1457-
char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
1458-
if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
1459-
UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
1460-
if (U_SUCCESS(ec)) {
1461-
clear();
1462-
add(ch);
1463-
return *this;
1464-
} else {
1480+
const UChar32 ch =
1481+
getCharacterByName<char>(std::string_view(vname.data(), vname.length()));
1482+
if (ch < 0) {
14651483
FAIL(ec);
14661484
}
1485+
clear();
1486+
add(ch);
1487+
return *this;
14671488
}
14681489
case UCHAR_UNICODE_1_NAME:
14691490
// ICU 49 deprecates the Unicode_1_Name property APIs.
@@ -1473,7 +1494,9 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
14731494
// Must munge name, since u_versionFromString() does not do
14741495
// 'loose' matching.
14751496
char buf[128];
1476-
if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
1497+
if (!mungeCharName(std::string_view(vname.data(), vname.length()), buf,
1498+
sizeof(buf)))
1499+
FAIL(ec);
14771500
UVersionInfo version;
14781501
u_versionFromString(version, buf);
14791502
applyFilter(versionFilter, &version,

icu4c/source/test/intltest/usettest.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4713,6 +4713,12 @@ void UnicodeSetTest::TestToPatternOutput() {
47134713
// Ill-formed in ICU4C 78 and earlier, made well-formed by ICU-23312.
47144714
{u"[a-{z}]", u"[a-z]"},
47154715
{u"[{a}-z]", u"[a-z]"},
4716+
{uR"([\N{PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}])", u"[︘]"},
4717+
{uR"([\N{bell}])", u"[🔔]"},
4718+
// Ill-formed in ICU 78 and earlier:
4719+
{uR"([\N{PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}])", u"[︘]"},
4720+
{uR"([\N{Hangul jungseong O-E}])", u"[ᆀ]"},
4721+
{uR"([\N{Hangul jungseong OE}])", u"[ᅬ]"},
47164722
}) {
47174723
UErrorCode errorCode = U_ZERO_ERROR;
47184724
const UnicodeSet set(expression, errorCode);
@@ -4832,6 +4838,10 @@ void UnicodeSetTest::TestParseErrors() {
48324838
// Doubly negated property queries.
48334839
uR"(\P{Decomposition_Type≠compat})",
48344840
u"[:^Noncharacter_Code_Point≠No:]",
4841+
// This should be [\a]; tracked by ICU-8963.
4842+
uR"([\N{BEL}])",
4843+
// This should be [œ]; tracked by ICU-3736.
4844+
uR"([\N{Latin small ligature o-e}])",
48354845
}) {
48364846
UErrorCode errorCode = U_ZERO_ERROR;
48374847
const UnicodeSet set(expression, errorCode);

0 commit comments

Comments
 (0)