Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 64 additions & 41 deletions icu4c/source/common/uniset_props.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,6 @@ constexpr char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
constexpr char ASCII[] = "ASCII"; // [\u0000-\u007F]
constexpr char ASSIGNED[] = "Assigned"; // [:^Cn:]

// Unicode name property alias
constexpr char16_t NAME_PROP[] = u"na";

} // namespace

// Cached sets ------------------------------------------------------------- ***
Expand Down Expand Up @@ -147,6 +144,56 @@ isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
// memory leak checker tools
#define _dbgct(me)

// Strips leading and trailing spaces and turns runs of spaces into single spaces.
// This should be replaced by UAX44-LM1 and UAX44-LM2 skeletonizations as part of ICU-3736.
template<typename CharT>
UBool mungeCharName(std::basic_string_view<CharT> src, char* dst, int32_t dstCapacity) {
int32_t j = 0;
--dstCapacity; /* make room for term. zero */
if constexpr (!std::is_same_v<CharT, char>) {
if (!uprv_isInvariantUString(src.data(), static_cast<int32_t>(src.size()))) {
return false;
}
}
for (CharT uch : src) {
char ch;
if constexpr (std::is_same_v<CharT, char>) {
ch = uch;
} else {
// This would want to be UCHAR_TO_CHAR but that is defined in uinvchar.cpp. This function
// should not last long anyway (famous last words)…
u_UCharsToChars(&uch, &ch, 1);
}
if (ch == ' ' && (j == 0 || (j > 0 && dst[j - 1] == ' '))) {
continue;
}
if (j >= dstCapacity) return false;
dst[j++] = ch;
}
if (j > 0 && dst[j-1] == ' ') --j;
dst[j] = 0;
return true;
}

// Returns the character with the given name or name alias, or U_SENTINEL if no such character
// exists.
template<typename CharT>
UChar32 getCharacterByName(const std::basic_string_view<CharT> name) {
// Must munge name, since u_charFromName() does not do 'loose' matching.
char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
if (!mungeCharName(name, buf, sizeof(buf))) {
return U_SENTINEL;
}
for (const UCharNameChoice nameChoice : std::array{U_EXTENDED_CHAR_NAME, U_CHAR_NAME_ALIAS}) {
UErrorCode ec = U_ZERO_ERROR;
UChar32 ch = u_charFromName(nameChoice, buf, &ec);
if (U_SUCCESS(ec)) {
return ch;
}
}
return U_SENTINEL;
}

} // namespace

//----------------------------------------------------------------
Expand Down Expand Up @@ -657,19 +704,14 @@ class UnicodeSet::Lexer {
}
start = parsePosition_.getIndex();
} else if (last == u'}') {
UnicodeSet result;
result.applyPropertyAlias(
UnicodeString(NAME_PROP),
pattern_.tempSubStringBetween(start, parsePosition_.getIndex() - 1),
errorCode);
result.setPattern(
pattern_.tempSubStringBetween(start - 3, parsePosition_.getIndex()));
if ((hex.has_value() && result.charAt(0) != hex) ||
(literal.has_value() && result.charAt(0) != literal)) {
const UChar32 result = getCharacterByName(std::u16string_view(pattern_).substr(
start, parsePosition_.getIndex() - 1 - start));
if (result < 0 || (hex.has_value() && result != hex) ||
(literal.has_value() && result != literal)) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return {};
}
return result.charAt(0);
return result;
}
}
}
Expand Down Expand Up @@ -1312,23 +1354,6 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter,

namespace {

UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
/* Note: we use ' ' in compiler code page */
int32_t j = 0;
char ch;
--dstCapacity; /* make room for term. zero */
while ((ch = *src++) != 0) {
if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
continue;
}
if (j >= dstCapacity) return false;
dst[j++] = ch;
}
if (j > 0 && dst[j-1] == ' ') --j;
dst[j] = 0;
return true;
}

} // namespace

//----------------------------------------------------------------
Expand Down Expand Up @@ -1452,18 +1477,14 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
}
case UCHAR_NAME:
{
// Must munge name, since u_charFromName() does not do
// 'loose' matching.
char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
if (U_SUCCESS(ec)) {
clear();
add(ch);
return *this;
} else {
const UChar32 ch =
getCharacterByName<char>(std::string_view(vname.data(), vname.length()));
if (ch < 0) {
FAIL(ec);
}
clear();
add(ch);
return *this;
}
case UCHAR_UNICODE_1_NAME:
// ICU 49 deprecates the Unicode_1_Name property APIs.
Expand All @@ -1473,7 +1494,9 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
// Must munge name, since u_versionFromString() does not do
// 'loose' matching.
char buf[128];
if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
if (!mungeCharName(std::string_view(vname.data(), vname.length()), buf,
sizeof(buf)))
FAIL(ec);
UVersionInfo version;
u_versionFromString(version, buf);
applyFilter(versionFilter, &version,
Expand Down
10 changes: 10 additions & 0 deletions icu4c/source/test/intltest/usettest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4713,6 +4713,12 @@ void UnicodeSetTest::TestToPatternOutput() {
// Ill-formed in ICU4C 78 and earlier, made well-formed by ICU-23312.
{u"[a-{z}]", u"[a-z]"},
{u"[{a}-z]", u"[a-z]"},
{uR"([\N{PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}])", u"[︘]"},
{uR"([\N{bell}])", u"[🔔]"},
// Ill-formed in ICU 78 and earlier:
{uR"([\N{PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}])", u"[︘]"},
{uR"([\N{Hangul jungseong O-E}])", u"[ᆀ]"},
{uR"([\N{Hangul jungseong OE}])", u"[ᅬ]"},
}) {
UErrorCode errorCode = U_ZERO_ERROR;
const UnicodeSet set(expression, errorCode);
Expand Down Expand Up @@ -4832,6 +4838,10 @@ void UnicodeSetTest::TestParseErrors() {
// Doubly negated property queries.
uR"(\P{Decomposition_Type≠compat})",
u"[:^Noncharacter_Code_Point≠No:]",
// This should be [\a]; tracked by ICU-8963.
uR"([\N{BEL}])",
// This should be [œ]; tracked by ICU-3736.
uR"([\N{Latin small ligature o-e}])",
}) {
UErrorCode errorCode = U_ZERO_ERROR;
const UnicodeSet set(expression, errorCode);
Expand Down
Loading