Support name aliases of type correction in UnicodeSet \N

eggrobin · eggrobin · commit 5df5359ea23e · 2026-03-31T23:25:16.000+02:00
diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp
@@ -59,9 +59,6 @@ constexpr char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
 constexpr char ASCII[] = "ASCII"; // [\u0000-\u007F]
 constexpr char ASSIGNED[] = "Assigned"; // [:^Cn:]
 
-// Unicode name property alias
-constexpr char16_t NAME_PROP[] = u"na";
-
 }  // namespace
 
 // Cached sets ------------------------------------------------------------- ***
@@ -147,6 +144,56 @@ isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
 // memory leak checker tools
 #define _dbgct(me)
 
+// Strips leading and trailing spaces and turns runs of spaces into single spaces.
+// This should be replaced by UAX44-LM1 and UAX44-LM2 skeletonizations as part of ICU-3736.
+template<typename CharT>
+UBool mungeCharName(std::basic_string_view<CharT> src, char* dst, int32_t dstCapacity) {
+    int32_t j = 0;
+    --dstCapacity; /* make room for term. zero */
+    if constexpr (!std::is_same_v<CharT, char>) {
+        if (!uprv_isInvariantUString(src.data(), static_cast<int32_t>(src.size()))) {
+            return false;
+        }
+    }
+    for (CharT uch : src) {
+        char ch;
+        if constexpr (std::is_same_v<CharT, char>) {
+            ch = uch;
+        } else {
+            // This would want to be UCHAR_TO_CHAR but that is defined in uinvchar.cpp. This function
+            // should not last long anyway (famous last words)…
+            u_UCharsToChars(&uch, &ch, 1);
+        }
+        if (ch == ' ' && (j == 0 || (j > 0 && dst[j - 1] == ' '))) {
+            continue;
+        }
+        if (j >= dstCapacity) return false;
+        dst[j++] = ch;
+    }
+    if (j > 0 && dst[j-1] == ' ') --j;
+    dst[j] = 0;
+    return true;
+}
+
+// Returns the character with the given name or name alias, or U_SENTINEL if no such character
+// exists.
+template<typename CharT>
+UChar32 getCharacterByName(const std::basic_string_view<CharT> name) {
+    // Must munge name, since u_charFromName() does not do 'loose' matching.
+    char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
+    if (!mungeCharName(name, buf, sizeof(buf))) {
+        return U_SENTINEL;
+    }
+    for (const UCharNameChoice nameChoice : std::array{U_EXTENDED_CHAR_NAME, U_CHAR_NAME_ALIAS}) {
+        UErrorCode ec = U_ZERO_ERROR;
+        UChar32 ch = u_charFromName(nameChoice, buf, &ec);
+        if (U_SUCCESS(ec)) {
+            return ch;
+        }
+    }
+    return U_SENTINEL;
+}
+
 }  // namespace
 
 //----------------------------------------------------------------
@@ -657,19 +704,14 @@ class UnicodeSet::Lexer {
                     }
                     start = parsePosition_.getIndex();
                 } else if (last == u'}') {
-                    UnicodeSet result;
-                    result.applyPropertyAlias(
-                        UnicodeString(NAME_PROP),
-                        pattern_.tempSubStringBetween(start, parsePosition_.getIndex() - 1),
-                        errorCode);
-                    result.setPattern(
-                        pattern_.tempSubStringBetween(start - 3, parsePosition_.getIndex()));
-                    if ((hex.has_value() && result.charAt(0) != hex) ||
-                        (literal.has_value() && result.charAt(0) != literal)) {
+                    const UChar32 result = getCharacterByName(std::u16string_view(pattern_).substr(
+                        start, parsePosition_.getIndex() - 1 - start));
+                    if (result < 0 || (hex.has_value() && result != hex) ||
+                            (literal.has_value() && result != literal)) {
                         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
                         return {};
                     }
-                    return result.charAt(0);
+                    return result;
                 }
             }
         }
@@ -1312,23 +1354,6 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
 
 namespace {
 
-UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
-    /* Note: we use ' ' in compiler code page */
-    int32_t j = 0;
-    char ch;
-    --dstCapacity; /* make room for term. zero */
-    while ((ch = *src++) != 0) {
-        if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
-            continue;
-        }
-        if (j >= dstCapacity) return false;
-        dst[j++] = ch;
-    }
-    if (j > 0 && dst[j-1] == ' ') --j;
-    dst[j] = 0;
-    return true;
-}
-
 }  // namespace
 
 //----------------------------------------------------------------
@@ -1452,18 +1477,14 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
                 }
             case UCHAR_NAME:
                 {
-                    // Must munge name, since u_charFromName() does not do
-                    // 'loose' matching.
-                    char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
-                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
-                    UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
-                    if (U_SUCCESS(ec)) {
-                        clear();
-                        add(ch);
-                        return *this;
-                    } else {
+                    const UChar32 ch =
+                        getCharacterByName<char>(std::string_view(vname.data(), vname.length()));
+                    if (ch < 0) {
                         FAIL(ec);
                     }
+                    clear();
+                    add(ch);
+                    return *this;
                 }
             case UCHAR_UNICODE_1_NAME:
                 // ICU 49 deprecates the Unicode_1_Name property APIs.
@@ -1473,7 +1494,9 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
                     // Must munge name, since u_versionFromString() does not do
                     // 'loose' matching.
                     char buf[128];
-                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
+                    if (!mungeCharName(std::string_view(vname.data(), vname.length()), buf,
+                                       sizeof(buf)))
+                        FAIL(ec);
                     UVersionInfo version;
                     u_versionFromString(version, buf);
                     applyFilter(versionFilter, &version,
diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
@@ -4713,6 +4713,12 @@ void UnicodeSetTest::TestToPatternOutput() {
             // Ill-formed in ICU4C 78 and earlier, made well-formed by ICU-23312.
             {u"[a-{z}]", u"[a-z]"},
             {u"[{a}-z]", u"[a-z]"},
+            {uR"([\N{PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET}])", u"[︘]"},
+            {uR"([\N{bell}])", u"[🔔]"},
+            // Ill-formed in ICU 78 and earlier:
+            {uR"([\N{PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}])", u"[︘]"},
+            {uR"([\N{Hangul jungseong O-E}])", u"[ᆀ]"},
+            {uR"([\N{Hangul jungseong OE}])", u"[ᅬ]"},
         }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);
@@ -4832,6 +4838,10 @@ void UnicodeSetTest::TestParseErrors() {
             // Doubly negated property queries.
             uR"(\P{Decomposition_Type≠compat})",
             u"[:^Noncharacter_Code_Point≠No:]",
+            // This should be [\a]; tracked by ICU-8963.
+            uR"([\N{BEL}])",
+            // This should be [œ]; tracked by ICU-3736.
+            uR"([\N{Latin small ligature o-e}])",
         }) {
         UErrorCode errorCode = U_ZERO_ERROR;
         const UnicodeSet set(expression, errorCode);