@@ -59,9 +59,6 @@ constexpr char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
5959constexpr char ASCII[] = " ASCII" ; // [\u0000-\u007F]
6060constexpr char ASSIGNED[] = " Assigned" ; // [:^Cn:]
6161
62- // Unicode name property alias
63- constexpr char16_t NAME_PROP[] = u" na" ;
64-
6562} // namespace
6663
6764// Cached sets ------------------------------------------------------------- ***
@@ -147,6 +144,56 @@ isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
147144// memory leak checker tools
148145#define _dbgct (me )
149146
147+ // Strips leading and trailing spaces and turns runs of spaces into single spaces.
148+ // This should be replaced by UAX44-LM1 and UAX44-LM2 skeletonizations as part of ICU-3736.
149+ template <typename CharT>
150+ UBool mungeCharName (std::basic_string_view<CharT> src, char * dst, int32_t dstCapacity) {
151+ int32_t j = 0 ;
152+ --dstCapacity; /* make room for term. zero */
153+ if constexpr (!std::is_same_v<CharT, char >) {
154+ if (!uprv_isInvariantUString (src.data (), static_cast <int32_t >(src.size ()))) {
155+ return false ;
156+ }
157+ }
158+ for (CharT uch : src) {
159+ char ch;
160+ if constexpr (std::is_same_v<CharT, char >) {
161+ ch = uch;
162+ } else {
163+ // This would want to be UCHAR_TO_CHAR but that is defined in uinvchar.cpp. This function
164+ // should not last long anyway (famous last words)…
165+ u_UCharsToChars (&uch, &ch, 1 );
166+ }
167+ if (ch == ' ' && (j == 0 || (j > 0 && dst[j - 1 ] == ' ' ))) {
168+ continue ;
169+ }
170+ if (j >= dstCapacity) return false ;
171+ dst[j++] = ch;
172+ }
173+ if (j > 0 && dst[j-1 ] == ' ' ) --j;
174+ dst[j] = 0 ;
175+ return true ;
176+ }
177+
178+ // Returns the character with the given name or name alias, or U_SENTINEL if no such character
179+ // exists.
180+ template <typename CharT>
181+ UChar32 getCharacterByName (const std::basic_string_view<CharT> name) {
182+ // Must munge name, since u_charFromName() does not do 'loose' matching.
183+ char buf[128 ]; // it suffices that this be > uprv_getMaxCharNameLength
184+ if (!mungeCharName (name, buf, sizeof (buf))) {
185+ return U_SENTINEL;
186+ }
187+ for (const UCharNameChoice nameChoice : std::array{U_EXTENDED_CHAR_NAME, U_CHAR_NAME_ALIAS}) {
188+ UErrorCode ec = U_ZERO_ERROR;
189+ UChar32 ch = u_charFromName (nameChoice, buf, &ec);
190+ if (U_SUCCESS (ec)) {
191+ return ch;
192+ }
193+ }
194+ return U_SENTINEL;
195+ }
196+
150197} // namespace
151198
152199// ----------------------------------------------------------------
@@ -657,19 +704,14 @@ class UnicodeSet::Lexer {
657704 }
658705 start = parsePosition_.getIndex ();
659706 } else if (last == u' }' ) {
660- UnicodeSet result;
661- result.applyPropertyAlias (
662- UnicodeString (NAME_PROP),
663- pattern_.tempSubStringBetween (start, parsePosition_.getIndex () - 1 ),
664- errorCode);
665- result.setPattern (
666- pattern_.tempSubStringBetween (start - 3 , parsePosition_.getIndex ()));
667- if ((hex.has_value () && result.charAt (0 ) != hex) ||
668- (literal.has_value () && result.charAt (0 ) != literal)) {
707+ const UChar32 result = getCharacterByName (std::u16string_view (pattern_).substr (
708+ start, parsePosition_.getIndex () - 1 - start));
709+ if (result < 0 || (hex.has_value () && result != hex) ||
710+ (literal.has_value () && result != literal)) {
669711 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
670712 return {};
671713 }
672- return result. charAt ( 0 ) ;
714+ return result;
673715 }
674716 }
675717 }
@@ -1312,23 +1354,6 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
13121354
13131355namespace {
13141356
1315- UBool mungeCharName (char * dst, const char * src, int32_t dstCapacity) {
1316- /* Note: we use ' ' in compiler code page */
1317- int32_t j = 0 ;
1318- char ch;
1319- --dstCapacity; /* make room for term. zero */
1320- while ((ch = *src++) != 0 ) {
1321- if (ch == ' ' && (j==0 || (j>0 && dst[j-1 ]==' ' ))) {
1322- continue ;
1323- }
1324- if (j >= dstCapacity) return false ;
1325- dst[j++] = ch;
1326- }
1327- if (j > 0 && dst[j-1 ] == ' ' ) --j;
1328- dst[j] = 0 ;
1329- return true ;
1330- }
1331-
13321357} // namespace
13331358
13341359// ----------------------------------------------------------------
@@ -1452,18 +1477,14 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
14521477 }
14531478 case UCHAR_NAME:
14541479 {
1455- // Must munge name, since u_charFromName() does not do
1456- // 'loose' matching.
1457- char buf[128 ]; // it suffices that this be > uprv_getMaxCharNameLength
1458- if (!mungeCharName (buf, vname.data (), sizeof (buf))) FAIL (ec);
1459- UChar32 ch = u_charFromName (U_EXTENDED_CHAR_NAME, buf, &ec);
1460- if (U_SUCCESS (ec)) {
1461- clear ();
1462- add (ch);
1463- return *this ;
1464- } else {
1480+ const UChar32 ch =
1481+ getCharacterByName<char >(std::string_view (vname.data (), vname.length ()));
1482+ if (ch < 0 ) {
14651483 FAIL (ec);
14661484 }
1485+ clear ();
1486+ add (ch);
1487+ return *this ;
14671488 }
14681489 case UCHAR_UNICODE_1_NAME:
14691490 // ICU 49 deprecates the Unicode_1_Name property APIs.
@@ -1473,7 +1494,9 @@ UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
14731494 // Must munge name, since u_versionFromString() does not do
14741495 // 'loose' matching.
14751496 char buf[128 ];
1476- if (!mungeCharName (buf, vname.data (), sizeof (buf))) FAIL (ec);
1497+ if (!mungeCharName (std::string_view (vname.data (), vname.length ()), buf,
1498+ sizeof (buf)))
1499+ FAIL (ec);
14771500 UVersionInfo version;
14781501 u_versionFromString (version, buf);
14791502 applyFilter (versionFilter, &version,
0 commit comments