@@ -9967,8 +9967,106 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string,
9967
9967
s = sans_highs ;
9968
9968
}
9969
9969
}
9970
- /* else // Here both the locale and string are UTF-8 */
9971
- /* XXX convert above Unicode to 10FFFF? */
9970
+ else { /* Here both the locale and string are UTF-8 */
9971
+
9972
+ /* In a UTF-8 locale, we can reasonably expect strxfrm() to properly
9973
+ * handle any legal Unicode code point, including the non-character
9974
+ * code points that are affirmed legal in Corrigendum #9. Less certain
9975
+ * is its handling of the surrogate characters, and those code points
9976
+ * above the Unicode maximum of U+10FFFF. It definitely won't know
9977
+ * about Perl's invented UTF-8 extension for very large code points.
9978
+ * Since surrogates and above-Unicode code points were formerly legal
9979
+ * UTF-8, it very well may be that strxfrm() handles them, rather than
9980
+ * going to the likely extra trouble of detecting and excluding them.
9981
+ * This is especially true of surrogates where the code points the
9982
+ * UTF-8 represents are listed in the Unicode Standard as being in a
9983
+ * subset of the General Category "Other". Indeed, glibc looks like it
9984
+ * returns the identical collation sequence for all "Other" code points
9985
+ * that have the same number of bytes in their representation. That
9986
+ * is, all such code points collate to the same spot. glibc does the
9987
+ * same for the above-Unicode code points, but it gets a little weird,
9988
+ * as might be expected, when presented with Perl's invented UTF-8
9989
+ * extension, but still serviceable. But it is really undefined
9990
+ * behavior, and we therefore should not present strxfrm with such
9991
+ * input. The code below does that. And it is just about as easy to
9992
+ * exclude all above-Unicode code points, as that is really undefined
9993
+ * behavior as well, so the code below does that too. These all are
9994
+ * effectively permanently unassigned by Unicode, so the code below
9995
+ * maps them all to the highest legal permanently unassigned code
9996
+ * point, U+10FFFF. XXX Could use find_next_masked() instead of
9997
+ * strpbrk() on ASCII platforms to do per-word scanning */
9998
+
9999
+ # ifdef EBCDIC /* Native; known valid only for IBM-1047, 037 */
10000
+ # define SUPER_START_BYTES "\xEE\xEF\xFA\xFB\xFC\xFD\xFE"
10001
+ # else
10002
+ # define SUPER_START_BYTES \
10003
+ "\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF"
10004
+ # endif
10005
+
10006
+ const char * const e = s + len ;
10007
+
10008
+ /* Scan the input to find something that could be the start byte for an
10009
+ * above-Unicode code point. If none found, we are done. */
10010
+ char * candidate = s ;
10011
+ while ((candidate = strpbrk (candidate , SUPER_START_BYTES ))) {
10012
+ char * next_char_start = candidate + UTF8SKIP (candidate );
10013
+ assert (next_char_start <= e );
10014
+
10015
+ /* It may require more than the single start byte to determine if a
10016
+ * sequence is for an above-Unicode code point. Look to determine
10017
+ * for sure. If the sequence isn't for an above-Unicode code
10018
+ * point, continue scanning for the next possible one. */
10019
+ if (! UTF8_IS_SUPER_NO_CHECK_ (candidate )) {
10020
+ candidate = next_char_start ;
10021
+ continue ;
10022
+ }
10023
+
10024
+ /* Here, is above-Unicode. Need to make a copy to translate this
10025
+ * code code point (and any others that follow) to be within the
10026
+ * Unicode range */
10027
+ Newx (sans_highs , len + 1 , char ); /* May shrink; will never grow */
10028
+ Size_t initial_length = candidate - s ;
10029
+
10030
+ /* Copy as-is any initial portion that is Unicode */
10031
+ Copy (s , sans_highs , initial_length , U8 );
10032
+
10033
+ /* Replace this first above-Unicode character */
10034
+ char * d = sans_highs + initial_length ;
10035
+ Copy (MAX_UNICODE_UTF8 , d , STRLENs (MAX_UNICODE_UTF8 ), U8 );
10036
+ d += STRLENs (MAX_UNICODE_UTF8 );
10037
+
10038
+ /* Then go through the rest of the string */
10039
+ s = next_char_start ;
10040
+ while (s < e ) {
10041
+ if (UTF8_IS_INVARIANT (* s )) {
10042
+ * d ++ = * s ++ ;
10043
+ continue ;
10044
+ }
10045
+
10046
+ const Size_t this_len = UTF8SKIP (s );
10047
+ next_char_start = s + this_len ;
10048
+ assert (next_char_start <= e );
10049
+
10050
+ if (UTF8_IS_SUPER_NO_CHECK_ (s )) {
10051
+ Copy (MAX_UNICODE_UTF8 , d , STRLENs (MAX_UNICODE_UTF8 ), U8 );
10052
+ d += STRLENs (MAX_UNICODE_UTF8 );
10053
+ }
10054
+ else {
10055
+ Copy (s , d , this_len , U8 );
10056
+ d += this_len ;
10057
+ }
10058
+
10059
+ s = next_char_start ;
10060
+ }
10061
+
10062
+ len = d - sans_highs ;
10063
+ * d = '\0' ;
10064
+
10065
+ /* The rest of the routine will look at this modified copy */
10066
+ s = sans_highs ;
10067
+ break ;
10068
+ }
10069
+ }
9972
10070
9973
10071
length_in_chars = (utf8 )
9974
10072
? utf8_length ((U8 * ) s , (U8 * ) s + len )
0 commit comments