Skip to content

Commit 5579708

Browse files
[3.14] pythongh-60462: Fix locale.strxfrm() on Solaris (pythonGH-138242) (pythonGH-138448)
It should interpret the result of wcsxfrm() as a sequence of abstract integers, not as a sequence of Unicode code points, and not by using any other encoding scheme that does not preserve ordering. (cherry picked from commit 482fd0c) Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent 4e8f5e6 commit 5579708

File tree

2 files changed

+49
-1
lines changed

2 files changed

+49
-1
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix :func:`locale.strxfrm` on Solaris (and possibly other platforms).

Modules/_localemodule.c

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,54 @@ _locale_strxfrm_impl(PyObject *module, PyObject *str)
485485
goto exit;
486486
}
487487
}
488-
result = PyUnicode_FromWideChar(buf, n2);
488+
/* The result is just a sequence of integers; they are not necessarily
489+
Unicode code points, so PyUnicode_FromWideChar() cannot be used
490+
here. For example, 0xD83D 0xDC0D should not be larger than 0xFF41.
491+
*/
492+
#if SIZEOF_WCHAR_T == 4
493+
{
494+
/* Some codes can exceed the range of Unicode code points
495+
(0 - 0x10FFFF), so they cannot be directly used in
496+
PyUnicode_FromKindAndData(). They should be first encoded in
497+
a way that preserves the lexicographical order.
498+
499+
Codes in the range 0-0x10000 represent themselves.
500+
Codes larger than 0x10000 are encoded as a pair:
501+
* 0x1xxxx -- the highest 16 bits
502+
* 0x0xxxx -- the lowest 16 bits
503+
*/
504+
size_t n3 = 0;
505+
for (size_t i = 0; i < n2; i++) {
506+
if ((Py_UCS4)buf[i] > 0x10000u) {
507+
n3++;
508+
}
509+
}
510+
if (n3) {
511+
n3 += n2; // no integer overflow
512+
Py_UCS4 *buf2 = PyMem_New(Py_UCS4, n3);
513+
if (buf2 == NULL) {
514+
PyErr_NoMemory();
515+
goto exit;
516+
}
517+
size_t j = 0;
518+
for (size_t i = 0; i < n2; i++) {
519+
Py_UCS4 c = (Py_UCS4)buf[i];
520+
if (c > 0x10000u) {
521+
buf2[j++] = (c >> 16) | 0x10000u;
522+
buf2[j++] = c & 0xFFFFu;
523+
}
524+
else {
525+
buf2[j++] = c;
526+
}
527+
}
528+
assert(j == n3);
529+
result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf2, n3);
530+
PyMem_Free(buf2);
531+
goto exit;
532+
}
533+
}
534+
#endif
535+
result = PyUnicode_FromKindAndData(sizeof(wchar_t), buf, n2);
489536
exit:
490537
PyMem_Free(buf);
491538
PyMem_Free(s);

0 commit comments

Comments
 (0)