Skip to content

Commit d274343

Browse files
serhiy-storchaka and miss-islington
authored and committed
gh-60462: Fix locale.strxfrm() on Solaris (GH-138242)
It should interpret the result of wcsxfrm() as a sequence of abstract integers, not as a sequence of Unicode code points, and not through any other encoding scheme that does not preserve ordering. (cherry picked from commit 482fd0c) Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent 44b3940 commit d274343

File tree

2 files changed

+49
-1
lines changed

2 files changed

+49
-1
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix :func:`locale.strxfrm` on Solaris (and possibly other platforms).

Modules/_localemodule.c

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -436,7 +436,54 @@ _locale_strxfrm_impl(PyObject *module, PyObject *str)
436436
goto exit;
437437
}
438438
}
439-
result = PyUnicode_FromWideChar(buf, n2);
439+
/* The result is just a sequence of integers; they are not necessarily
440+
Unicode code points, so PyUnicode_FromWideChar() cannot be used
441+
here. For example, 0xD83D 0xDC0D should not be larger than 0xFF41.
442+
*/
443+
#if SIZEOF_WCHAR_T == 4
444+
{
445+
/* Some codes can exceed the range of Unicode code points
446+
(0 - 0x10FFFF), so they cannot be directly used in
447+
PyUnicode_FromKindAndData(). They should be first encoded in
448+
a way that preserves the lexicographical order.
449+
450+
Codes in the range 0-0xFFFF represent themselves.
451+
Codes larger than 0xFFFF are encoded as a pair:
452+
* 0x1xxxx -- the highest 16 bits
453+
* 0x0xxxx -- the lowest 16 bits
454+
*/
455+
size_t n3 = 0;
456+
for (size_t i = 0; i < n2; i++) {
457+
if ((Py_UCS4)buf[i] > 0x10000u) {
458+
n3++;
459+
}
460+
}
461+
if (n3) {
462+
n3 += n2; // no integer overflow
463+
Py_UCS4 *buf2 = PyMem_New(Py_UCS4, n3);
464+
if (buf2 == NULL) {
465+
PyErr_NoMemory();
466+
goto exit;
467+
}
468+
size_t j = 0;
469+
for (size_t i = 0; i < n2; i++) {
470+
Py_UCS4 c = (Py_UCS4)buf[i];
471+
if (c > 0x10000u) {
472+
buf2[j++] = (c >> 16) | 0x10000u;
473+
buf2[j++] = c & 0xFFFFu;
474+
}
475+
else {
476+
buf2[j++] = c;
477+
}
478+
}
479+
assert(j == n3);
480+
result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf2, n3);
481+
PyMem_Free(buf2);
482+
goto exit;
483+
}
484+
}
485+
#endif
486+
result = PyUnicode_FromKindAndData(sizeof(wchar_t), buf, n2);
440487
exit:
441488
PyMem_Free(buf);
442489
PyMem_Free(s);

0 commit comments

Comments
 (0)