From 60a548123a35cd94eb906a050afc89c2fee7725b Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 29 Aug 2025 15:35:47 +0300 Subject: [PATCH 1/4] gh-138247: Fix locale.strxfrm() It should interpret the result of wcsxfrm() as a sequence of abstract integers, not as a sequence of Unicode code points or through any other encoding scheme that does not preserve ordering. --- Modules/_localemodule.c | 49 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/Modules/_localemodule.c b/Modules/_localemodule.c index 17b5220fd6f9e1..82b70dab857bdc 100644 --- a/Modules/_localemodule.c +++ b/Modules/_localemodule.c @@ -483,7 +483,54 @@ _locale_strxfrm_impl(PyObject *module, PyObject *str) goto exit; } } - result = PyUnicode_FromWideChar(buf, n2); + /* The result is just a sequence of integers; they are not necessarily + Unicode code points, so PyUnicode_FromWideChar() cannot be used + here. For example, 0xD83D 0xDC0D should not be larger than 0xFF41. + */ +#if SIZEOF_WCHAR_T == 4 + { + /* Some codes can exceed the range of Unicode code points + (0 - 0x10FFFF), so they cannot be directly used in + PyUnicode_FromKindAndData(). They should be first encoded in + a way that preserves the lexicographical order. + + Codes in the range 0-0xFFFF represent themselves. 
+ Codes larger than 0xFFFF are encoded as a pair: + * 0x1xxxx -- the highest 16 bits + * 0x0xxxx -- the lowest 16 bits + */ + size_t n3 = 0; + for (size_t i = 0; i < n2; i++) { + if (buf[i] > 0x10000) { + n3++; + } + } + if (n3) { + n3 += n2; // no integer overflow + Py_UCS4 *buf2 = PyMem_New(Py_UCS4, n3); + if (buf2 == NULL) { + PyErr_NoMemory(); + goto exit; + } + size_t j = 0; + for (size_t i = 0; i < n2; i++) { + wchar_t c = buf[i]; + if (c > 0x10000) { + buf2[j++] = (((Py_UCS4)c) >> 16) | 0x10000u; + buf2[j++] = ((Py_UCS4)c) & 0xffffu; + } + else { + buf2[j++] = c; + } + } + assert(j == n3); + result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf2, n3); + PyMem_Free(buf2); + goto exit; + } + } +#endif + result = PyUnicode_FromKindAndData(sizeof(wchar_t), buf, n2); exit: PyMem_Free(buf); PyMem_Free(s); From 58713db29926ec2090c6100fff9bfd0ddaf4477c Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 30 Aug 2025 10:04:49 +0300 Subject: [PATCH 2/4] Add a NEWS entry. --- .../next/Library/2025-08-30-10-04-28.gh-issue-138247.yh_vDc.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2025-08-30-10-04-28.gh-issue-138247.yh_vDc.rst diff --git a/Misc/NEWS.d/next/Library/2025-08-30-10-04-28.gh-issue-138247.yh_vDc.rst b/Misc/NEWS.d/next/Library/2025-08-30-10-04-28.gh-issue-138247.yh_vDc.rst new file mode 100644 index 00000000000000..1365b1bfdf28f6 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-08-30-10-04-28.gh-issue-138247.yh_vDc.rst @@ -0,0 +1 @@ +Fix :func:`locale.strxfrm` on Solaris (and possibly other platforms). From 9aa3c9dd4ddbb98b6e222700fd0d5772ea6ba748 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 2 Sep 2025 09:56:16 +0300 Subject: [PATCH 3/4] Move to gh-60462. 
--- ...7.yh_vDc.rst => 2025-08-30-10-04-28.gh-issue-60462.yh_vDc.rst} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Misc/NEWS.d/next/Library/{2025-08-30-10-04-28.gh-issue-138247.yh_vDc.rst => 2025-08-30-10-04-28.gh-issue-60462.yh_vDc.rst} (100%) diff --git a/Misc/NEWS.d/next/Library/2025-08-30-10-04-28.gh-issue-138247.yh_vDc.rst b/Misc/NEWS.d/next/Library/2025-08-30-10-04-28.gh-issue-60462.yh_vDc.rst similarity index 100% rename from Misc/NEWS.d/next/Library/2025-08-30-10-04-28.gh-issue-138247.yh_vDc.rst rename to Misc/NEWS.d/next/Library/2025-08-30-10-04-28.gh-issue-60462.yh_vDc.rst From 349c3b2a7245addb613010af5d89ec799ecf0369 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 3 Sep 2025 10:32:45 +0300 Subject: [PATCH 4/4] Ensure that it works for signed wchar_t. --- Modules/_localemodule.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Modules/_localemodule.c b/Modules/_localemodule.c index 82b70dab857bdc..e86d5b17d1759d 100644 --- a/Modules/_localemodule.c +++ b/Modules/_localemodule.c @@ -501,7 +501,7 @@ _locale_strxfrm_impl(PyObject *module, PyObject *str) */ size_t n3 = 0; for (size_t i = 0; i < n2; i++) { - if (buf[i] > 0x10000) { + if ((Py_UCS4)buf[i] > 0x10000u) { n3++; } } @@ -514,10 +514,10 @@ _locale_strxfrm_impl(PyObject *module, PyObject *str) } size_t j = 0; for (size_t i = 0; i < n2; i++) { - wchar_t c = buf[i]; - if (c > 0x10000) { - buf2[j++] = (((Py_UCS4)c) >> 16) | 0x10000u; - buf2[j++] = ((Py_UCS4)c) & 0xffffu; + Py_UCS4 c = (Py_UCS4)buf[i]; + if (c > 0x10000u) { + buf2[j++] = (c >> 16) | 0x10000u; + buf2[j++] = c & 0xFFFFu; } else { buf2[j++] = c;