From 5c44acae940a9c9e9bf790d3138f673a085e4ce4 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 1 Jul 2025 15:28:11 +0200 Subject: [PATCH 01/19] gh-76535: Make `PyUnicode_ToLowerFull` and friends public Make `PyUnicode_ToLowerFull`, `PyUnicode_ToUpperFull` and `PyUnicode_ToTitleFull` public and rename them to `PyUnicode_ToLower` etc. --- Doc/c-api/unicode.rst | 30 ++++++++++++++++++ Include/cpython/unicodeobject.h | 15 +++++++++ Include/internal/pycore_unicodeobject.h | 3 -- Objects/unicodectype.c | 42 +++++++++++++++++-------- Objects/unicodeobject.c | 10 +++--- 5 files changed, 79 insertions(+), 21 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 22b0a6aff6e02e..4ff37b9803e579 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -307,6 +307,36 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. +.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer) + + Convert *ch* to lower case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be lower cased + (maximum three), and return the number of characters stored. + Passing a ``NULL`` buffer returns the buffer size needed. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer) + + Convert *ch* to lower case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be lower cased + (maximum three), and return the number of characters stored. + Passing a ``NULL`` buffer returns the buffer size needed. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer) + + Convert *ch* to lower case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be lower cased + (maximum three), and return the number of characters stored. + Passing a ``NULL`` buffer returns the buffer size needed. + + .. versionadded:: next + + These APIs can be used to work with surrogates: .. c:function:: int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 73e3bc44d6c9ca..3520ba4b1714e3 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -733,6 +733,21 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); +PyAPI_FUNC(int) PyUnicode_ToLower( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res /* Output buffer */ + ); + +PyAPI_FUNC(int) PyUnicode_ToUpper( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res /* Output buffer */ + ); + +PyAPI_FUNC(int) PyUnicode_ToTitle( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res /* Output buffer */ + ); + // Helper array used by Py_UNICODE_ISSPACE(). PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index c85c01da89a2ff..d3c759ef81fd52 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -15,9 +15,6 @@ extern "C" { extern int _PyUnicode_IsXidStart(Py_UCS4 ch); extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); -extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); extern int _PyUnicode_IsCased(Py_UCS4 ch); diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 7cd0dca3d13545..9f10c02f67fd1a 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -198,7 +198,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } -int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -206,15 +206,21 @@ int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) int index = ctype->lower & 0xFFFF; int n = ctype->lower >> 24; int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; + for (i = 0; i < n; i++) { + if (res != NULL) { + res[i] = _PyUnicode_ExtendedCase[index + i]; + } + } return n; } - res[0] = ch + ctype->lower; + + if (res != NULL) { + res[0] = ch + ctype->lower; + } return 1; } -int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -222,15 +228,20 @@ int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) int index = ctype->title & 0xFFFF; int n = ctype->title >> 24; int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; + for (i = 0; i < n; i++) { + if (res != NULL) { + res[i] = _PyUnicode_ExtendedCase[index + i]; + } + } return n; } - res[0] = ch + ctype->title; + if (res != NULL) { + res[0] = ch + ctype->title; + } return 1; } -int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -238,11 +249,16 @@ int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) int index = ctype->upper & 0xFFFF; int n = ctype->upper >> 24; int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; + for (i = 0; i < n; i++) { + if (res != NULL) { + res[i] = _PyUnicode_ExtendedCase[index + i]; + } + } return n; } - res[0] = ch + ctype->upper; + if (res != NULL) { + res[0] = ch + ctype->upper; + } return 1; } @@ -258,7 +274,7 @@ int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) res[i] = _PyUnicode_ExtendedCase[index + i]; return n; } - return _PyUnicode_ToLowerFull(ch, res); + return PyUnicode_ToLowerFull(ch, res); } int _PyUnicode_IsCased(Py_UCS4 ch) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5f6384afd1b209..573e9588fe195d 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10003,7 +10003,7 @@ lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, mapped[0] = handle_capital_sigma(kind, data, length, i); return 1; } - return _PyUnicode_ToLowerFull(c, mapped); + return PyUnicode_ToLower(c, mapped); } static Py_ssize_t @@ -10014,7 +10014,7 @@ do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UC Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); - n_res = _PyUnicode_ToTitleFull(c, mapped); + n_res = PyUnicode_ToTitle(c, mapped); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10041,7 +10041,7 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 n_res = lower_ucs4(kind, data, length, i, c, mapped); } else if (Py_UNICODE_ISLOWER(c)) { - n_res = _PyUnicode_ToUpperFull(c, mapped); + n_res = PyUnicode_ToUpper(c, mapped); } else { n_res = 1; @@ -10067,7 +10067,7 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, if (lower) n_res = lower_ucs4(kind, data, length, i, c, mapped); else - n_res = _PyUnicode_ToUpperFull(c, mapped); + n_res = PyUnicode_ToUpper(c, mapped); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10120,7 +10120,7 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m if (previous_is_cased) n_res = lower_ucs4(kind, data, length, i, c, mapped); else - n_res = _PyUnicode_ToTitleFull(c, mapped); + n_res = PyUnicode_ToTitle(c, mapped); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); From b8384aebe1364e17edc1fa63d24ba24d872107a2 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 1 Jul 2025 16:52:21 +0200 Subject: [PATCH 02/19] Address feedback; add size parameter and do PyUnicode_ToFolded as well --- Doc/c-api/unicode.rst | 43 +++++++++++++++-------- Include/cpython/unicodeobject.h | 16 +++++++-- Include/internal/pycore_unicodeobject.h | 1 - Objects/unicodectype.c | 45 +++++++++++++++++++++---- Objects/unicodeobject.c | 22 ++++++------ 5 files changed, 91 insertions(+), 36 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 4ff37b9803e579..0f762d93a8d897 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -307,36 +307,51 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. -.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer) +.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, int size) Convert *ch* to lower case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be lower cased - (maximum three), and return the number of characters stored. - Passing a ``NULL`` buffer returns the buffer size needed. + able to hold as many characters needed for *ch* to be lower cased, and + return the number of characters stored. Passing a ``NULL`` buffer returns + the buffer size needed. If at some point a buffer overflow is detected, + an :exc:`OverflowError` is raised and ``-1`` is returned. .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer) +.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, int size) - Convert *ch* to lower case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be lower cased - (maximum three), and return the number of characters stored. - Passing a ``NULL`` buffer returns the buffer size needed. + Convert *ch* to upper case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be upper cased, and + return the number of characters stored. Passing a ``NULL`` buffer returns + the buffer size needed. If at some point a buffer overflow is detected, + an :exc:`OverflowError` is raised and ``-1`` is returned. .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer) +.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, int size) - Convert *ch* to lower case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be lower cased - (maximum three), and return the number of characters stored. - Passing a ``NULL`` buffer returns the buffer size needed. + Convert *ch* to title case, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be title cased, and + return the number of characters stored. Passing a ``NULL`` buffer returns + the buffer size needed. If at some point a buffer overflow is detected, + an :exc:`OverflowError` is raised and ``-1`` is returned. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, int size) + + Foldcase *ch*, store result in *buffer*, which should be + able to hold as many characters needed for *ch* to be foldcased, and + return the number of characters stored. Passing a ``NULL`` buffer returns + the buffer size needed. If at some point a buffer overflow is detected, + an :exc:`OverflowError` is raised and ``-1`` is returned. .. versionadded:: next + These APIs can be used to work with surrogates: .. c:function:: int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 3520ba4b1714e3..c4066ed1df94d6 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -735,19 +735,29 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( PyAPI_FUNC(int) PyUnicode_ToLower( Py_UCS4 ch, /* Unicode character */ - Py_UCS4 *res /* Output buffer */ + Py_UCS4 *res, /* Output buffer */ + int size /* Buffer size */ ); PyAPI_FUNC(int) PyUnicode_ToUpper( Py_UCS4 ch, /* Unicode character */ - Py_UCS4 *res /* Output buffer */ + Py_UCS4 *res, /* Output buffer */ + int size /* Buffer size */ ); PyAPI_FUNC(int) PyUnicode_ToTitle( Py_UCS4 ch, /* Unicode character */ - Py_UCS4 *res /* Output buffer */ + Py_UCS4 *res, /* Output buffer */ + int size /* Buffer size */ ); +PyAPI_FUNC(int) PyUnicode_ToFolded( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + int size /* Buffer size */ + ); + + // Helper array used by Py_UNICODE_ISSPACE(). PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index d3c759ef81fd52..bd1a526f572216 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -15,7 +15,6 @@ extern "C" { extern int _PyUnicode_IsXidStart(Py_UCS4 ch); extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); -extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); extern int _PyUnicode_IsCased(Py_UCS4 ch); diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 9f10c02f67fd1a..2ef667c30a1690 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -198,7 +198,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } -int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -208,6 +208,10 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res) int i; for (i = 0; i < n; i++) { if (res != NULL) { + if (i >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[i] = _PyUnicode_ExtendedCase[index + i]; } } @@ -215,12 +219,16 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res) } if (res != NULL) { + if (0 >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->lower; } return 1; } -int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -230,18 +238,26 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res) int i; for (i = 0; i < n; i++) { if (res != NULL) { + if (i >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[i] = _PyUnicode_ExtendedCase[index + i]; } } return n; } if (res != NULL) { + if (0 >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->title; } return 1; } -int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -251,18 +267,26 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res) int i; for (i = 0; i < n; i++) { if (res != NULL) { + if (i >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[i] = _PyUnicode_ExtendedCase[index + i]; } } return n; } if (res != NULL) { + if (0 >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->upper; } return 1; } -int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) +int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, int size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -270,11 +294,18 @@ int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); int n = (ctype->lower >> 20) & 7; int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; + for (i = 0; i < n; i++) { + if (res != NULL) { + if (i >= size) { + PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + return -1; + } + res[i] = _PyUnicode_ExtendedCase[index + i]; + } + } return n; } - return PyUnicode_ToLowerFull(ch, res); + return PyUnicode_ToLower(ch, res, size); } int _PyUnicode_IsCased(Py_UCS4 ch) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 573e9588fe195d..a67cac47341134 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9996,14 +9996,14 @@ handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i static int lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, - Py_UCS4 c, Py_UCS4 *mapped) + Py_UCS4 c, Py_UCS4 *mapped, int mapped_size) { /* Obscure special case. */ if (c == 0x3A3) { mapped[0] = handle_capital_sigma(kind, data, length, i); return 1; } - return PyUnicode_ToLower(c, mapped); + return PyUnicode_ToLower(c, mapped, mapped_size); } static Py_ssize_t @@ -10014,14 +10014,14 @@ do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UC Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); - n_res = PyUnicode_ToTitle(c, mapped); + n_res = PyUnicode_ToTitle(c, mapped, 3); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; } for (i = 1; i < length; i++) { c = PyUnicode_READ(kind, data, i); - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10038,10 +10038,10 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; int n_res, j; if (Py_UNICODE_ISUPPER(c)) { - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); } else if (Py_UNICODE_ISLOWER(c)) { - n_res = PyUnicode_ToUpper(c, mapped); + n_res = PyUnicode_ToUpper(c, mapped, 3); } else { n_res = 1; @@ -10065,9 +10065,9 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; int n_res, j; if (lower) - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); else - n_res = PyUnicode_ToUpper(c, mapped); + n_res = PyUnicode_ToUpper(c, mapped, 3); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10096,7 +10096,7 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); + int j, n_res = PyUnicode_ToFolded(c, mapped, 3); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10118,9 +10118,9 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m int n_res, j; if (previous_is_cased) - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); else - n_res = PyUnicode_ToTitle(c, mapped); + n_res = PyUnicode_ToTitle(c, mapped, 3); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); From ee4b7073fcbf4d4cfb392e53737d31010f9294ed Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:56:46 +0000 Subject: [PATCH 03/19] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst diff --git a/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst new file mode 100644 index 00000000000000..65b5c45a33a895 --- /dev/null +++ b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst @@ -0,0 +1 @@ +Make :c:func:`PyUnicode_ToLower`, :c:func:`PyUnicode_ToUpper`, :c:func:`PyUnicode_ToTitle` and :c:func:`PyUnicode_ToFolded` public. From 82d0bcba8ac2dad47288f5892031b14e7eafcc24 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 1 Jul 2025 17:29:35 +0200 Subject: [PATCH 04/19] Address more feedback; assert return value and raise ValueError --- Doc/c-api/unicode.rst | 8 ++++---- Objects/unicodectype.c | 14 +++++++------- Objects/unicodeobject.c | 25 +++++++++++++++---------- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 0f762d93a8d897..84bceaa601783c 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -313,7 +313,7 @@ These APIs can be used for fast direct character conversions: able to hold as many characters needed for *ch* to be lower cased, and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`OverflowError` is raised and ``-1`` is returned. + an :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next @@ -324,7 +324,7 @@ These APIs can be used for fast direct character conversions: able to hold as many characters needed for *ch* to be upper cased, and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`OverflowError` is raised and ``-1`` is returned. + an :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next @@ -335,7 +335,7 @@ These APIs can be used for fast direct character conversions: able to hold as many characters needed for *ch* to be title cased, and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`OverflowError` is raised and ``-1`` is returned. + an :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next @@ -346,7 +346,7 @@ These APIs can be used for fast direct character conversions: able to hold as many characters needed for *ch* to be foldcased, and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`OverflowError` is raised and ``-1`` is returned. + an :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 2ef667c30a1690..66a7d9d85e67cd 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -209,7 +209,7 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) for (i = 0; i < n; i++) { if (res != NULL) { if (i >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[i] = _PyUnicode_ExtendedCase[index + i]; @@ -220,7 +220,7 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) if (res != NULL) { if (0 >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[0] = ch + ctype->lower; @@ -239,7 +239,7 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) for (i = 0; i < n; i++) { if (res != NULL) { if (i >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[i] = _PyUnicode_ExtendedCase[index + i]; @@ -249,7 +249,7 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) } if (res != NULL) { if (0 >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[0] = ch + ctype->title; @@ -268,7 +268,7 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) for (i = 0; i < n; i++) { if (res != NULL) { if (i >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[i] = _PyUnicode_ExtendedCase[index + i]; @@ -278,7 +278,7 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) } if (res != NULL) { if (0 >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[0] = ch + ctype->upper; @@ -297,7 +297,7 @@ int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, int size) for (i = 0; i < n; i++) { if (res != NULL) { if (i >= size) { - PyErr_SetString(PyExc_OverflowError, "output buffer is too small"); + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } res[i] = _PyUnicode_ExtendedCase[index + i]; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index a67cac47341134..bf60d58c036591 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10014,14 +10014,16 @@ do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UC Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); - n_res = PyUnicode_ToTitle(c, mapped, 3); + n_res = PyUnicode_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; } for (i = 1; i < length; i++) { c = PyUnicode_READ(kind, data, i); - n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10038,15 +10040,16 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; int n_res, j; if (Py_UNICODE_ISUPPER(c)) { - n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); } else if (Py_UNICODE_ISLOWER(c)) { - n_res = PyUnicode_ToUpper(c, mapped, 3); + n_res = PyUnicode_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); } else { n_res = 1; mapped[0] = c; } + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10065,9 +10068,10 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; int n_res, j; if (lower) - n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = PyUnicode_ToUpper(c, mapped, 3); + n_res = PyUnicode_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10096,7 +10100,8 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int j, n_res = PyUnicode_ToFolded(c, mapped, 3); + int j, n_res = PyUnicode_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10118,10 +10123,10 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m int n_res, j; if (previous_is_cased) - n_res = lower_ucs4(kind, data, length, i, c, mapped, 3); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = PyUnicode_ToTitle(c, mapped, 3); - + n_res = PyUnicode_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; From 10c282d2688b5ff5d68be468869b51288993dfc9 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 1 Jul 2025 18:32:04 +0200 Subject: [PATCH 05/19] Add tests --- Lib/test/test_capi/test_unicode.py | 49 +++++++++++++ Modules/_testcapi/unicode.c | 110 +++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 6a9c60f3a6d75e..2f9a2e0b8b5b51 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1753,6 +1753,55 @@ def test_GET_CACHED_HASH(self): # impl detail: ASCII string hashes are equal to bytes ones self.assertEqual(unicode_GET_CACHED_HASH(obj), hash(content_bytes)) + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_tolower(self): + import string + from _testcapi import unicode_tolower + + for i, c in enumerate(string.ascii_uppercase): + with self.subTest(c): + self.assertEqual(unicode_tolower(c), string.ascii_lowercase[i]) + + # Test unicode character + self.assertEqual(unicode_tolower("Č"), "č") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_toupper(self): + import string + from _testcapi import unicode_toupper + + for i, c in enumerate(string.ascii_lowercase): + with self.subTest(c): + self.assertEqual(unicode_toupper(c), string.ascii_uppercase[i]) + + # Test unicode character + self.assertEqual(unicode_toupper("č"), "Č") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_totitle(self): + from _testcapi import unicode_totitle + + self.assertEqual(unicode_totitle("t"), "T") + + # Test unicode character + self.assertEqual(unicode_totitle("ł"), "Ł") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_tofolded(self): + from _testcapi import unicode_tofolded + + self.assertEqual(unicode_tofolded("T"), "t") + + # Test unicode character + self.assertEqual(unicode_tofolded("Ł"), "ł") + + # Test case-ignorable character + self.assertEqual(unicode_tofolded("👍"), "👍") + class PyUnicodeWriterTest(unittest.TestCase): def create_writer(self, size): diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 203282dd53dd0a..01c4caef6e2a01 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -220,6 +220,112 @@ unicode_copycharacters(PyObject *self, PyObject *args) return Py_BuildValue("(Nn)", to_copy, copied); } +/* Test PyUnicode_ToLower() */ +static PyObject * +unicode_tolower(PyObject *self, PyObject *arg) +{ + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_SetString(PyExc_ValueError, "unicode_tolower only accepts 1-character strings"); + return NULL; + } + + Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); + + Py_UCS4 lower[3]; + int chars = PyUnicode_ToLower(c, lower, Py_ARRAY_LENGTH(lower)); + assert(chars >= 1); + + PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_WriteUCS4(writer, lower, chars) < 0) { + PyUnicodeWriter_Discard(writer); + return NULL; + } + return PyUnicodeWriter_Finish(writer); +} + +/* Test PyUnicode_ToUpper() */ +static PyObject * +unicode_toupper(PyObject *self, PyObject *arg) +{ + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_SetString(PyExc_ValueError, "unicode_toupper only accepts 1-character strings"); + return NULL; + } + + Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); + + Py_UCS4 upper[3]; + int chars = PyUnicode_ToUpper(c, upper, Py_ARRAY_LENGTH(upper)); + assert(chars >= 1); + + PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_WriteUCS4(writer, upper, chars) < 0) { + PyUnicodeWriter_Discard(writer); + return NULL; + } + return PyUnicodeWriter_Finish(writer); +} + + +/* Test PyUnicode_ToLower() */ +static PyObject * +unicode_totitle(PyObject *self, PyObject *arg) +{ + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_SetString(PyExc_ValueError, "unicode_totitle only accepts 1-character strings"); + return NULL; + } + + Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); + + Py_UCS4 title[3]; + int chars = PyUnicode_ToTitle(c, title, Py_ARRAY_LENGTH(title)); + assert(chars >= 1); + + PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_WriteUCS4(writer, title, chars) < 0) { + PyUnicodeWriter_Discard(writer); + return NULL; + } + return PyUnicodeWriter_Finish(writer); +} + +/* Test PyUnicode_ToLower() */ +static PyObject * +unicode_tofolded(PyObject *self, PyObject *arg) +{ + if (PyUnicode_GET_LENGTH(arg) != 1) { + PyErr_SetString(PyExc_ValueError, "unicode_tofolded only accepts 1-character strings"); + return NULL; + } + + Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); + + Py_UCS4 folded[3]; + int chars = PyUnicode_ToFolded(c, folded, Py_ARRAY_LENGTH(folded)); + assert(chars >= 1); + + PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); + if (writer == NULL) { + return NULL; + } + if (PyUnicodeWriter_WriteUCS4(writer, folded, chars) < 0) { + PyUnicodeWriter_Discard(writer); + return NULL; + } + return PyUnicodeWriter_Finish(writer); +} + + static PyObject* unicode_GET_CACHED_HASH(PyObject *self, PyObject *arg) { @@ -577,6 +683,10 @@ static PyMethodDef TestMethods[] = { {"unicode_asutf8", unicode_asutf8, METH_VARARGS}, {"unicode_copycharacters", unicode_copycharacters, METH_VARARGS}, {"unicode_GET_CACHED_HASH", unicode_GET_CACHED_HASH, METH_O}, + {"unicode_tolower", unicode_tolower, METH_O}, + {"unicode_toupper", unicode_toupper, METH_O}, + {"unicode_totitle", unicode_totitle, METH_O}, + {"unicode_tofolded", unicode_tofolded, METH_O}, {NULL}, }; From d7ed1723f9c075c5b2294fed10b8383c2ffeb63b Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 1 Jul 2025 18:38:41 +0200 Subject: [PATCH 06/19] Document the maximum numbers of characters needed in the buffer --- Doc/c-api/unicode.rst | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 84bceaa601783c..56d093343fbfdd 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -310,7 +310,8 @@ These APIs can be used for fast direct character conversions: .. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, int size) Convert *ch* to lower case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be lower cased, and + able to hold as many characters needed for *ch* to be lower cased + (e.g. a maximum of two character for Unicode 16.0), and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. @@ -321,7 +322,8 @@ These APIs can be used for fast direct character conversions: .. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, int size) Convert *ch* to upper case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be upper cased, and + able to hold as many characters needed for *ch* to be upper cased + (e.g. a maximum of three character for Unicode 16.0), and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. @@ -332,7 +334,8 @@ These APIs can be used for fast direct character conversions: .. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, int size) Convert *ch* to title case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be title cased, and + able to hold as many characters needed for *ch* to be title cased + (e.g. a maximum of three character for Unicode 16.0), and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. @@ -343,7 +346,8 @@ These APIs can be used for fast direct character conversions: .. c:function:: Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, int size) Foldcase *ch*, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be foldcased, and + able to hold as many characters needed for *ch* to be foldcased + (e.g. a maximum of three character for Unicode 16.0), and return the number of characters stored. Passing a ``NULL`` buffer returns the buffer size needed. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. From 25f1cd808c8ecb2df6a11d209220d7a3bc2c66a0 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 13:56:02 +0200 Subject: [PATCH 07/19] Address feedback; test more characters and refactor _testcapi functions --- Lib/test/test_capi/test_unicode.py | 6 ++ Modules/_testcapi/unicode.c | 90 +++++++----------------------- 2 files changed, 27 insertions(+), 69 deletions(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 2f9a2e0b8b5b51..931ce47ed2911e 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1765,6 +1765,7 @@ def test_tolower(self): # Test unicode character self.assertEqual(unicode_tolower("Č"), "č") + self.assertEqual(unicode_tolower("Σ"), "σ") @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') @@ -1778,6 +1779,8 @@ def test_toupper(self): # Test unicode character self.assertEqual(unicode_toupper("č"), "Č") + self.assertEqual(unicode_toupper("ß"), "SS") + self.assertEqual(unicode_toupper("ΐ"), "Ϊ́") @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') @@ -1788,6 +1791,8 @@ def test_totitle(self): # Test unicode character self.assertEqual(unicode_totitle("ł"), "Ł") + self.assertEqual(unicode_totitle("ß"), "Ss") + self.assertEqual(unicode_totitle("ΐ"), "Ϊ́") @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') @@ -1798,6 +1803,7 @@ def test_tofolded(self): # Test unicode character self.assertEqual(unicode_tofolded("Ł"), "ł") + self.assertEqual(unicode_tofolded("Σ"), "σ") # Test case-ignorable character self.assertEqual(unicode_tofolded("👍"), "👍") diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 01c4caef6e2a01..9959a7c613da48 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -220,56 +220,46 @@ unicode_copycharacters(PyObject *self, PyObject *args) return Py_BuildValue("(Nn)", to_copy, copied); } -/* Test PyUnicode_ToLower() */ static PyObject * -unicode_tolower(PyObject *self, PyObject *arg) +unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int), const char *name) { - if (PyUnicode_GET_LENGTH(arg) != 1) { - PyErr_SetString(PyExc_ValueError, "unicode_tolower only accepts 1-character strings"); + if (PyUnicode_GET_LENGTH(str) != 1) { + PyErr_Format(PyExc_ValueError, "%s only accepts 1-character strings", name); return NULL; } - Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); + Py_UCS4 c = PyUnicode_READ_CHAR(str, 0); - Py_UCS4 lower[3]; - int chars = PyUnicode_ToLower(c, lower, Py_ARRAY_LENGTH(lower)); - assert(chars >= 1); + Py_UCS4 buf[3]; + int chars = function(c, buf, Py_ARRAY_LENGTH(buf)); + if (chars <= 0) { + PyErr_BadInternalCall(); + return NULL; + } PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); if (writer == NULL) { return NULL; } - if (PyUnicodeWriter_WriteUCS4(writer, lower, chars) < 0) { + if (PyUnicodeWriter_WriteUCS4(writer, buf, chars) < 0) { PyUnicodeWriter_Discard(writer); return NULL; } return PyUnicodeWriter_Finish(writer); } +/* Test PyUnicode_ToLower() */ +static PyObject * +unicode_tolower(PyObject *self, PyObject *arg) +{ + return unicode_case_operation(arg, PyUnicode_ToLower, "unicode_tolower"); +} + /* Test PyUnicode_ToUpper() */ static PyObject * unicode_toupper(PyObject *self, PyObject *arg) { - if (PyUnicode_GET_LENGTH(arg) != 1) { - PyErr_SetString(PyExc_ValueError, "unicode_toupper only accepts 1-character strings"); - return NULL; - } - - Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); - - Py_UCS4 upper[3]; - int chars = PyUnicode_ToUpper(c, upper, Py_ARRAY_LENGTH(upper)); - assert(chars >= 1); - - PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); - if (writer == NULL) { - return NULL; - } - if (PyUnicodeWriter_WriteUCS4(writer, upper, chars) < 0) { - PyUnicodeWriter_Discard(writer); - return NULL; - } - return PyUnicodeWriter_Finish(writer); + return unicode_case_operation(arg, PyUnicode_ToUpper, "unicode_toupper"); } @@ -277,52 +267,14 @@ unicode_toupper(PyObject *self, PyObject *arg) static PyObject * unicode_totitle(PyObject *self, PyObject *arg) { - if (PyUnicode_GET_LENGTH(arg) != 1) { - PyErr_SetString(PyExc_ValueError, "unicode_totitle only accepts 1-character strings"); - return NULL; - } - - Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); - - Py_UCS4 title[3]; - int chars = PyUnicode_ToTitle(c, title, Py_ARRAY_LENGTH(title)); - assert(chars >= 1); - - PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); - if (writer == NULL) { - return NULL; - } - if (PyUnicodeWriter_WriteUCS4(writer, title, chars) < 0) { - PyUnicodeWriter_Discard(writer); - return NULL; - } - return PyUnicodeWriter_Finish(writer); + return unicode_case_operation(arg, PyUnicode_ToTitle, "unicode_totitle"); } /* Test PyUnicode_ToLower() */ static PyObject * unicode_tofolded(PyObject *self, PyObject *arg) { - if (PyUnicode_GET_LENGTH(arg) != 1) { - PyErr_SetString(PyExc_ValueError, "unicode_tofolded only accepts 1-character strings"); - return NULL; - } - - Py_UCS4 c = PyUnicode_READ_CHAR(arg, 0); - - Py_UCS4 folded[3]; - int chars = PyUnicode_ToFolded(c, folded, Py_ARRAY_LENGTH(folded)); - assert(chars >= 1); - - PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); - if (writer == NULL) { - return NULL; - } - if (PyUnicodeWriter_WriteUCS4(writer, folded, chars) < 0) { - PyUnicodeWriter_Discard(writer); - return NULL; - } - return PyUnicodeWriter_Finish(writer); + return unicode_case_operation(arg, PyUnicode_ToFolded, "unicode_tofolded"); } From 5979fdb6b058c55f73372aab69da9f5a8a59051d Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 14:14:03 +0200 Subject: [PATCH 08/19] Address more review comments --- Modules/_testcapi/unicode.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 9959a7c613da48..057bc3b7a6f1c9 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -221,10 +221,15 @@ unicode_copycharacters(PyObject *self, PyObject *args) } static PyObject * -unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int), const char *name) +unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int)) { + if (!PyUnicode_Check(str)) { + PyErr_Format(PyExc_TypeError, "expect str type, got %T", str); + return NULL; + } + if (PyUnicode_GET_LENGTH(str) != 1) { - PyErr_Format(PyExc_ValueError, "%s only accepts 1-character strings", name); + PyErr_SetString(PyExc_ValueError, "expecting 1-character strings only"); return NULL; } @@ -233,33 +238,24 @@ unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int), Py_UCS4 buf[3]; int chars = function(c, buf, Py_ARRAY_LENGTH(buf)); if (chars <= 0) { - PyErr_BadInternalCall(); return NULL; } - PyUnicodeWriter *writer = PyUnicodeWriter_Create(1); - if (writer == NULL) { - return NULL; - } - if (PyUnicodeWriter_WriteUCS4(writer, buf, chars) < 0) { - PyUnicodeWriter_Discard(writer); - return NULL; - } - return PyUnicodeWriter_Finish(writer); + return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf, chars); } /* Test PyUnicode_ToLower() */ static PyObject * unicode_tolower(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToLower, "unicode_tolower"); + return unicode_case_operation(arg, PyUnicode_ToLower); } /* Test PyUnicode_ToUpper() */ static PyObject * unicode_toupper(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToUpper, "unicode_toupper"); + return unicode_case_operation(arg, PyUnicode_ToUpper); } @@ -267,14 +263,14 @@ unicode_toupper(PyObject *self, PyObject *arg) static PyObject * unicode_totitle(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToTitle, "unicode_totitle"); + return unicode_case_operation(arg, PyUnicode_ToTitle); } /* Test PyUnicode_ToLower() */ static PyObject * unicode_tofolded(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToFolded, "unicode_tofolded"); + return unicode_case_operation(arg, PyUnicode_ToFolded); } From 769d84ae96e77d722b78ae1981465fb615981ed0 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 14:19:05 +0200 Subject: [PATCH 09/19] Disallow passing NULL --- Doc/c-api/unicode.rst | 20 +++++------- Objects/unicodectype.c | 73 ++++++++++++++++++------------------------ 2 files changed, 39 insertions(+), 54 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 56d093343fbfdd..8e03ac7aa6851c 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -312,9 +312,8 @@ These APIs can be used for fast direct character conversions: Convert *ch* to lower case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be lower cased (e.g. a maximum of two character for Unicode 16.0), and - return the number of characters stored. Passing a ``NULL`` buffer returns - the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`ValueError` is raised and ``-1`` is returned. + return the number of characters stored. If at some point a buffer overflow + is detected, an :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next @@ -324,9 +323,8 @@ These APIs can be used for fast direct character conversions: Convert *ch* to upper case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be upper cased (e.g. a maximum of three character for Unicode 16.0), and - return the number of characters stored. Passing a ``NULL`` buffer returns - the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`ValueError` is raised and ``-1`` is returned. + return the number of characters stored. If at some point a buffer overflow + is detected, an :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next @@ -336,9 +334,8 @@ These APIs can be used for fast direct character conversions: Convert *ch* to title case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be title cased (e.g. a maximum of three character for Unicode 16.0), and - return the number of characters stored. Passing a ``NULL`` buffer returns - the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`ValueError` is raised and ``-1`` is returned. + return the number of characters stored. If at some point a buffer overflow + is detected, an :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next @@ -348,9 +345,8 @@ These APIs can be used for fast direct character conversions: Foldcase *ch*, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be foldcased (e.g. a maximum of three character for Unicode 16.0), and - return the number of characters stored. Passing a ``NULL`` buffer returns - the buffer size needed. If at some point a buffer overflow is detected, - an :exc:`ValueError` is raised and ``-1`` is returned. + return the number of characters stored. If at some point a buffer overflow + is detected, an :exc:`ValueError` is raised and ``-1`` is returned. .. versionadded:: next diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 66a7d9d85e67cd..ec0ae918b339ee 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -207,24 +207,20 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) int n = ctype->lower >> 24; int i; for (i = 0; i < n; i++) { - if (res != NULL) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[i] = _PyUnicode_ExtendedCase[index + i]; + if (i >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[i] = _PyUnicode_ExtendedCase[index + i]; } return n; } - if (res != NULL) { - if (0 >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[0] = ch + ctype->lower; + if (0 >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[0] = ch + ctype->lower; return 1; } @@ -237,23 +233,20 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) int n = ctype->title >> 24; int i; for (i = 0; i < n; i++) { - if (res != NULL) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[i] = _PyUnicode_ExtendedCase[index + i]; + if (i >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[i] = _PyUnicode_ExtendedCase[index + i]; } return n; } - if (res != NULL) { - if (0 >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[0] = ch + ctype->title; + + if (0 >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[0] = ch + ctype->title; return 1; } @@ -266,23 +259,20 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) int n = ctype->upper >> 24; int i; for (i = 0; i < n; i++) { - if (res != NULL) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[i] = _PyUnicode_ExtendedCase[index + i]; + if (i >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[i] = _PyUnicode_ExtendedCase[index + i]; } return n; } - if (res != NULL) { - if (0 >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[0] = ch + ctype->upper; + + if (0 >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[0] = ch + ctype->upper; return 1; } @@ -295,16 +285,15 @@ int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, int size) int n = (ctype->lower >> 20) & 7; int i; for (i = 0; i < n; i++) { - if (res != NULL) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } - res[i] = _PyUnicode_ExtendedCase[index + i]; + if (i >= size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; } + res[i] = _PyUnicode_ExtendedCase[index + i]; } return n; } + return PyUnicode_ToLower(ch, res, size); } From 625ad47608770cd44ecbb64ac7042c43bba2807c Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 14:29:10 +0200 Subject: [PATCH 10/19] Only return NULL when chars < 0 in C test functions Co-authored-by: Victor Stinner --- Modules/_testcapi/unicode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 057bc3b7a6f1c9..cb1e2df5739211 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -237,7 +237,7 @@ unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int)) Py_UCS4 buf[3]; int chars = function(c, buf, Py_ARRAY_LENGTH(buf)); - if (chars <= 0) { + if (chars < 0) { return NULL; } From 3008eb6c5666d1bc0c67c392d2858f326a1f98de Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 15:00:00 +0200 Subject: [PATCH 11/19] Use Py_ssize_t and don't check overflow in loop --- Doc/c-api/unicode.rst | 8 ++--- Include/cpython/unicodeobject.h | 16 ++++----- Modules/_testcapi/unicode.c | 4 +-- Objects/unicodectype.c | 62 ++++++++++++++++----------------- 4 files changed, 45 insertions(+), 45 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 8e03ac7aa6851c..3d57cc390f81e0 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -307,7 +307,7 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. -.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, int size) +.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to lower case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be lower cased @@ -318,7 +318,7 @@ These APIs can be used for fast direct character conversions: .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, int size) +.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to upper case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be upper cased @@ -329,7 +329,7 @@ These APIs can be used for fast direct character conversions: .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, int size) +.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to title case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be title cased @@ -340,7 +340,7 @@ These APIs can be used for fast direct character conversions: .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, int size) +.. c:function:: Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Foldcase *ch*, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be foldcased diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index c4066ed1df94d6..725937af7afff7 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -733,28 +733,28 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); -PyAPI_FUNC(int) PyUnicode_ToLower( +PyAPI_FUNC(Py_ssize_t) PyUnicode_ToLower( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ - int size /* Buffer size */ + Py_ssize_t size /* Buffer size */ ); -PyAPI_FUNC(int) PyUnicode_ToUpper( +PyAPI_FUNC(Py_ssize_t) PyUnicode_ToUpper( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ - int size /* Buffer size */ + Py_ssize_t size /* Buffer size */ ); -PyAPI_FUNC(int) PyUnicode_ToTitle( +PyAPI_FUNC(Py_ssize_t) PyUnicode_ToTitle( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ - int size /* Buffer size */ + Py_ssize_t size /* Buffer size */ ); -PyAPI_FUNC(int) PyUnicode_ToFolded( +PyAPI_FUNC(Py_ssize_t) PyUnicode_ToFolded( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ - int size /* Buffer size */ + Py_ssize_t size /* Buffer size */ ); diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index cb1e2df5739211..21f6c0f62f11f5 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -221,7 +221,7 @@ unicode_copycharacters(PyObject *self, PyObject *args) } static PyObject * -unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int)) +unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_ssize_t)) { if (!PyUnicode_Check(str)) { PyErr_Format(PyExc_TypeError, "expect str type, got %T", str); @@ -236,7 +236,7 @@ unicode_case_operation(PyObject *str, int (*function)(Py_UCS4, Py_UCS4 *, int)) Py_UCS4 c = PyUnicode_READ_CHAR(str, 0); Py_UCS4 buf[3]; - int chars = function(c, buf, Py_ARRAY_LENGTH(buf)); + Py_ssize_t chars = function(c, buf, Py_ARRAY_LENGTH(buf)); if (chars < 0) { return NULL; } diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index ec0ae918b339ee..da70f60b12c450 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -198,25 +198,25 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } -int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) +Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->lower & 0xFFFF; int n = ctype->lower >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; - for (i = 0; i < n; i++) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } + for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; - } return n; } - if (0 >= size) { + if (size < 1) { PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } @@ -224,25 +224,25 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size) return 1; } -int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) +Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->title & 0xFFFF; int n = ctype->title >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; - for (i = 0; i < n; i++) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } + for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; - } return n; } - if (0 >= size) { + if (size < 1) { PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } @@ -250,25 +250,25 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size) return 1; } -int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) +Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->upper & 0xFFFF; int n = ctype->upper >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; - for (i = 0; i < n; i++) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } + for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; - } return n; } - if (0 >= size) { + if (size < 1) { PyErr_SetString(PyExc_ValueError, "output buffer is too small"); return -1; } @@ -276,21 +276,21 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size) return 1; } -int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, int size) +Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); int n = (ctype->lower >> 20) & 7; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; - for (i = 0; i < n; i++) { - if (i >= size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; - } + for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; - } return n; } From 4163898ed7fa77212946a8a4b205fbfc5f23683b Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 2 Jul 2025 19:59:13 +0200 Subject: [PATCH 12/19] Use Py_ssize_t for return value variable in unicodeobject.c --- Objects/unicodeobject.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index bf60d58c036591..50ccaac1bb6aea 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9994,9 +9994,9 @@ handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i return (final_sigma) ? 0x3C2 : 0x3C3; } -static int +static Py_ssize_t lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, - Py_UCS4 c, Py_UCS4 *mapped, int mapped_size) + Py_UCS4 c, Py_UCS4 *mapped, Py_ssize_t mapped_size) { /* Obscure special case. */ if (c == 0x3A3) { @@ -10009,8 +10009,7 @@ lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, static Py_ssize_t do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { - Py_ssize_t i, k = 0; - int n_res, j; + Py_ssize_t i, k = 0, n_res, j; Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); @@ -10038,7 +10037,7 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (Py_UNICODE_ISUPPER(c)) { n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); } @@ -10066,7 +10065,7 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (lower) n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else @@ -10100,7 +10099,7 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int j, n_res = PyUnicode_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); + Py_ssize_t j, n_res = PyUnicode_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); @@ -10120,7 +10119,7 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m for (i = 0; i < length; i++) { const Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (previous_is_cased) n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); From ce6a3a651f10fc25ffc5ec2e95b3d7190e0da784 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Sun, 27 Jul 2025 20:52:58 +0200 Subject: [PATCH 13/19] Address feedback; Rename to PyUCS4_*, define macro and test small buffer case --- Doc/c-api/unicode.rst | 8 ++--- Include/cpython/unicodeobject.h | 10 +++--- Lib/test/test_capi/test_unicode.py | 6 +++- ...5-07-01-14-56-41.gh-issue-76535.9cwObj.rst | 2 +- Modules/_testcapi/unicode.c | 35 +++++++++++++------ Objects/unicodectype.c | 10 +++--- Objects/unicodeobject.c | 12 +++---- 7 files changed, 51 insertions(+), 32 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 3d57cc390f81e0..e19ca45229b92e 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -307,7 +307,7 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. -.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) +.. c:function:: Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to lower case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be lower cased @@ -318,7 +318,7 @@ These APIs can be used for fast direct character conversions: .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) +.. c:function:: Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to upper case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be upper cased @@ -329,7 +329,7 @@ These APIs can be used for fast direct character conversions: .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) +.. c:function:: Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to title case, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be title cased @@ -340,7 +340,7 @@ These APIs can be used for fast direct character conversions: .. versionadded:: next -.. c:function:: Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) +.. c:function:: Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Foldcase *ch*, store result in *buffer*, which should be able to hold as many characters needed for *ch* to be foldcased diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 725937af7afff7..662e3f5ab06dcf 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -733,25 +733,25 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); -PyAPI_FUNC(Py_ssize_t) PyUnicode_ToLower( +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToLower( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ Py_ssize_t size /* Buffer size */ ); -PyAPI_FUNC(Py_ssize_t) PyUnicode_ToUpper( +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToUpper( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ Py_ssize_t size /* Buffer size */ ); -PyAPI_FUNC(Py_ssize_t) PyUnicode_ToTitle( +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToTitle( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ Py_ssize_t size /* Buffer size */ ); -PyAPI_FUNC(Py_ssize_t) PyUnicode_ToFolded( +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToFolded( Py_UCS4 ch, /* Unicode character */ Py_UCS4 *res, /* Output buffer */ Py_ssize_t size /* Buffer size */ @@ -792,6 +792,8 @@ static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) { #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) +#define PyUCS4_CASE_CONVERSION_BUFFER_SIZE 3 + static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) { return (Py_UNICODE_ISALPHA(ch) || Py_UNICODE_ISDECIMAL(ch) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 931ce47ed2911e..3a5d1a0053f351 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1771,7 +1771,7 @@ def test_tolower(self): @unittest.skipIf(_testcapi is None, 'need _testcapi module') def test_toupper(self): import string - from _testcapi import unicode_toupper + from _testcapi import unicode_toupper, unicode_toupper_buffer_too_small for i, c in enumerate(string.ascii_lowercase): with self.subTest(c): @@ -1782,6 +1782,10 @@ def test_toupper(self): self.assertEqual(unicode_toupper("ß"), "SS") self.assertEqual(unicode_toupper("ΐ"), "Ϊ́") + # Test unicode character with smaller buffer + with self.assertRaisesRegex(ValueError, "output buffer is too small"): + unicode_toupper_buffer_too_small("ß") + @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') def test_totitle(self): diff --git a/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst index 65b5c45a33a895..37d251b6e35d8f 100644 --- a/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst +++ b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst @@ -1 +1 @@ -Make :c:func:`PyUnicode_ToLower`, :c:func:`PyUnicode_ToUpper`, :c:func:`PyUnicode_ToTitle` and :c:func:`PyUnicode_ToFolded` public. +Make :c:func:`PyUCS4_ToLower`, :c:func:`PyUCS4_ToUpper`, :c:func:`PyUCS4_ToTitle` and :c:func:`PyUCS4_ToFolded` public. diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 21f6c0f62f11f5..c3106f0fcb8543 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -221,7 +221,8 @@ unicode_copycharacters(PyObject *self, PyObject *args) } static PyObject * -unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_ssize_t)) +unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_ssize_t), + Py_UCS4 *buf, Py_ssize_t size) { if (!PyUnicode_Check(str)) { PyErr_Format(PyExc_TypeError, "expect str type, got %T", str); @@ -235,8 +236,7 @@ unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_UCS4 c = PyUnicode_READ_CHAR(str, 0); - Py_UCS4 buf[3]; - Py_ssize_t chars = function(c, buf, Py_ARRAY_LENGTH(buf)); + Py_ssize_t chars = function(c, buf, size); if (chars < 0) { return NULL; } @@ -244,33 +244,45 @@ unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf, chars); } -/* Test PyUnicode_ToLower() */ +/* Test PyUCS4_ToLower() */ static PyObject * unicode_tolower(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToLower); + Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUCS4_ToLower, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); } -/* Test PyUnicode_ToUpper() */ + +/* Test PyUCS4_ToUpper() */ static PyObject * unicode_toupper(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToUpper); + Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUCS4_ToUpper, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); } +/* Test PyUCS4_ToUpper() with a small buffer */ +static PyObject * +unicode_toupper_buffer_too_small(PyObject *self, PyObject *arg) +{ + Py_UCS4 buf; + return unicode_case_operation(arg, PyUCS4_ToUpper, &buf, 1); +} -/* Test PyUnicode_ToLower() */ +/* Test PyUCS4_ToLower() */ static PyObject * unicode_totitle(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToTitle); + Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUCS4_ToTitle, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); } -/* Test PyUnicode_ToLower() */ +/* Test PyUCS4_ToLower() */ static PyObject * unicode_tofolded(PyObject *self, PyObject *arg) { - return unicode_case_operation(arg, PyUnicode_ToFolded); + Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUCS4_ToFolded, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); } @@ -633,6 +645,7 @@ static PyMethodDef TestMethods[] = { {"unicode_GET_CACHED_HASH", unicode_GET_CACHED_HASH, METH_O}, {"unicode_tolower", unicode_tolower, METH_O}, {"unicode_toupper", unicode_toupper, METH_O}, + {"unicode_toupper_buffer_too_small", unicode_toupper_buffer_too_small, METH_O}, {"unicode_totitle", unicode_totitle, METH_O}, {"unicode_tofolded", unicode_tofolded, METH_O}, {NULL}, diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index da70f60b12c450..aacfc316e2b960 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -198,7 +198,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } -Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) +Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -224,7 +224,7 @@ Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) return 1; } -Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) +Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -250,7 +250,7 @@ Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) return 1; } -Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) +Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -276,7 +276,7 @@ Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) return 1; } -Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) +Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); @@ -294,7 +294,7 @@ Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) return n; } - return PyUnicode_ToLower(ch, res, size); + return PyUCS4_ToLower(ch, res, size); } int _PyUnicode_IsCased(Py_UCS4 ch) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 50ccaac1bb6aea..57037c3e01ada8 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10003,7 +10003,7 @@ lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, mapped[0] = handle_capital_sigma(kind, data, length, i); return 1; } - return PyUnicode_ToLower(c, mapped, mapped_size); + return PyUCS4_ToLower(c, mapped, mapped_size); } static Py_ssize_t @@ -10013,7 +10013,7 @@ do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UC Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); - n_res = PyUnicode_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + n_res = PyUCS4_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); @@ -10042,7 +10042,7 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); } else if (Py_UNICODE_ISLOWER(c)) { - n_res = PyUnicode_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); + n_res = PyUCS4_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); } else { n_res = 1; @@ -10069,7 +10069,7 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, if (lower) n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = PyUnicode_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); + n_res = PyUCS4_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); @@ -10099,7 +10099,7 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - Py_ssize_t j, n_res = PyUnicode_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); + Py_ssize_t j, n_res = PyUCS4_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); @@ -10124,7 +10124,7 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m if (previous_is_cased) n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = PyUnicode_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + n_res = PyUCS4_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); From 3c964759bb25b4d225654a27d3e5dba1160718f9 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 29 Jul 2025 10:19:03 +0200 Subject: [PATCH 14/19] Address feedback --- Doc/c-api/unicode.rst | 31 +++++++++++++++++++++++-------- Tools/unicode/makeunicodedata.py | 1 + 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index e19ca45229b92e..9883a49b7a9093 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -310,46 +310,61 @@ These APIs can be used for fast direct character conversions: .. c:function:: Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to lower case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be lower cased - (e.g. a maximum of two character for Unicode 16.0), and + able to hold as many characters needed for *ch* to be lower cased, and return the number of characters stored. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. + In Unicode 16.0, any character can be lowercased into a buffer of *size* ``2``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + .. versionadded:: next .. c:function:: Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to upper case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be upper cased - (e.g. a maximum of three character for Unicode 16.0), and + able to hold as many characters needed for *ch* to be upper cased, and return the number of characters stored. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. + In Unicode 16.0, any character can be uppercased into a buffer of *size* ``3``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + .. versionadded:: next .. c:function:: Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Convert *ch* to title case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be title cased - (e.g. a maximum of three character for Unicode 16.0), and + able to hold as many characters needed for *ch* to be title cased, and return the number of characters stored. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. + In Unicode 16.0, any character can be titlecased into a buffer of *size* ``3``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + .. versionadded:: next .. c:function:: Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) Foldcase *ch*, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be foldcased - (e.g. a maximum of three character for Unicode 16.0), and + able to hold as many characters needed for *ch* to be foldcased, and return the number of characters stored. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. + In Unicode 16.0, any character can be foldcased into a buffer of *size* ``3``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + .. versionadded:: next +.. c:macro:: PyUCS4_CASE_CONVERSION_BUFFER_SIZE + + The minimum buffer size needed for any call to :c:func:`PyUCS4_ToLower`, + :c:func:`PyUCS4_ToUpper`, :c:func:`PyUCS4_ToTitle`, or + :c:func:`PyUCS4_ToFolded`. That is, ``3`` for Unicode 16.0. + +.. versionadded:: next These APIs can be used to work with surrogates: diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index ddd564deffd7e5..438f75a9628d99 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -44,6 +44,7 @@ # * Doc/library/stdtypes.rst, and # * Doc/library/unicodedata.rst # * Doc/reference/lexical_analysis.rst (three occurrences) +# * Doc/c-api-unicode.rst (in case conversion APIs) UNIDATA_VERSION = "17.0.0" UNICODE_DATA = "UnicodeData%s.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt" From ef8264cfa52ebac5093b324aa60614dbff61e903 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 25 Sep 2025 17:41:43 +0200 Subject: [PATCH 15/19] Replace Py_UCS4 with (const Py_UCS4*, Py_ssize_t) --- Doc/c-api/unicode.rst | 40 +++--- Include/cpython/unicodeobject.h | 32 ++--- Lib/test/test_capi/test_unicode.py | 18 +-- Modules/_testcapi/unicode.c | 41 +++--- Objects/unicodectype.c | 197 ++++++++++++++++++----------- Objects/unicodeobject.c | 12 +- 6 files changed, 203 insertions(+), 137 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 9883a49b7a9093..c48de3e06c51e6 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -307,53 +307,61 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. -.. c:function:: Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) +.. c:function:: Py_ssize_t PyUCS4_ToLower(const Py_UCS4 *str, Py_ssize_t str_size, Py_UCS4 *buffer, Py_ssize_t buf_size) - Convert *ch* to lower case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be lower cased, and + Convert *str* characters to lower case, store result in *buffer*, which should be + able to hold as many characters needed for *str* to be lower cased, and return the number of characters stored. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. - In Unicode 16.0, any character can be lowercased into a buffer of *size* ``2``. + *str_size*, *buf_size* and the result are number of UCS-4 characters. + + In Unicode 16.0, any character can be lowercased into a buffer of *buf_size* ``2``. See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. .. versionadded:: next -.. c:function:: Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) +.. c:function:: Py_ssize_t PyUCS4_ToUpper(const Py_UCS4 *str, Py_ssize_t str_size, Py_UCS4 *buffer, Py_ssize_t buf_size) - Convert *ch* to upper case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be upper cased, and + Convert *str* characters to upper case, store result in *buffer*, which should be + able to hold as many characters needed for *str* to be upper cased, and return the number of characters stored. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. - In Unicode 16.0, any character can be uppercased into a buffer of *size* ``3``. + *str_size*, *buf_size* and the result are number of UCS-4 characters. + + In Unicode 16.0, any character can be uppercased into a buffer of *buf_size* ``3``. See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. .. versionadded:: next -.. c:function:: Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) +.. c:function:: Py_ssize_t PyUCS4_ToTitle(const Py_UCS4 *str, Py_ssize_t str_size, Py_UCS4 *buffer, Py_ssize_t buf_size) - Convert *ch* to title case, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be title cased, and + Convert *str* characters to title case, store result in *buffer*, which should be + able to hold as many characters needed for *str* to be title cased, and return the number of characters stored. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. - In Unicode 16.0, any character can be titlecased into a buffer of *size* ``3``. + *str_size*, *buf_size* and the result are number of UCS-4 characters. + + In Unicode 16.0, any character can be titlecased into a buffer of *buf_size* ``3``. See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. .. versionadded:: next -.. c:function:: Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) +.. c:function:: Py_ssize_t PyUCS4_ToFolded(const Py_UCS4 *str, Py_ssize_t str_size, Py_UCS4 *buffer, Py_ssize_t buf_size) - Foldcase *ch*, store result in *buffer*, which should be - able to hold as many characters needed for *ch* to be foldcased, and + Foldcase *str* characters, store result in *buffer*, which should be + able to hold as many characters needed for *str* to be foldcased, and return the number of characters stored. If at some point a buffer overflow is detected, an :exc:`ValueError` is raised and ``-1`` is returned. - In Unicode 16.0, any character can be foldcased into a buffer of *size* ``3``. + *str_size*, *buf_size* and the result are number of UCS-4 characters. + + In Unicode 16.0, any character can be foldcased into a buffer of *buf_size* ``3``. See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. .. versionadded:: next diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 662e3f5ab06dcf..94c60e043a1efc 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -734,28 +734,28 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( ); PyAPI_FUNC(Py_ssize_t) PyUCS4_ToLower( - Py_UCS4 ch, /* Unicode character */ - Py_UCS4 *res, /* Output buffer */ - Py_ssize_t size /* Buffer size */ - ); + const Py_UCS4 *str, /* Unicode string */ + Py_ssize_t str_size, /* Unicode string size (UCS-4 characters) */ + Py_UCS4 *buf, /* Output buffer */ + Py_ssize_t buf_size); /* Buffer size (UCS-4 characters) */ PyAPI_FUNC(Py_ssize_t) PyUCS4_ToUpper( - Py_UCS4 ch, /* Unicode character */ - Py_UCS4 *res, /* Output buffer */ - Py_ssize_t size /* Buffer size */ - ); + const Py_UCS4 *str, /* Unicode string */ + Py_ssize_t str_size, /* Unicode string size (UCS-4 characters) */ + Py_UCS4 *buf, /* Output buffer */ + Py_ssize_t buf_size); /* Buffer size (UCS-4 characters) */ PyAPI_FUNC(Py_ssize_t) PyUCS4_ToTitle( - Py_UCS4 ch, /* Unicode character */ - Py_UCS4 *res, /* Output buffer */ - Py_ssize_t size /* Buffer size */ - ); + const Py_UCS4 *str, /* Unicode string */ + Py_ssize_t str_size, /* Unicode string size (UCS-4 characters) */ + Py_UCS4 *buf, /* Output buffer */ + Py_ssize_t buf_size); /* Buffer size (UCS-4 characters) */ PyAPI_FUNC(Py_ssize_t) PyUCS4_ToFolded( - Py_UCS4 ch, /* Unicode character */ - Py_UCS4 *res, /* Output buffer */ - Py_ssize_t size /* Buffer size */ - ); + const Py_UCS4 *str, /* Unicode string */ + Py_ssize_t str_size, /* Unicode string size (UCS-4 characters) */ + Py_UCS4 *buf, /* Output buffer */ + Py_ssize_t buf_size); /* Buffer size (UCS-4 characters) */ // Helper array used by Py_UNICODE_ISSPACE(). diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 3a5d1a0053f351..6e6a37518d3f30 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1,5 +1,6 @@ import unittest import sys +import string from test import support from test.support import threading_helper @@ -1756,31 +1757,29 @@ def test_GET_CACHED_HASH(self): @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') def test_tolower(self): - import string from _testcapi import unicode_tolower - for i, c in enumerate(string.ascii_uppercase): - with self.subTest(c): - self.assertEqual(unicode_tolower(c), string.ascii_lowercase[i]) + self.assertEqual(unicode_tolower(string.ascii_uppercase), + string.ascii_lowercase) # Test unicode character self.assertEqual(unicode_tolower("Č"), "č") self.assertEqual(unicode_tolower("Σ"), "σ") + self.assertEqual(unicode_tolower("ABCΣ"), "abcσ") @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') def test_toupper(self): - import string from _testcapi import unicode_toupper, unicode_toupper_buffer_too_small - for i, c in enumerate(string.ascii_lowercase): - with self.subTest(c): - self.assertEqual(unicode_toupper(c), string.ascii_uppercase[i]) + self.assertEqual(unicode_toupper(string.ascii_lowercase), + string.ascii_uppercase) # Test unicode character self.assertEqual(unicode_toupper("č"), "Č") self.assertEqual(unicode_toupper("ß"), "SS") self.assertEqual(unicode_toupper("ΐ"), "Ϊ́") + self.assertEqual(unicode_toupper("abcß"), "ABCSS") # Test unicode character with smaller buffer with self.assertRaisesRegex(ValueError, "output buffer is too small"): @@ -1797,6 +1796,7 @@ def test_totitle(self): self.assertEqual(unicode_totitle("ł"), "Ł") self.assertEqual(unicode_totitle("ß"), "Ss") self.assertEqual(unicode_totitle("ΐ"), "Ϊ́") + self.assertEqual(unicode_totitle("abcß"), "ABCSs") @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') @@ -1808,6 +1808,8 @@ def test_tofolded(self): # Test unicode character self.assertEqual(unicode_tofolded("Ł"), "ł") self.assertEqual(unicode_tofolded("Σ"), "σ") + self.assertEqual(unicode_tofolded("abcΣ"), "abcσ") + self.assertEqual(unicode_tofolded("ABCσ"), "abcσ") # Test case-ignorable character self.assertEqual(unicode_tofolded("👍"), "👍") diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index c3106f0fcb8543..35baa1d3aaec66 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -221,22 +221,36 @@ unicode_copycharacters(PyObject *self, PyObject *args) } static PyObject * -unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_ssize_t), - Py_UCS4 *buf, Py_ssize_t size) +unicode_case_operation(PyObject *str, + Py_ssize_t (*function)(const Py_UCS4*, Py_ssize_t, Py_UCS4 *, Py_ssize_t), + int buf_too_small) { if (!PyUnicode_Check(str)) { PyErr_Format(PyExc_TypeError, "expect str type, got %T", str); return NULL; } + Py_ssize_t len = PyUnicode_GET_LENGTH(str); - if (PyUnicode_GET_LENGTH(str) != 1) { - PyErr_SetString(PyExc_ValueError, "expecting 1-character strings only"); + Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(str); + if (ucs4 == NULL) { return NULL; } - Py_UCS4 c = PyUnicode_READ_CHAR(str, 0); + Py_ssize_t buf_size; + if (!buf_too_small) { + buf_size = len * PyUCS4_CASE_CONVERSION_BUFFER_SIZE; + } + else { + buf_size = len * 1; + } + Py_UCS4 *buf = PyMem_Malloc(buf_size * sizeof(Py_UCS4)); + if (buf == NULL) { + PyMem_Free(ucs4); + return NULL; + } - Py_ssize_t chars = function(c, buf, size); + Py_ssize_t chars = function(ucs4, len, buf, buf_size); + PyMem_Free(ucs4); if (chars < 0) { return NULL; } @@ -248,8 +262,7 @@ unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, static PyObject * unicode_tolower(PyObject *self, PyObject *arg) { - Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; - return unicode_case_operation(arg, PyUCS4_ToLower, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); + return unicode_case_operation(arg, PyUCS4_ToLower, 0); } @@ -257,32 +270,28 @@ unicode_tolower(PyObject *self, PyObject *arg) static PyObject * unicode_toupper(PyObject *self, PyObject *arg) { - Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; - return unicode_case_operation(arg, PyUCS4_ToUpper, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); + return unicode_case_operation(arg, PyUCS4_ToUpper, 0); } /* Test PyUCS4_ToUpper() with a small buffer */ static PyObject * unicode_toupper_buffer_too_small(PyObject *self, PyObject *arg) { - Py_UCS4 buf; - return unicode_case_operation(arg, PyUCS4_ToUpper, &buf, 1); + return unicode_case_operation(arg, PyUCS4_ToUpper, 1); } /* Test PyUCS4_ToLower() */ static PyObject * unicode_totitle(PyObject *self, PyObject *arg) { - Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; - return unicode_case_operation(arg, PyUCS4_ToTitle, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); + return unicode_case_operation(arg, PyUCS4_ToTitle, 0); } /* Test PyUCS4_ToLower() */ static PyObject * unicode_tofolded(PyObject *self, PyObject *arg) { - Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; - return unicode_case_operation(arg, PyUCS4_ToFolded, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); + return unicode_case_operation(arg, PyUCS4_ToFolded, 0); } diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index aacfc316e2b960..6f0dce9638e93c 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -198,103 +198,150 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } -Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) +Py_ssize_t +PyUCS4_ToLower(const Py_UCS4 *str, Py_ssize_t str_size, + Py_UCS4 *buf, Py_ssize_t buf_size) { - const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - - if (ctype->flags & EXTENDED_CASE_MASK) { - int index = ctype->lower & 0xFFFF; - int n = ctype->lower >> 24; - if (n > size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; + Py_ssize_t res = 0; + for (Py_ssize_t i = 0; i < str_size; i++) { + Py_UCS4 ch = str[i]; + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + int n; + if (ctype->flags & EXTENDED_CASE_MASK) { + int index = ctype->lower & 0xFFFF; + n = ctype->lower >> 24; + if (n > buf_size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + + for (int i = 0; i < n; i++) { + buf[i] = _PyUnicode_ExtendedCase[index + i]; + } + } + else { + n = 1; + if (buf_size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + buf[0] = ch + ctype->lower; } - int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; - return n; - } - - if (size < 1) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; + buf += n; + buf_size -= n; + res += n; } - res[0] = ch + ctype->lower; - return 1; + return res; } -Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) +Py_ssize_t +PyUCS4_ToTitle(const Py_UCS4 *str, Py_ssize_t str_size, + Py_UCS4 *buf, Py_ssize_t buf_size) { - const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - - if (ctype->flags & EXTENDED_CASE_MASK) { - int index = ctype->title & 0xFFFF; - int n = ctype->title >> 24; - if (n > size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; + Py_ssize_t res = 0; + for (Py_ssize_t i = 0; i < str_size; i++) { + Py_UCS4 ch = str[i]; + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + int n; + if (ctype->flags & EXTENDED_CASE_MASK) { + int index = ctype->title & 0xFFFF; + n = ctype->title >> 24; + if (n > buf_size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + + for (int i = 0; i < n; i++) { + buf[i] = _PyUnicode_ExtendedCase[index + i]; + } + } + else { + n = 1; + if (buf_size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + buf[0] = ch + ctype->title; } - int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; - return n; - } - - if (size < 1) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; + buf += n; + buf_size -= n; + res += n; } - res[0] = ch + ctype->title; - return 1; + return res; } -Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) +Py_ssize_t +PyUCS4_ToUpper(const Py_UCS4 *str, Py_ssize_t str_size, + Py_UCS4 *buf, Py_ssize_t buf_size) { - const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - - if (ctype->flags & EXTENDED_CASE_MASK) { - int index = ctype->upper & 0xFFFF; - int n = ctype->upper >> 24; - if (n > size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; + Py_ssize_t res = 0; + for (Py_ssize_t i = 0; i < str_size; i++) { + Py_UCS4 ch = str[i]; + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + int n; + if (ctype->flags & EXTENDED_CASE_MASK) { + int index = ctype->upper & 0xFFFF; + n = ctype->upper >> 24; + if (n > buf_size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + + for (int i = 0; i < n; i++) { + buf[i] = _PyUnicode_ExtendedCase[index + i]; + } + } + else { + n = 1; + if (buf_size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + buf[0] = ch + ctype->upper; } - int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; - return n; - } - - if (size < 1) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; + buf += n; + buf_size -= n; + res += n; } - res[0] = ch + ctype->upper; - return 1; + return res; } -Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) +Py_ssize_t +PyUCS4_ToFolded(const Py_UCS4 *str, Py_ssize_t str_size, + Py_UCS4 *buf, Py_ssize_t buf_size) { - const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - - if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { - int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); - int n = (ctype->lower >> 20) & 7; - if (n > size) { - PyErr_SetString(PyExc_ValueError, "output buffer is too small"); - return -1; + Py_ssize_t res = 0; + for (Py_ssize_t i = 0; i < str_size; i++) { + Py_UCS4 ch = str[i]; + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + Py_ssize_t n; + if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { + int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); + n = (ctype->lower >> 20) & 7; + if (n > buf_size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + + for (Py_ssize_t i = 0; i < n; i++) { + buf[i] = _PyUnicode_ExtendedCase[index + i]; + } + } + else { + n = PyUCS4_ToLower(&ch, 1, buf, buf_size); + if (n < 0) { + return -1; + } } - int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; - return n; + buf += n; + buf_size -= n; + res += n; } - - return PyUCS4_ToLower(ch, res, size); + return res; } int _PyUnicode_IsCased(Py_UCS4 ch) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 57037c3e01ada8..aa5773c1f0b3b7 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10003,7 +10003,7 @@ lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, mapped[0] = handle_capital_sigma(kind, data, length, i); return 1; } - return PyUCS4_ToLower(c, mapped, mapped_size); + return PyUCS4_ToLower(&c, 1, mapped, mapped_size); } static Py_ssize_t @@ -10013,7 +10013,7 @@ do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UC Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); - n_res = PyUCS4_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + n_res = PyUCS4_ToTitle(&c, 1, mapped, Py_ARRAY_LENGTH(mapped)); assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); @@ -10042,7 +10042,7 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); } else if (Py_UNICODE_ISLOWER(c)) { - n_res = PyUCS4_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); + n_res = PyUCS4_ToUpper(&c, 1, mapped, Py_ARRAY_LENGTH(mapped)); } else { n_res = 1; @@ -10069,7 +10069,7 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, if (lower) n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = PyUCS4_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); + n_res = PyUCS4_ToUpper(&c, 1, mapped, Py_ARRAY_LENGTH(mapped)); assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); @@ -10099,7 +10099,7 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - Py_ssize_t j, n_res = PyUCS4_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); + Py_ssize_t j, n_res = PyUCS4_ToFolded(&c, 1, mapped, Py_ARRAY_LENGTH(mapped)); assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); @@ -10124,7 +10124,7 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m if (previous_is_cased) n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = PyUCS4_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + n_res = PyUCS4_ToTitle(&c, 1, mapped, Py_ARRAY_LENGTH(mapped)); assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); From 10c164a5348018ecce3fd5292b83fe3b92229c06 Mon Sep 17 00:00:00 2001 From: Benjamin Peterson Date: Thu, 11 Sep 2025 09:58:39 -0700 Subject: [PATCH 16/19] closes gh-138706: update Unicode to 17.0.0 (#138719) --- Doc/whatsnew/3.15.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index d5d387d9a0aaa7..ca3807eec96737 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -713,6 +713,12 @@ unicodedata * The Unicode database has been updated to Unicode 17.0.0. +unicodedata +----------- + +* The Unicode database has been updated to Unicode 17.0.0. + + wave ---- From c67a22d249770350b61d3be729879f43edba6360 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 25 Sep 2025 20:17:57 +0200 Subject: [PATCH 17/19] Update Tools/unicode/makeunicodedata.py Co-authored-by: Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com> --- Tools/unicode/makeunicodedata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 438f75a9628d99..45c04e7b48312a 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -44,7 +44,7 @@ # * Doc/library/stdtypes.rst, and # * Doc/library/unicodedata.rst # * Doc/reference/lexical_analysis.rst (three occurrences) -# * Doc/c-api-unicode.rst (in case conversion APIs) +# * Doc/c-api/unicode.rst (in case conversion APIs) UNIDATA_VERSION = "17.0.0" UNICODE_DATA = "UnicodeData%s.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt" From e0afd1dffbb103950c27ee0a4f8dbd5d916ea391 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 30 Sep 2025 14:57:55 +0200 Subject: [PATCH 18/19] Apply suggestions from code review Co-authored-by: Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com> --- Doc/c-api/unicode.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index c48de3e06c51e6..7173555e4393e3 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -312,9 +312,9 @@ These APIs can be used for fast direct character conversions: Convert *str* characters to lower case, store result in *buffer*, which should be able to hold as many characters needed for *str* to be lower cased, and return the number of characters stored. If at some point a buffer overflow - is detected, an :exc:`ValueError` is raised and ``-1`` is returned. + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. - *str_size*, *buf_size* and the result are number of UCS-4 characters. + *str_size*, *buf_size* and the result are the number of UCS-4 characters. In Unicode 16.0, any character can be lowercased into a buffer of *buf_size* ``2``. See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. @@ -327,7 +327,7 @@ These APIs can be used for fast direct character conversions: Convert *str* characters to upper case, store result in *buffer*, which should be able to hold as many characters needed for *str* to be upper cased, and return the number of characters stored. If at some point a buffer overflow - is detected, an :exc:`ValueError` is raised and ``-1`` is returned. + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. *str_size*, *buf_size* and the result are number of UCS-4 characters. @@ -342,7 +342,7 @@ These APIs can be used for fast direct character conversions: Convert *str* characters to title case, store result in *buffer*, which should be able to hold as many characters needed for *str* to be title cased, and return the number of characters stored. If at some point a buffer overflow - is detected, an :exc:`ValueError` is raised and ``-1`` is returned. + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. *str_size*, *buf_size* and the result are number of UCS-4 characters. @@ -357,7 +357,7 @@ These APIs can be used for fast direct character conversions: Foldcase *str* characters, store result in *buffer*, which should be able to hold as many characters needed for *str* to be foldcased, and return the number of characters stored. If at some point a buffer overflow - is detected, an :exc:`ValueError` is raised and ``-1`` is returned. + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. *str_size*, *buf_size* and the result are number of UCS-4 characters. @@ -366,6 +366,7 @@ These APIs can be used for fast direct character conversions: .. versionadded:: next + .. c:macro:: PyUCS4_CASE_CONVERSION_BUFFER_SIZE The minimum buffer size needed for any call to :c:func:`PyUCS4_ToLower`, From 01e13e628f0960c736c78d45a1f3302090a6da3c Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 30 Sep 2025 14:58:36 +0200 Subject: [PATCH 19/19] Update Modules/_testcapi/unicode.c Co-authored-by: Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com> --- Modules/_testcapi/unicode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 35baa1d3aaec66..80dcd3550f82e2 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -226,7 +226,7 @@ unicode_case_operation(PyObject *str, int buf_too_small) { if (!PyUnicode_Check(str)) { - PyErr_Format(PyExc_TypeError, "expect str type, got %T", str); + PyErr_Format(PyExc_TypeError, "expected type str, got %T", str); return NULL; } Py_ssize_t len = PyUnicode_GET_LENGTH(str);