diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 22b0a6aff6e02e..7173555e4393e3 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -307,6 +307,75 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. +.. c:function:: Py_ssize_t PyUCS4_ToLower(const Py_UCS4 *str, Py_ssize_t str_size, Py_UCS4 *buffer, Py_ssize_t buf_size) + + Convert *str* characters to lower case, store the result in *buffer*, which should be + able to hold as many characters as needed for *str* to be lowercased, and + return the number of characters stored. If at some point a buffer overflow + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. + + *str_size*, *buf_size* and the result are the number of UCS-4 characters. + + In Unicode 17.0, any character can be lowercased into a buffer of *buf_size* ``2``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUCS4_ToUpper(const Py_UCS4 *str, Py_ssize_t str_size, Py_UCS4 *buffer, Py_ssize_t buf_size) + + Convert *str* characters to upper case, store the result in *buffer*, which should be + able to hold as many characters as needed for *str* to be uppercased, and + return the number of characters stored. If at some point a buffer overflow + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. + + *str_size*, *buf_size* and the result are the number of UCS-4 characters. + + In Unicode 17.0, any character can be uppercased into a buffer of *buf_size* ``3``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUCS4_ToTitle(const Py_UCS4 *str, Py_ssize_t str_size, Py_UCS4 *buffer, Py_ssize_t buf_size) + + Convert *str* characters to title case, store the result in *buffer*, which should be + able to hold as many characters as needed for *str* to be titlecased, and + return the number of characters stored. 
If at some point a buffer overflow + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. + + *str_size*, *buf_size* and the result are the number of UCS-4 characters. + + In Unicode 17.0, any character can be titlecased into a buffer of *buf_size* ``3``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUCS4_ToFolded(const Py_UCS4 *str, Py_ssize_t str_size, Py_UCS4 *buffer, Py_ssize_t buf_size) + + Foldcase *str* characters, store the result in *buffer*, which should be + able to hold as many characters as needed for *str* to be foldcased, and + return the number of characters stored. If at some point a buffer overflow + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. + + *str_size*, *buf_size* and the result are the number of UCS-4 characters. + + In Unicode 17.0, any character can be foldcased into a buffer of *buf_size* ``3``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + + .. versionadded:: next + + +.. c:macro:: PyUCS4_CASE_CONVERSION_BUFFER_SIZE + + The buffer size, per input character, that is always large enough for + :c:func:`PyUCS4_ToLower`, :c:func:`PyUCS4_ToUpper`, :c:func:`PyUCS4_ToTitle`, and + :c:func:`PyUCS4_ToFolded`. That is, ``3`` for Unicode 17.0. + + .. versionadded:: next + + These APIs can be used to work with surrogates: .. c:function:: int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index d5d387d9a0aaa7..ca3807eec96737 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -713,6 +713,12 @@ unicodedata * The Unicode database has been updated to Unicode 17.0.0. +unicodedata +----------- + +* The Unicode database has been updated to Unicode 17.0.0. 
+ + wave ---- diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 73e3bc44d6c9ca..94c60e043a1efc 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -733,6 +733,31 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToLower( + const Py_UCS4 *str, /* Unicode string */ + Py_ssize_t str_size, /* Unicode string size (UCS-4 characters) */ + Py_UCS4 *buf, /* Output buffer */ + Py_ssize_t buf_size); /* Buffer size (UCS-4 characters) */ + +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToUpper( + const Py_UCS4 *str, /* Unicode string */ + Py_ssize_t str_size, /* Unicode string size (UCS-4 characters) */ + Py_UCS4 *buf, /* Output buffer */ + Py_ssize_t buf_size); /* Buffer size (UCS-4 characters) */ + +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToTitle( + const Py_UCS4 *str, /* Unicode string */ + Py_ssize_t str_size, /* Unicode string size (UCS-4 characters) */ + Py_UCS4 *buf, /* Output buffer */ + Py_ssize_t buf_size); /* Buffer size (UCS-4 characters) */ + +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToFolded( + const Py_UCS4 *str, /* Unicode string */ + Py_ssize_t str_size, /* Unicode string size (UCS-4 characters) */ + Py_UCS4 *buf, /* Output buffer */ + Py_ssize_t buf_size); /* Buffer size (UCS-4 characters) */ + + // Helper array used by Py_UNICODE_ISSPACE(). 
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; @@ -767,6 +792,8 @@ static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) { #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) +#define PyUCS4_CASE_CONVERSION_BUFFER_SIZE 3 + static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) { return (Py_UNICODE_ISALPHA(ch) || Py_UNICODE_ISDECIMAL(ch) diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index c85c01da89a2ff..bd1a526f572216 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -15,10 +15,6 @@ extern "C" { extern int _PyUnicode_IsXidStart(Py_UCS4 ch); extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); -extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); extern int _PyUnicode_IsCased(Py_UCS4 ch); diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 6a9c60f3a6d75e..6e6a37518d3f30 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1,5 +1,6 @@ import unittest import sys +import string from test import support from test.support import threading_helper @@ -1753,6 +1754,66 @@ def test_GET_CACHED_HASH(self): # impl detail: ASCII string hashes are equal to bytes ones self.assertEqual(unicode_GET_CACHED_HASH(obj), hash(content_bytes)) + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_tolower(self): + from _testcapi import unicode_tolower + + self.assertEqual(unicode_tolower(string.ascii_uppercase), + string.ascii_lowercase) + + # Test unicode character + self.assertEqual(unicode_tolower("Č"), "č") + self.assertEqual(unicode_tolower("Σ"), "σ") + self.assertEqual(unicode_tolower("ABCΣ"), "abcσ") + + @support.cpython_only + 
@unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_toupper(self): + from _testcapi import unicode_toupper, unicode_toupper_buffer_too_small + + self.assertEqual(unicode_toupper(string.ascii_lowercase), + string.ascii_uppercase) + + # Test unicode character + self.assertEqual(unicode_toupper("č"), "Č") + self.assertEqual(unicode_toupper("ß"), "SS") + self.assertEqual(unicode_toupper("ΐ"), "Ϊ́") + self.assertEqual(unicode_toupper("abcß"), "ABCSS") + + # Test unicode character with smaller buffer + with self.assertRaisesRegex(ValueError, "output buffer is too small"): + unicode_toupper_buffer_too_small("ß") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_totitle(self): + from _testcapi import unicode_totitle + + self.assertEqual(unicode_totitle("t"), "T") + + # Test unicode character + self.assertEqual(unicode_totitle("ł"), "Ł") + self.assertEqual(unicode_totitle("ß"), "Ss") + self.assertEqual(unicode_totitle("ΐ"), "Ϊ́") + self.assertEqual(unicode_totitle("abcß"), "ABCSs") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_tofolded(self): + from _testcapi import unicode_tofolded + + self.assertEqual(unicode_tofolded("T"), "t") + + # Test unicode character + self.assertEqual(unicode_tofolded("Ł"), "ł") + self.assertEqual(unicode_tofolded("Σ"), "σ") + self.assertEqual(unicode_tofolded("abcΣ"), "abcσ") + self.assertEqual(unicode_tofolded("ABCσ"), "abcσ") + + # Test case-ignorable character + self.assertEqual(unicode_tofolded("👍"), "👍") + class PyUnicodeWriterTest(unittest.TestCase): def create_writer(self, size): diff --git a/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst new file mode 100644 index 00000000000000..37d251b6e35d8f --- /dev/null +++ b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst @@ -0,0 +1 @@ +Make 
:c:func:`PyUCS4_ToLower`, :c:func:`PyUCS4_ToUpper`, :c:func:`PyUCS4_ToTitle` and :c:func:`PyUCS4_ToFolded` public. diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 203282dd53dd0a..80dcd3550f82e2 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -220,6 +220,88 @@ unicode_copycharacters(PyObject *self, PyObject *args) return Py_BuildValue("(Nn)", to_copy, copied); } +/* Run a PyUCS4_To*() case conversion on str and return the result as a new + str object. If buf_too_small is non-zero, the output buffer only has room + for one character per input character. */ +static PyObject * +unicode_case_operation(PyObject *str, + Py_ssize_t (*function)(const Py_UCS4*, Py_ssize_t, Py_UCS4 *, Py_ssize_t), + int buf_too_small) +{ + if (!PyUnicode_Check(str)) { + PyErr_Format(PyExc_TypeError, "expected type str, got %T", str); + return NULL; + } + Py_ssize_t len = PyUnicode_GET_LENGTH(str); + + Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(str); + if (ucs4 == NULL) { + return NULL; + } + + Py_ssize_t buf_size; + if (!buf_too_small) { + buf_size = len * PyUCS4_CASE_CONVERSION_BUFFER_SIZE; + } + else { + buf_size = len * 1; + } + Py_UCS4 *buf = PyMem_Malloc(buf_size * sizeof(Py_UCS4)); + if (buf == NULL) { + PyMem_Free(ucs4); + return PyErr_NoMemory(); + } + + Py_ssize_t chars = function(ucs4, len, buf, buf_size); + PyMem_Free(ucs4); + if (chars < 0) { + PyMem_Free(buf); + return NULL; + } + + /* PyUnicode_FromKindAndData() copies the data, so buf is still ours to free. 
*/ + PyObject *res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf, chars); + PyMem_Free(buf); + return res; +} + +/* Test PyUCS4_ToLower() */ +static PyObject * +unicode_tolower(PyObject *self, PyObject *arg) +{ + return unicode_case_operation(arg, PyUCS4_ToLower, 0); +} + + +/* Test PyUCS4_ToUpper() */ +static PyObject * +unicode_toupper(PyObject *self, PyObject *arg) +{ + return unicode_case_operation(arg, PyUCS4_ToUpper, 0); +} + +/* Test PyUCS4_ToUpper() with a small buffer */ +static PyObject * +unicode_toupper_buffer_too_small(PyObject *self, PyObject *arg) +{ + return unicode_case_operation(arg, PyUCS4_ToUpper, 1); +} + +/* Test PyUCS4_ToTitle() */ +static PyObject * +unicode_totitle(PyObject *self, PyObject *arg) +{ + return unicode_case_operation(arg, PyUCS4_ToTitle, 0); +} + +/* Test PyUCS4_ToFolded() 
*/ +static PyObject * +unicode_tofolded(PyObject *self, PyObject *arg) +{ + return unicode_case_operation(arg, PyUCS4_ToFolded, 0); +} + + static PyObject* unicode_GET_CACHED_HASH(PyObject *self, PyObject *arg) { @@ -577,6 +659,11 @@ static PyMethodDef TestMethods[] = { {"unicode_asutf8", unicode_asutf8, METH_VARARGS}, {"unicode_copycharacters", unicode_copycharacters, METH_VARARGS}, {"unicode_GET_CACHED_HASH", unicode_GET_CACHED_HASH, METH_O}, + {"unicode_tolower", unicode_tolower, METH_O}, + {"unicode_toupper", unicode_toupper, METH_O}, + {"unicode_toupper_buffer_too_small", unicode_toupper_buffer_too_small, METH_O}, + {"unicode_totitle", unicode_totitle, METH_O}, + {"unicode_tofolded", unicode_tofolded, METH_O}, {NULL}, }; diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 7cd0dca3d13545..6f0dce9638e93c 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -198,67 +198,150 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } -int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) +Py_ssize_t +PyUCS4_ToLower(const Py_UCS4 *str, Py_ssize_t str_size, + Py_UCS4 *buf, Py_ssize_t buf_size) { - const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - - if (ctype->flags & EXTENDED_CASE_MASK) { - int index = ctype->lower & 0xFFFF; - int n = ctype->lower >> 24; - int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; - return n; + Py_ssize_t res = 0; + for (Py_ssize_t i = 0; i < str_size; i++) { + Py_UCS4 ch = str[i]; + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + int n; + if (ctype->flags & EXTENDED_CASE_MASK) { + int index = ctype->lower & 0xFFFF; + n = ctype->lower >> 24; + if (n > buf_size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + + for (int i = 0; i < n; i++) { + buf[i] = _PyUnicode_ExtendedCase[index + i]; + } + } + else { + n = 1; + if (buf_size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + 
} + buf[0] = ch + ctype->lower; + } + + buf += n; + buf_size -= n; + res += n; } - res[0] = ch + ctype->lower; - return 1; + return res; } -int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) +Py_ssize_t +PyUCS4_ToTitle(const Py_UCS4 *str, Py_ssize_t str_size, + Py_UCS4 *buf, Py_ssize_t buf_size) { - const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - - if (ctype->flags & EXTENDED_CASE_MASK) { - int index = ctype->title & 0xFFFF; - int n = ctype->title >> 24; - int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; - return n; + Py_ssize_t res = 0; + for (Py_ssize_t i = 0; i < str_size; i++) { + Py_UCS4 ch = str[i]; + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + int n; + if (ctype->flags & EXTENDED_CASE_MASK) { + int index = ctype->title & 0xFFFF; + n = ctype->title >> 24; + if (n > buf_size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + + for (int i = 0; i < n; i++) { + buf[i] = _PyUnicode_ExtendedCase[index + i]; + } + } + else { + n = 1; + if (buf_size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + buf[0] = ch + ctype->title; + } + + buf += n; + buf_size -= n; + res += n; } - res[0] = ch + ctype->title; - return 1; + return res; } -int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) +Py_ssize_t +PyUCS4_ToUpper(const Py_UCS4 *str, Py_ssize_t str_size, + Py_UCS4 *buf, Py_ssize_t buf_size) { - const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - - if (ctype->flags & EXTENDED_CASE_MASK) { - int index = ctype->upper & 0xFFFF; - int n = ctype->upper >> 24; - int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; - return n; + Py_ssize_t res = 0; + for (Py_ssize_t i = 0; i < str_size; i++) { + Py_UCS4 ch = str[i]; + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + int n; + if (ctype->flags & EXTENDED_CASE_MASK) { + int index = ctype->upper & 0xFFFF; + n = ctype->upper >> 24; + if (n > buf_size) { + 
PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + + for (int i = 0; i < n; i++) { + buf[i] = _PyUnicode_ExtendedCase[index + i]; + } + } + else { + n = 1; + if (buf_size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + buf[0] = ch + ctype->upper; + } + + buf += n; + buf_size -= n; + res += n; } - res[0] = ch + ctype->upper; - return 1; + return res; } -int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) +Py_ssize_t +PyUCS4_ToFolded(const Py_UCS4 *str, Py_ssize_t str_size, + Py_UCS4 *buf, Py_ssize_t buf_size) { - const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - - if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { - int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); - int n = (ctype->lower >> 20) & 7; - int i; - for (i = 0; i < n; i++) - res[i] = _PyUnicode_ExtendedCase[index + i]; - return n; + Py_ssize_t res = 0; + for (Py_ssize_t i = 0; i < str_size; i++) { + Py_UCS4 ch = str[i]; + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + Py_ssize_t n; + if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { + int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); + n = (ctype->lower >> 20) & 7; + if (n > buf_size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + + for (Py_ssize_t i = 0; i < n; i++) { + buf[i] = _PyUnicode_ExtendedCase[index + i]; + } + } + else { + n = PyUCS4_ToLower(&ch, 1, buf, buf_size); + if (n < 0) { + return -1; + } + } + + buf += n; + buf_size -= n; + res += n; } - return _PyUnicode_ToLowerFull(ch, res); + return res; } int _PyUnicode_IsCased(Py_UCS4 ch) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5f6384afd1b209..aa5773c1f0b3b7 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9994,34 +9994,35 @@ handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i return (final_sigma) ? 
0x3C2 : 0x3C3; } -static int +static Py_ssize_t lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, - Py_UCS4 c, Py_UCS4 *mapped) + Py_UCS4 c, Py_UCS4 *mapped, Py_ssize_t mapped_size) { /* Obscure special case. */ if (c == 0x3A3) { mapped[0] = handle_capital_sigma(kind, data, length, i); return 1; } - return _PyUnicode_ToLowerFull(c, mapped); + return PyUCS4_ToLower(&c, 1, mapped, mapped_size); } static Py_ssize_t do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { - Py_ssize_t i, k = 0; - int n_res, j; + Py_ssize_t i, k = 0, n_res, j; Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); - n_res = _PyUnicode_ToTitleFull(c, mapped); + n_res = PyUCS4_ToTitle(&c, 1, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; } for (i = 1; i < length; i++) { c = PyUnicode_READ(kind, data, i); - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10036,17 +10037,18 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (Py_UNICODE_ISUPPER(c)) { - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); } else if (Py_UNICODE_ISLOWER(c)) { - n_res = _PyUnicode_ToUpperFull(c, mapped); + n_res = PyUCS4_ToUpper(&c, 1, mapped, Py_ARRAY_LENGTH(mapped)); } else { n_res = 1; mapped[0] = c; } + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10063,11 +10065,12 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 
*res, for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (lower) - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = _PyUnicode_ToUpperFull(c, mapped); + n_res = PyUCS4_ToUpper(&c, 1, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10096,7 +10099,8 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); + Py_ssize_t j, n_res = PyUCS4_ToFolded(&c, 1, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10115,13 +10119,13 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m for (i = 0; i < length; i++) { const Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (previous_is_cased) - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = _PyUnicode_ToTitleFull(c, mapped); - + n_res = PyUCS4_ToTitle(&c, 1, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index ddd564deffd7e5..45c04e7b48312a 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -44,6 +44,7 @@ # * Doc/library/stdtypes.rst, and # * Doc/library/unicodedata.rst # * Doc/reference/lexical_analysis.rst (three occurrences) +# * Doc/c-api/unicode.rst (in case conversion APIs) 
UNIDATA_VERSION = "17.0.0" UNICODE_DATA = "UnicodeData%s.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"