Replace Py_UCS4 with (const Py_UCS4*, Py_ssize_t)

vstinner · vstinner · commit ef8264cfa52e · 2025-09-25T18:00:31.000+02:00
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
@@ -307,53 +307,61 @@ These APIs can be used for fast direct character conversions:
    possible.  This function does not raise exceptions.
 
 
-.. c:function:: Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size)
+.. c:function:: Py_ssize_t PyUCS4_ToLower(const Py_UCS4 *str, Py_ssize_t str_size, Py_UCS4 *buffer, Py_ssize_t buf_size)
 
-   Convert *ch* to lower case, store result in *buffer*, which should be
-   able to hold as many characters needed for *ch* to be lower cased, and
+   Convert *str* characters to lower case, store result in *buffer*, which should be
+   able to hold as many characters as needed for *str* to be lower cased, and
    return the number of characters stored. If at some point a buffer overflow
-   is detected, an :exc:`ValueError` is raised and ``-1`` is returned.
+   is detected, a :exc:`ValueError` is raised and ``-1`` is returned.
 
-   In Unicode 16.0, any character can be lowercased into a buffer of *size* ``2``.
+   *str_size*, *buf_size* and the return value are all counted in UCS-4 characters.
+
+   In Unicode 16.0, any character can be lowercased into a buffer of *buf_size* ``2``.
    See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`.
 
    .. versionadded:: next
 
 
-.. c:function:: Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size)
+.. c:function:: Py_ssize_t PyUCS4_ToUpper(const Py_UCS4 *str, Py_ssize_t str_size, Py_UCS4 *buffer, Py_ssize_t buf_size)
 
-   Convert *ch* to upper case, store result in *buffer*, which should be
-   able to hold as many characters needed for *ch* to be upper cased, and
+   Convert *str* characters to upper case, store result in *buffer*, which should be
+   able to hold as many characters as needed for *str* to be upper cased, and
    return the number of characters stored. If at some point a buffer overflow
-   is detected, an :exc:`ValueError` is raised and ``-1`` is returned.
+   is detected, a :exc:`ValueError` is raised and ``-1`` is returned.
 
-   In Unicode 16.0, any character can be uppercased into a buffer of *size* ``3``.
+   *str_size*, *buf_size* and the return value are all counted in UCS-4 characters.
+
+   In Unicode 16.0, any character can be uppercased into a buffer of *buf_size* ``3``.
    See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`.
 
    .. versionadded:: next
 
 
-.. c:function:: Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size)
+.. c:function:: Py_ssize_t PyUCS4_ToTitle(const Py_UCS4 *str, Py_ssize_t str_size, Py_UCS4 *buffer, Py_ssize_t buf_size)
 
-   Convert *ch* to title case, store result in *buffer*, which should be
-   able to hold as many characters needed for *ch* to be title cased, and
+   Convert *str* characters to title case, store result in *buffer*, which should be
+   able to hold as many characters as needed for *str* to be title cased, and
    return the number of characters stored. If at some point a buffer overflow
-   is detected, an :exc:`ValueError` is raised and ``-1`` is returned.
+   is detected, a :exc:`ValueError` is raised and ``-1`` is returned.
 
-   In Unicode 16.0, any character can be titlecased into a buffer of *size* ``3``.
+   *str_size*, *buf_size* and the return value are all counted in UCS-4 characters.
+
+   In Unicode 16.0, any character can be titlecased into a buffer of *buf_size* ``3``.
    See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`.
 
    .. versionadded:: next
 
 
-.. c:function:: Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size)
+.. c:function:: Py_ssize_t PyUCS4_ToFolded(const Py_UCS4 *str, Py_ssize_t str_size, Py_UCS4 *buffer, Py_ssize_t buf_size)
 
-   Foldcase *ch*, store result in *buffer*, which should be
-   able to hold as many characters needed for *ch* to be foldcased, and
+   Foldcase *str* characters, store result in *buffer*, which should be
+   able to hold as many characters as needed for *str* to be foldcased, and
    return the number of characters stored. If at some point a buffer overflow
-   is detected, an :exc:`ValueError` is raised and ``-1`` is returned.
+   is detected, a :exc:`ValueError` is raised and ``-1`` is returned.
 
-   In Unicode 16.0, any character can be foldcased into a buffer of *size* ``3``.
+   *str_size*, *buf_size* and the return value are all counted in UCS-4 characters.
+
+   In Unicode 16.0, any character can be foldcased into a buffer of *buf_size* ``3``.
    See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`.
 
    .. versionadded:: next
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
@@ -734,28 +734,28 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
     );
 
 PyAPI_FUNC(Py_ssize_t) PyUCS4_ToLower(
-    Py_UCS4 ch,     /* Unicode character */
-    Py_UCS4 *res,   /* Output buffer */
-    Py_ssize_t size        /* Buffer size */
-    );
+    const Py_UCS4 *str,    /* Unicode string */
+    Py_ssize_t str_size,   /* Unicode string size (UCS-4 characters) */
+    Py_UCS4 *buf,          /* Output buffer */
+    Py_ssize_t buf_size);  /* Buffer size (UCS-4 characters) */
 
 PyAPI_FUNC(Py_ssize_t) PyUCS4_ToUpper(
-    Py_UCS4 ch,     /* Unicode character */
-    Py_UCS4 *res,   /* Output buffer */
-    Py_ssize_t size        /* Buffer size */
-    );
+    const Py_UCS4 *str,    /* Unicode string */
+    Py_ssize_t str_size,   /* Unicode string size (UCS-4 characters) */
+    Py_UCS4 *buf,          /* Output buffer */
+    Py_ssize_t buf_size);  /* Buffer size (UCS-4 characters) */
 
 PyAPI_FUNC(Py_ssize_t) PyUCS4_ToTitle(
-    Py_UCS4 ch,     /* Unicode character */
-    Py_UCS4 *res,   /* Output buffer */
-    Py_ssize_t size        /* Buffer size */
-    );
+    const Py_UCS4 *str,    /* Unicode string */
+    Py_ssize_t str_size,   /* Unicode string size (UCS-4 characters) */
+    Py_UCS4 *buf,          /* Output buffer */
+    Py_ssize_t buf_size);  /* Buffer size (UCS-4 characters) */
 
 PyAPI_FUNC(Py_ssize_t) PyUCS4_ToFolded(
-    Py_UCS4 ch,     /* Unicode character */
-    Py_UCS4 *res,   /* Output buffer */
-    Py_ssize_t size        /* Buffer size */
-    );
+    const Py_UCS4 *str,    /* Unicode string */
+    Py_ssize_t str_size,   /* Unicode string size (UCS-4 characters) */
+    Py_UCS4 *buf,          /* Output buffer */
+    Py_ssize_t buf_size);  /* Buffer size (UCS-4 characters) */
 
 
 // Helper array used by Py_UNICODE_ISSPACE().
diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
@@ -1,5 +1,6 @@
 import unittest
 import sys
+import string
 from test import support
 from test.support import threading_helper
 
@@ -1756,31 +1757,29 @@ def test_GET_CACHED_HASH(self):
     @support.cpython_only
     @unittest.skipIf(_testcapi is None, 'need _testcapi module')
     def test_tolower(self):
-        import string
         from _testcapi import unicode_tolower
 
-        for i, c in enumerate(string.ascii_uppercase):
-            with self.subTest(c):
-                self.assertEqual(unicode_tolower(c), string.ascii_lowercase[i])
+        self.assertEqual(unicode_tolower(string.ascii_uppercase),
+                         string.ascii_lowercase)
 
         # Test unicode character
         self.assertEqual(unicode_tolower("Č"), "č")
         self.assertEqual(unicode_tolower("Σ"), "σ")
+        self.assertEqual(unicode_tolower("ABCΣ"), "abcσ")
 
     @support.cpython_only
     @unittest.skipIf(_testcapi is None, 'need _testcapi module')
     def test_toupper(self):
-        import string
         from _testcapi import unicode_toupper, unicode_toupper_buffer_too_small
 
-        for i, c in enumerate(string.ascii_lowercase):
-            with self.subTest(c):
-                self.assertEqual(unicode_toupper(c), string.ascii_uppercase[i])
+        self.assertEqual(unicode_toupper(string.ascii_lowercase),
+                         string.ascii_uppercase)
 
         # Test unicode character
         self.assertEqual(unicode_toupper("č"), "Č")
         self.assertEqual(unicode_toupper("ß"), "SS")
         self.assertEqual(unicode_toupper("ΐ"), "Ϊ́")
+        self.assertEqual(unicode_toupper("abcß"), "ABCSS")
 
         # Test unicode character with smaller buffer
         with self.assertRaisesRegex(ValueError, "output buffer is too small"):
@@ -1797,6 +1796,7 @@ def test_totitle(self):
         self.assertEqual(unicode_totitle("ł"), "Ł")
         self.assertEqual(unicode_totitle("ß"), "Ss")
         self.assertEqual(unicode_totitle("ΐ"), "Ϊ́")
+        self.assertEqual(unicode_totitle("abcß"), "ABCSs")
 
     @support.cpython_only
     @unittest.skipIf(_testcapi is None, 'need _testcapi module')
@@ -1808,6 +1808,8 @@ def test_tofolded(self):
         # Test unicode character
         self.assertEqual(unicode_tofolded("Ł"), "ł")
         self.assertEqual(unicode_tofolded("Σ"), "σ")
+        self.assertEqual(unicode_tofolded("abcΣ"), "abcσ")
+        self.assertEqual(unicode_tofolded("ABCσ"), "abcσ")
 
         # Test case-ignorable character
         self.assertEqual(unicode_tofolded("👍"), "👍")
diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c
@@ -221,22 +221,36 @@ unicode_copycharacters(PyObject *self, PyObject *args)
 }
 
 static PyObject *
-unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_ssize_t),
-                       Py_UCS4 *buf, Py_ssize_t size)
+unicode_case_operation(PyObject *str,
+                       Py_ssize_t (*function)(const Py_UCS4*, Py_ssize_t, Py_UCS4 *, Py_ssize_t),
+                       int buf_too_small)
 {
     if (!PyUnicode_Check(str)) {
         PyErr_Format(PyExc_TypeError, "expect str type, got %T", str);
         return NULL;
     }
+    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
 
-    if (PyUnicode_GET_LENGTH(str) != 1) {
-        PyErr_SetString(PyExc_ValueError, "expecting 1-character strings only");
+    Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(str);
+    if (ucs4 == NULL) {
         return NULL;
     }
 
-    Py_UCS4 c = PyUnicode_READ_CHAR(str, 0);
+    Py_ssize_t buf_size;
+    if (!buf_too_small) {
+        buf_size = len * PyUCS4_CASE_CONVERSION_BUFFER_SIZE;
+    }
+    else {
+        buf_size = len * 1;
+    }
+    Py_UCS4 *buf = PyMem_Malloc(buf_size * sizeof(Py_UCS4));
+    if (buf == NULL) {
+        PyMem_Free(ucs4);
+        return NULL;
+    }
 
-    Py_ssize_t chars = function(c, buf, size);
+    Py_ssize_t chars = function(ucs4, len, buf, buf_size);
+    PyMem_Free(ucs4);
     if (chars < 0) {
         return NULL;
     }
@@ -248,41 +262,36 @@ unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *,
 static PyObject *
 unicode_tolower(PyObject *self, PyObject *arg)
 {
-    Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE];
-    return unicode_case_operation(arg, PyUCS4_ToLower, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE);
+    return unicode_case_operation(arg, PyUCS4_ToLower, 0);
 }
 
 
 /* Test PyUCS4_ToUpper() */
 static PyObject *
 unicode_toupper(PyObject *self, PyObject *arg)
 {
-    Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE];
-    return unicode_case_operation(arg, PyUCS4_ToUpper, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE);
+    return unicode_case_operation(arg, PyUCS4_ToUpper, 0);
 }
 
 /* Test PyUCS4_ToUpper() with a small buffer */
 static PyObject *
 unicode_toupper_buffer_too_small(PyObject *self, PyObject *arg)
 {
-    Py_UCS4 buf;
-    return unicode_case_operation(arg, PyUCS4_ToUpper, &buf, 1);
+    return unicode_case_operation(arg, PyUCS4_ToUpper, 1);
 }
 
-/* Test PyUCS4_ToLower() */
+/* Test PyUCS4_ToTitle() */
 static PyObject *
 unicode_totitle(PyObject *self, PyObject *arg)
 {
-    Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE];
-    return unicode_case_operation(arg, PyUCS4_ToTitle, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE);
+    return unicode_case_operation(arg, PyUCS4_ToTitle, 0);
 }
 
-/* Test PyUCS4_ToLower() */
+/* Test PyUCS4_ToFolded() */
 static PyObject *
 unicode_tofolded(PyObject *self, PyObject *arg)
 {
-    Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE];
-    return unicode_case_operation(arg, PyUCS4_ToFolded, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE);
+    return unicode_case_operation(arg, PyUCS4_ToFolded, 0);
 }
 
 
diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

Original file line number	Diff line number	Diff line change
`@@ -221,22 +221,36 @@ unicode_copycharacters(PyObject *self, PyObject *args)`
`221`	`221`	`}`
`222`	`222`
`223`	`223`	`static PyObject *`
`224`		`-unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_ssize_t),`
`225`		`- Py_UCS4 *buf, Py_ssize_t size)`
	`224`	`+unicode_case_operation(PyObject *str,`
	`225`	`+ Py_ssize_t (*function)(const Py_UCS4*, Py_ssize_t, Py_UCS4 *, Py_ssize_t),`
	`226`	`+ int buf_too_small)`
`226`	`227`	`{`
`227`	`228`	`if (!PyUnicode_Check(str)) {`
`228`	`229`	`PyErr_Format(PyExc_TypeError, "expect str type, got %T", str);`
`229`	`230`	`return NULL;`
`230`	`231`	`}`
	`232`	`+ Py_ssize_t len = PyUnicode_GET_LENGTH(str);`
`231`	`233`
`232`		`- if (PyUnicode_GET_LENGTH(str) != 1) {`
`233`		`- PyErr_SetString(PyExc_ValueError, "expecting 1-character strings only");`
	`234`	`+ Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(str);`
	`235`	`+ if (ucs4 == NULL) {`
`234`	`236`	`return NULL;`
`235`	`237`	`}`
`236`	`238`
`237`		`- Py_UCS4 c = PyUnicode_READ_CHAR(str, 0);`
	`239`	`+ Py_ssize_t buf_size;`
	`240`	`+ if (!buf_too_small) {`
	`241`	`+ buf_size = len * PyUCS4_CASE_CONVERSION_BUFFER_SIZE;`
	`242`	`+ }`
	`243`	`+ else {`
	`244`	`+ buf_size = len * 1;`
	`245`	`+ }`
	`246`	`+ Py_UCS4 *buf = PyMem_Malloc(buf_size * sizeof(Py_UCS4));`
	`247`	`+ if (buf == NULL) {`
	`248`	`+ PyMem_Free(ucs4);`
	`249`	`+ return NULL;`
	`250`	`+ }`
`238`	`251`
`239`		`- Py_ssize_t chars = function(c, buf, size);`
	`252`	`+ Py_ssize_t chars = function(ucs4, len, buf, buf_size);`
	`253`	`+ PyMem_Free(ucs4);`
`240`	`254`	`if (chars < 0) {`
`241`	`255`	`return NULL;`
`242`	`256`	`}`
`@@ -248,41 +262,36 @@ unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *,`
`248`	`262`	`static PyObject *`
`249`	`263`	`unicode_tolower(PyObject *self, PyObject *arg)`
`250`	`264`	`{`
`251`		`- Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE];`
`252`		`- return unicode_case_operation(arg, PyUCS4_ToLower, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE);`
	`265`	`+ return unicode_case_operation(arg, PyUCS4_ToLower, 0);`
`253`	`266`	`}`
`254`	`267`
`255`	`268`
`256`	`269`	`/* Test PyUCS4_ToUpper() */`
`257`	`270`	`static PyObject *`
`258`	`271`	`unicode_toupper(PyObject *self, PyObject *arg)`
`259`	`272`	`{`
`260`		`- Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE];`
`261`		`- return unicode_case_operation(arg, PyUCS4_ToUpper, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE);`
	`273`	`+ return unicode_case_operation(arg, PyUCS4_ToUpper, 0);`
`262`	`274`	`}`
`263`	`275`
`264`	`276`	`/* Test PyUCS4_ToUpper() with a small buffer */`
`265`	`277`	`static PyObject *`
`266`	`278`	`unicode_toupper_buffer_too_small(PyObject *self, PyObject *arg)`
`267`	`279`	`{`
`268`		`- Py_UCS4 buf;`
`269`		`- return unicode_case_operation(arg, PyUCS4_ToUpper, &buf, 1);`
	`280`	`+ return unicode_case_operation(arg, PyUCS4_ToUpper, 1);`
`270`	`281`	`}`
`271`	`282`
`272`	`283`	`/* Test PyUCS4_ToLower() */`
`273`	`284`	`static PyObject *`
`274`	`285`	`unicode_totitle(PyObject *self, PyObject *arg)`
`275`	`286`	`{`
`276`		`- Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE];`
`277`		`- return unicode_case_operation(arg, PyUCS4_ToTitle, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE);`
	`287`	`+ return unicode_case_operation(arg, PyUCS4_ToTitle, 0);`
`278`	`288`	`}`
`279`	`289`
`280`	`290`	`/* Test PyUCS4_ToLower() */`
`281`	`291`	`static PyObject *`
`282`	`292`	`unicode_tofolded(PyObject *self, PyObject *arg)`
`283`	`293`	`{`
`284`		`- Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE];`
`285`		`- return unicode_case_operation(arg, PyUCS4_ToFolded, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE);`
	`294`	`+ return unicode_case_operation(arg, PyUCS4_ToFolded, 0);`
`286`	`295`	`}`
`287`	`296`
`288`	`297`