Skip to content

Commit ef8264c

Browse files
committed
Replace Py_UCS4 with (const Py_UCS4*, Py_ssize_t)
1 parent 3c96475 commit ef8264c

File tree

6 files changed

+203
-137
lines changed

6 files changed

+203
-137
lines changed

Doc/c-api/unicode.rst

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -307,53 +307,61 @@ These APIs can be used for fast direct character conversions:
307307
possible. This function does not raise exceptions.
308308
309309
310-
.. c:function:: Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size)
310+
.. c:function:: Py_ssize_t PyUCS4_ToLower(const Py_UCS4 *str, Py_ssize_t str_size, Py_UCS4 *buffer, Py_ssize_t buf_size)
311311
312-
Convert *ch* to lower case, store result in *buffer*, which should be
313-
able to hold as many characters needed for *ch* to be lower cased, and
312+
Convert *str* characters to lower case, store result in *buffer*, which should be
313+
able to hold as many characters as needed for *str* to be lower cased, and
314314
return the number of characters stored. If at some point a buffer overflow
315315
is detected, a :exc:`ValueError` is raised and ``-1`` is returned.
316316
317-
In Unicode 16.0, any character can be lowercased into a buffer of *size* ``2``.
317+
*str_size*, *buf_size* and the return value are all counted in UCS-4 characters.
318+
319+
In Unicode 16.0, any character can be lowercased into a buffer of *buf_size* ``2``.
318320
See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`.
319321
320322
.. versionadded:: next
321323
322324
323-
.. c:function:: Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size)
325+
.. c:function:: Py_ssize_t PyUCS4_ToUpper(const Py_UCS4 *str, Py_ssize_t str_size, Py_UCS4 *buffer, Py_ssize_t buf_size)
324326
325-
Convert *ch* to upper case, store result in *buffer*, which should be
326-
able to hold as many characters needed for *ch* to be upper cased, and
327+
Convert *str* characters to upper case, store result in *buffer*, which should be
328+
able to hold as many characters as needed for *str* to be upper cased, and
327329
return the number of characters stored. If at some point a buffer overflow
328330
is detected, a :exc:`ValueError` is raised and ``-1`` is returned.
329331
330-
In Unicode 16.0, any character can be uppercased into a buffer of *size* ``3``.
332+
*str_size*, *buf_size* and the return value are all counted in UCS-4 characters.
333+
334+
In Unicode 16.0, any character can be uppercased into a buffer of *buf_size* ``3``.
331335
See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`.
332336
333337
.. versionadded:: next
334338
335339
336-
.. c:function:: Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size)
340+
.. c:function:: Py_ssize_t PyUCS4_ToTitle(const Py_UCS4 *str, Py_ssize_t str_size, Py_UCS4 *buffer, Py_ssize_t buf_size)
337341
338-
Convert *ch* to title case, store result in *buffer*, which should be
339-
able to hold as many characters needed for *ch* to be title cased, and
342+
Convert *str* characters to title case, store result in *buffer*, which should be
343+
able to hold as many characters as needed for *str* to be title cased, and
340344
return the number of characters stored. If at some point a buffer overflow
341345
is detected, a :exc:`ValueError` is raised and ``-1`` is returned.
342346
343-
In Unicode 16.0, any character can be titlecased into a buffer of *size* ``3``.
347+
*str_size*, *buf_size* and the return value are all counted in UCS-4 characters.
348+
349+
In Unicode 16.0, any character can be titlecased into a buffer of *buf_size* ``3``.
344350
See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`.
345351
346352
.. versionadded:: next
347353
348354
349-
.. c:function:: Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size)
355+
.. c:function:: Py_ssize_t PyUCS4_ToFolded(const Py_UCS4 *str, Py_ssize_t str_size, Py_UCS4 *buffer, Py_ssize_t buf_size)
350356
351-
Foldcase *ch*, store result in *buffer*, which should be
352-
able to hold as many characters needed for *ch* to be foldcased, and
357+
Foldcase *str* characters, store result in *buffer*, which should be
358+
able to hold as many characters as needed for *str* to be foldcased, and
353359
return the number of characters stored. If at some point a buffer overflow
354360
is detected, a :exc:`ValueError` is raised and ``-1`` is returned.
355361
356-
In Unicode 16.0, any character can be foldcased into a buffer of *size* ``3``.
362+
*str_size*, *buf_size* and the return value are all counted in UCS-4 characters.
363+
364+
In Unicode 16.0, any character can be foldcased into a buffer of *buf_size* ``3``.
357365
See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`.
358366
359367
.. versionadded:: next

Include/cpython/unicodeobject.h

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -734,28 +734,28 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
734734
);
735735

736736
PyAPI_FUNC(Py_ssize_t) PyUCS4_ToLower(
737-
Py_UCS4 ch, /* Unicode character */
738-
Py_UCS4 *res, /* Output buffer */
739-
Py_ssize_t size /* Buffer size */
740-
);
737+
const Py_UCS4 *str, /* Unicode string */
738+
Py_ssize_t str_size, /* Unicode string size (UCS-4 characters) */
739+
Py_UCS4 *buf, /* Output buffer */
740+
Py_ssize_t buf_size); /* Buffer size (UCS-4 characters) */
741741

742742
PyAPI_FUNC(Py_ssize_t) PyUCS4_ToUpper(
743-
Py_UCS4 ch, /* Unicode character */
744-
Py_UCS4 *res, /* Output buffer */
745-
Py_ssize_t size /* Buffer size */
746-
);
743+
const Py_UCS4 *str, /* Unicode string */
744+
Py_ssize_t str_size, /* Unicode string size (UCS-4 characters) */
745+
Py_UCS4 *buf, /* Output buffer */
746+
Py_ssize_t buf_size); /* Buffer size (UCS-4 characters) */
747747

748748
PyAPI_FUNC(Py_ssize_t) PyUCS4_ToTitle(
749-
Py_UCS4 ch, /* Unicode character */
750-
Py_UCS4 *res, /* Output buffer */
751-
Py_ssize_t size /* Buffer size */
752-
);
749+
const Py_UCS4 *str, /* Unicode string */
750+
Py_ssize_t str_size, /* Unicode string size (UCS-4 characters) */
751+
Py_UCS4 *buf, /* Output buffer */
752+
Py_ssize_t buf_size); /* Buffer size (UCS-4 characters) */
753753

754754
PyAPI_FUNC(Py_ssize_t) PyUCS4_ToFolded(
755-
Py_UCS4 ch, /* Unicode character */
756-
Py_UCS4 *res, /* Output buffer */
757-
Py_ssize_t size /* Buffer size */
758-
);
755+
const Py_UCS4 *str, /* Unicode string */
756+
Py_ssize_t str_size, /* Unicode string size (UCS-4 characters) */
757+
Py_UCS4 *buf, /* Output buffer */
758+
Py_ssize_t buf_size); /* Buffer size (UCS-4 characters) */
759759

760760

761761
// Helper array used by Py_UNICODE_ISSPACE().

Lib/test/test_capi/test_unicode.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import unittest
22
import sys
3+
import string
34
from test import support
45
from test.support import threading_helper
56

@@ -1756,31 +1757,29 @@ def test_GET_CACHED_HASH(self):
17561757
@support.cpython_only
17571758
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
17581759
def test_tolower(self):
1759-
import string
17601760
from _testcapi import unicode_tolower
17611761

1762-
for i, c in enumerate(string.ascii_uppercase):
1763-
with self.subTest(c):
1764-
self.assertEqual(unicode_tolower(c), string.ascii_lowercase[i])
1762+
self.assertEqual(unicode_tolower(string.ascii_uppercase),
1763+
string.ascii_lowercase)
17651764

17661765
# Test unicode character
17671766
self.assertEqual(unicode_tolower("Č"), "č")
17681767
self.assertEqual(unicode_tolower("Σ"), "σ")
1768+
self.assertEqual(unicode_tolower("ABCΣ"), "abcσ")
17691769

17701770
@support.cpython_only
17711771
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
17721772
def test_toupper(self):
1773-
import string
17741773
from _testcapi import unicode_toupper, unicode_toupper_buffer_too_small
17751774

1776-
for i, c in enumerate(string.ascii_lowercase):
1777-
with self.subTest(c):
1778-
self.assertEqual(unicode_toupper(c), string.ascii_uppercase[i])
1775+
self.assertEqual(unicode_toupper(string.ascii_lowercase),
1776+
string.ascii_uppercase)
17791777

17801778
# Test unicode character
17811779
self.assertEqual(unicode_toupper("č"), "Č")
17821780
self.assertEqual(unicode_toupper("ß"), "SS")
17831781
self.assertEqual(unicode_toupper("ΐ"), "Ϊ́")
1782+
self.assertEqual(unicode_toupper("abcß"), "ABCSS")
17841783

17851784
# Test unicode character with smaller buffer
17861785
with self.assertRaisesRegex(ValueError, "output buffer is too small"):
@@ -1797,6 +1796,7 @@ def test_totitle(self):
17971796
self.assertEqual(unicode_totitle("ł"), "Ł")
17981797
self.assertEqual(unicode_totitle("ß"), "Ss")
17991798
self.assertEqual(unicode_totitle("ΐ"), "Ϊ́")
1799+
self.assertEqual(unicode_totitle("abcß"), "ABCSs")
18001800

18011801
@support.cpython_only
18021802
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
@@ -1808,6 +1808,8 @@ def test_tofolded(self):
18081808
# Test unicode character
18091809
self.assertEqual(unicode_tofolded("Ł"), "ł")
18101810
self.assertEqual(unicode_tofolded("Σ"), "σ")
1811+
self.assertEqual(unicode_tofolded("abcΣ"), "abcσ")
1812+
self.assertEqual(unicode_tofolded("ABCσ"), "abcσ")
18111813

18121814
# Test uncased character (case folding leaves it unchanged)
18131815
self.assertEqual(unicode_tofolded("👍"), "👍")

Modules/_testcapi/unicode.c

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -221,22 +221,36 @@ unicode_copycharacters(PyObject *self, PyObject *args)
221221
}
222222

223223
static PyObject *
224-
unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_ssize_t),
225-
Py_UCS4 *buf, Py_ssize_t size)
224+
unicode_case_operation(PyObject *str,
225+
Py_ssize_t (*function)(const Py_UCS4*, Py_ssize_t, Py_UCS4 *, Py_ssize_t),
226+
int buf_too_small)
226227
{
227228
if (!PyUnicode_Check(str)) {
228229
PyErr_Format(PyExc_TypeError, "expect str type, got %T", str);
229230
return NULL;
230231
}
232+
Py_ssize_t len = PyUnicode_GET_LENGTH(str);
231233

232-
if (PyUnicode_GET_LENGTH(str) != 1) {
233-
PyErr_SetString(PyExc_ValueError, "expecting 1-character strings only");
234+
Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(str);
235+
if (ucs4 == NULL) {
234236
return NULL;
235237
}
236238

237-
Py_UCS4 c = PyUnicode_READ_CHAR(str, 0);
239+
Py_ssize_t buf_size;
240+
if (!buf_too_small) {
241+
buf_size = len * PyUCS4_CASE_CONVERSION_BUFFER_SIZE;
242+
}
243+
else {
244+
buf_size = len * 1;
245+
}
246+
Py_UCS4 *buf = PyMem_Malloc(buf_size * sizeof(Py_UCS4));
247+
if (buf == NULL) {
248+
PyMem_Free(ucs4);
249+
return NULL;
250+
}
238251

239-
Py_ssize_t chars = function(c, buf, size);
252+
Py_ssize_t chars = function(ucs4, len, buf, buf_size);
253+
PyMem_Free(ucs4);
240254
if (chars < 0) {
241255
return NULL;
242256
}
@@ -248,41 +262,36 @@ unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *,
248262
static PyObject *
249263
unicode_tolower(PyObject *self, PyObject *arg)
250264
{
251-
Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE];
252-
return unicode_case_operation(arg, PyUCS4_ToLower, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE);
265+
return unicode_case_operation(arg, PyUCS4_ToLower, 0);
253266
}
254267

255268

256269
/* Test PyUCS4_ToUpper() */
257270
static PyObject *
258271
unicode_toupper(PyObject *self, PyObject *arg)
259272
{
260-
Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE];
261-
return unicode_case_operation(arg, PyUCS4_ToUpper, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE);
273+
return unicode_case_operation(arg, PyUCS4_ToUpper, 0);
262274
}
263275

264276
/* Test PyUCS4_ToUpper() with a small buffer */
265277
static PyObject *
266278
unicode_toupper_buffer_too_small(PyObject *self, PyObject *arg)
267279
{
268-
Py_UCS4 buf;
269-
return unicode_case_operation(arg, PyUCS4_ToUpper, &buf, 1);
280+
return unicode_case_operation(arg, PyUCS4_ToUpper, 1);
270281
}
271282

272283
/* Test PyUCS4_ToTitle() */
273284
static PyObject *
274285
unicode_totitle(PyObject *self, PyObject *arg)
275286
{
276-
Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE];
277-
return unicode_case_operation(arg, PyUCS4_ToTitle, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE);
287+
return unicode_case_operation(arg, PyUCS4_ToTitle, 0);
278288
}
279289

280290
/* Test PyUCS4_ToFolded() */
281291
static PyObject *
282292
unicode_tofolded(PyObject *self, PyObject *arg)
283293
{
284-
Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE];
285-
return unicode_case_operation(arg, PyUCS4_ToFolded, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE);
294+
return unicode_case_operation(arg, PyUCS4_ToFolded, 0);
286295
}
287296

288297

0 commit comments

Comments
 (0)