Skip to content

Commit b8384ae

Browse files
lysnikolaou and vstinner
authored and committed
Address feedback; add size parameter and do PyUnicode_ToFolded as well
1 parent 5c44aca commit b8384ae

File tree

5 files changed

+91
-36
lines changed

5 files changed

+91
-36
lines changed

Doc/c-api/unicode.rst

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -307,36 +307,51 @@ These APIs can be used for fast direct character conversions:
307307
possible. This function does not raise exceptions.
308308
309309
310-
.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer)
310+
.. c:function:: int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, int size)
311311
312312
Convert *ch* to lower case, store result in *buffer*, which should be
313-
able to hold as many characters needed for *ch* to be lower cased
314-
(maximum three), and return the number of characters stored.
315-
Passing a ``NULL`` buffer returns the buffer size needed.
313+
able to hold as many characters needed for *ch* to be lower cased, and
314+
return the number of characters stored. Passing a ``NULL`` buffer returns
315+
the buffer size needed. If *buffer* is not ``NULL`` and *size* is too small
316+
to hold the result, an :exc:`OverflowError` is raised and ``-1`` is returned.
316317
317318
.. versionadded:: next
318319
319320
320-
.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer)
321+
.. c:function:: int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, int size)
321322
322-
Convert *ch* to lower case, store result in *buffer*, which should be
323-
able to hold as many characters needed for *ch* to be lower cased
324-
(maximum three), and return the number of characters stored.
325-
Passing a ``NULL`` buffer returns the buffer size needed.
323+
Convert *ch* to upper case, store result in *buffer*, which should be
324+
able to hold as many characters needed for *ch* to be upper cased, and
325+
return the number of characters stored. Passing a ``NULL`` buffer returns
326+
the buffer size needed. If *buffer* is not ``NULL`` and *size* is too small
327+
to hold the result, an :exc:`OverflowError` is raised and ``-1`` is returned.
326328
327329
.. versionadded:: next
328330
329331
330-
.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer)
332+
.. c:function:: int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, int size)
331333
332-
Convert *ch* to lower case, store result in *buffer*, which should be
333-
able to hold as many characters needed for *ch* to be lower cased
334-
(maximum three), and return the number of characters stored.
335-
Passing a ``NULL`` buffer returns the buffer size needed.
334+
Convert *ch* to title case, store result in *buffer*, which should be
335+
able to hold as many characters needed for *ch* to be title cased, and
336+
return the number of characters stored. Passing a ``NULL`` buffer returns
337+
the buffer size needed. If *buffer* is not ``NULL`` and *size* is too small
338+
to hold the result, an :exc:`OverflowError` is raised and ``-1`` is returned.
339+
340+
.. versionadded:: next
341+
342+
343+
.. c:function:: int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, int size)
344+
345+
Case-fold *ch*, store result in *buffer*, which should be
346+
able to hold as many characters as are needed for *ch* to be case-folded, and
347+
return the number of characters stored. Passing a ``NULL`` buffer returns
348+
the buffer size needed. If *buffer* is not ``NULL`` and *size* is too small
349+
to hold the result, an :exc:`OverflowError` is raised and ``-1`` is returned.
336350
337351
.. versionadded:: next
338352
339353
354+
340355
These APIs can be used to work with surrogates:
341356
342357
.. c:function:: int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch)

Include/cpython/unicodeobject.h

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -735,19 +735,29 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
735735

736736
PyAPI_FUNC(int) PyUnicode_ToLower(
737737
Py_UCS4 ch, /* Unicode character */
738-
Py_UCS4 *res /* Output buffer */
738+
Py_UCS4 *res, /* Output buffer */
739+
int size /* Buffer size */
739740
);
740741

741742
PyAPI_FUNC(int) PyUnicode_ToUpper(
742743
Py_UCS4 ch, /* Unicode character */
743-
Py_UCS4 *res /* Output buffer */
744+
Py_UCS4 *res, /* Output buffer */
745+
int size /* Buffer size */
744746
);
745747

746748
PyAPI_FUNC(int) PyUnicode_ToTitle(
747749
Py_UCS4 ch, /* Unicode character */
748-
Py_UCS4 *res /* Output buffer */
750+
Py_UCS4 *res, /* Output buffer */
751+
int size /* Buffer size */
749752
);
750753

754+
PyAPI_FUNC(int) PyUnicode_ToFolded(
755+
Py_UCS4 ch, /* Unicode character */
756+
Py_UCS4 *res, /* Output buffer */
757+
int size /* Buffer size */
758+
);
759+
760+
751761
// Helper array used by Py_UNICODE_ISSPACE().
752762
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
753763

Include/internal/pycore_unicodeobject.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ extern "C" {
1515

1616
extern int _PyUnicode_IsXidStart(Py_UCS4 ch);
1717
extern int _PyUnicode_IsXidContinue(Py_UCS4 ch);
18-
extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res);
1918
extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch);
2019
extern int _PyUnicode_IsCased(Py_UCS4 ch);
2120

Objects/unicodectype.c

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
198198
return ch + ctype->lower;
199199
}
200200

201-
int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res)
201+
int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size)
202202
{
203203
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
204204

@@ -208,19 +208,27 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res)
208208
int i;
209209
for (i = 0; i < n; i++) {
210210
if (res != NULL) {
211+
if (i >= size) {
212+
PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
213+
return -1;
214+
}
211215
res[i] = _PyUnicode_ExtendedCase[index + i];
212216
}
213217
}
214218
return n;
215219
}
216220

217221
if (res != NULL) {
222+
if (0 >= size) {
223+
PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
224+
return -1;
225+
}
218226
res[0] = ch + ctype->lower;
219227
}
220228
return 1;
221229
}
222230

223-
int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res)
231+
int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size)
224232
{
225233
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
226234

@@ -230,18 +238,26 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res)
230238
int i;
231239
for (i = 0; i < n; i++) {
232240
if (res != NULL) {
241+
if (i >= size) {
242+
PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
243+
return -1;
244+
}
233245
res[i] = _PyUnicode_ExtendedCase[index + i];
234246
}
235247
}
236248
return n;
237249
}
238250
if (res != NULL) {
251+
if (0 >= size) {
252+
PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
253+
return -1;
254+
}
239255
res[0] = ch + ctype->title;
240256
}
241257
return 1;
242258
}
243259

244-
int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res)
260+
int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size)
245261
{
246262
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
247263

@@ -251,30 +267,45 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res)
251267
int i;
252268
for (i = 0; i < n; i++) {
253269
if (res != NULL) {
270+
if (i >= size) {
271+
PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
272+
return -1;
273+
}
254274
res[i] = _PyUnicode_ExtendedCase[index + i];
255275
}
256276
}
257277
return n;
258278
}
259279
if (res != NULL) {
280+
if (0 >= size) {
281+
PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
282+
return -1;
283+
}
260284
res[0] = ch + ctype->upper;
261285
}
262286
return 1;
263287
}
264288

265-
int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
289+
int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, int size)
266290
{
267291
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
268292

269293
if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {
270294
int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
271295
int n = (ctype->lower >> 20) & 7;
272296
int i;
273-
for (i = 0; i < n; i++)
274-
res[i] = _PyUnicode_ExtendedCase[index + i];
297+
for (i = 0; i < n; i++) {
298+
if (res != NULL) {
299+
if (i >= size) {
300+
PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
301+
return -1;
302+
}
303+
res[i] = _PyUnicode_ExtendedCase[index + i];
304+
}
305+
}
275306
return n;
276307
}
277-
return PyUnicode_ToLowerFull(ch, res);
308+
return PyUnicode_ToLower(ch, res, size);
278309
}
279310

280311
int _PyUnicode_IsCased(Py_UCS4 ch)

Objects/unicodeobject.c

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9996,14 +9996,14 @@ handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i
99969996

99979997
static int
99989998
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9999-
Py_UCS4 c, Py_UCS4 *mapped)
9999+
Py_UCS4 c, Py_UCS4 *mapped, int mapped_size)
1000010000
{
1000110001
/* Obscure special case. */
1000210002
if (c == 0x3A3) {
1000310003
mapped[0] = handle_capital_sigma(kind, data, length, i);
1000410004
return 1;
1000510005
}
10006-
return PyUnicode_ToLower(c, mapped);
10006+
return PyUnicode_ToLower(c, mapped, mapped_size);
1000710007
}
1000810008

1000910009
static Py_ssize_t
@@ -10014,14 +10014,14 @@ do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UC
1001410014
Py_UCS4 c, mapped[3];
1001510015

1001610016
c = PyUnicode_READ(kind, data, 0);
10017-
n_res = PyUnicode_ToTitle(c, mapped);
10017+
n_res = PyUnicode_ToTitle(c, mapped, 3);
1001810018
for (j = 0; j < n_res; j++) {
1001910019
*maxchar = Py_MAX(*maxchar, mapped[j]);
1002010020
res[k++] = mapped[j];
1002110021
}
1002210022
for (i = 1; i < length; i++) {
1002310023
c = PyUnicode_READ(kind, data, i);
10024-
n_res = lower_ucs4(kind, data, length, i, c, mapped);
10024+
n_res = lower_ucs4(kind, data, length, i, c, mapped, 3);
1002510025
for (j = 0; j < n_res; j++) {
1002610026
*maxchar = Py_MAX(*maxchar, mapped[j]);
1002710027
res[k++] = mapped[j];
@@ -10038,10 +10038,10 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4
1003810038
Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
1003910039
int n_res, j;
1004010040
if (Py_UNICODE_ISUPPER(c)) {
10041-
n_res = lower_ucs4(kind, data, length, i, c, mapped);
10041+
n_res = lower_ucs4(kind, data, length, i, c, mapped, 3);
1004210042
}
1004310043
else if (Py_UNICODE_ISLOWER(c)) {
10044-
n_res = PyUnicode_ToUpper(c, mapped);
10044+
n_res = PyUnicode_ToUpper(c, mapped, 3);
1004510045
}
1004610046
else {
1004710047
n_res = 1;
@@ -10065,9 +10065,9 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
1006510065
Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
1006610066
int n_res, j;
1006710067
if (lower)
10068-
n_res = lower_ucs4(kind, data, length, i, c, mapped);
10068+
n_res = lower_ucs4(kind, data, length, i, c, mapped, 3);
1006910069
else
10070-
n_res = PyUnicode_ToUpper(c, mapped);
10070+
n_res = PyUnicode_ToUpper(c, mapped, 3);
1007110071
for (j = 0; j < n_res; j++) {
1007210072
*maxchar = Py_MAX(*maxchar, mapped[j]);
1007310073
res[k++] = mapped[j];
@@ -10096,7 +10096,7 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4
1009610096
for (i = 0; i < length; i++) {
1009710097
Py_UCS4 c = PyUnicode_READ(kind, data, i);
1009810098
Py_UCS4 mapped[3];
10099-
int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10099+
int j, n_res = PyUnicode_ToFolded(c, mapped, 3);
1010010100
for (j = 0; j < n_res; j++) {
1010110101
*maxchar = Py_MAX(*maxchar, mapped[j]);
1010210102
res[k++] = mapped[j];
@@ -10118,9 +10118,9 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m
1011810118
int n_res, j;
1011910119

1012010120
if (previous_is_cased)
10121-
n_res = lower_ucs4(kind, data, length, i, c, mapped);
10121+
n_res = lower_ucs4(kind, data, length, i, c, mapped, 3);
1012210122
else
10123-
n_res = PyUnicode_ToTitle(c, mapped);
10123+
n_res = PyUnicode_ToTitle(c, mapped, 3);
1012410124

1012510125
for (j = 0; j < n_res; j++) {
1012610126
*maxchar = Py_MAX(*maxchar, mapped[j]);

0 commit comments

Comments
 (0)