Skip to content

Commit d604fc8

Browse files
committed
Address feedback; add size parameter and do PyUnicode_ToFolded as well
1 parent 431abba commit d604fc8

File tree

5 files changed

+91
-36
lines changed

5 files changed

+91
-36
lines changed

Doc/c-api/unicode.rst

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -307,36 +307,51 @@ These APIs can be used for fast direct character conversions:
307307
possible. This function does not raise exceptions.
308308
309309
310-
.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer)
310+
.. c:function:: int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, int size)
311311
312312
Convert *ch* to lower case, store result in *buffer*, which should be
313-
able to hold as many characters needed for *ch* to be lower cased
314-
(maximum three), and return the number of characters stored.
315-
Passing a ``NULL`` buffer returns the buffer size needed.
313+
able to hold as many characters as needed for *ch* to be lower cased, and
314+
return the number of characters stored. *size* is the number of characters
315+
*buffer* can hold. Passing a ``NULL`` buffer returns the buffer size needed.
316+
If *buffer* is not ``NULL`` and *size* is too small to hold the result, an :exc:`OverflowError` is raised and ``-1`` is returned.
316317
317318
.. versionadded:: next
318319
319320
320-
.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer)
321+
.. c:function:: int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, int size)
321322
322-
Convert *ch* to lower case, store result in *buffer*, which should be
323-
able to hold as many characters needed for *ch* to be lower cased
324-
(maximum three), and return the number of characters stored.
325-
Passing a ``NULL`` buffer returns the buffer size needed.
323+
Convert *ch* to upper case, store the result in *buffer*, which should be
324+
able to hold as many characters as needed for *ch* to be upper cased, and
325+
return the number of characters stored. *size* is the number of characters
326+
*buffer* can hold. Passing a ``NULL`` buffer returns the buffer size needed.
327+
If *buffer* is not ``NULL`` and *size* is too small to hold the result, an :exc:`OverflowError` is raised and ``-1`` is returned.
326328
327329
.. versionadded:: next
328330
329331
330-
.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer)
332+
.. c:function:: int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, int size)
331333
332-
Convert *ch* to lower case, store result in *buffer*, which should be
333-
able to hold as many characters needed for *ch* to be lower cased
334-
(maximum three), and return the number of characters stored.
335-
Passing a ``NULL`` buffer returns the buffer size needed.
334+
Convert *ch* to title case, store the result in *buffer*, which should be
335+
able to hold as many characters as needed for *ch* to be title cased, and
336+
return the number of characters stored. *size* is the number of characters
337+
*buffer* can hold. Passing a ``NULL`` buffer returns the buffer size needed.
338+
If *buffer* is not ``NULL`` and *size* is too small to hold the result, an :exc:`OverflowError` is raised and ``-1`` is returned.
339+
340+
.. versionadded:: next
341+
342+
343+
.. c:function:: int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, int size)
344+
345+
Case-fold *ch*, store the result in *buffer*, which should be
346+
able to hold as many characters as needed for *ch* to be case-folded, and
347+
return the number of characters stored. *size* is the number of characters
348+
*buffer* can hold. Passing a ``NULL`` buffer returns the buffer size needed.
349+
If *buffer* is not ``NULL`` and *size* is too small to hold the result, an :exc:`OverflowError` is raised and ``-1`` is returned.
336350
337351
.. versionadded:: next
338352
339353
354+
340355
These APIs can be used to work with surrogates:
341356
342357
.. c:function:: int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch)

Include/cpython/unicodeobject.h

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -735,19 +735,29 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
735735

736736
PyAPI_FUNC(int) PyUnicode_ToLower(
737737
Py_UCS4 ch, /* Unicode character */
738-
Py_UCS4 *res /* Output buffer */
738+
Py_UCS4 *res, /* Output buffer */
739+
int size /* Buffer size */
739740
);
740741

741742
PyAPI_FUNC(int) PyUnicode_ToUpper(
742743
Py_UCS4 ch, /* Unicode character */
743-
Py_UCS4 *res /* Output buffer */
744+
Py_UCS4 *res, /* Output buffer */
745+
int size /* Buffer size */
744746
);
745747

746748
PyAPI_FUNC(int) PyUnicode_ToTitle(
747749
Py_UCS4 ch, /* Unicode character */
748-
Py_UCS4 *res /* Output buffer */
750+
Py_UCS4 *res, /* Output buffer */
751+
int size /* Buffer size */
749752
);
750753

754+
PyAPI_FUNC(int) PyUnicode_ToFolded(
755+
Py_UCS4 ch, /* Unicode character */
756+
Py_UCS4 *res, /* Output buffer */
757+
int size /* Buffer size */
758+
);
759+
760+
751761
// Helper array used by Py_UNICODE_ISSPACE().
752762
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
753763

Include/internal/pycore_unicodeobject.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ extern "C" {
1515

1616
extern int _PyUnicode_IsXidStart(Py_UCS4 ch);
1717
extern int _PyUnicode_IsXidContinue(Py_UCS4 ch);
18-
extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res);
1918
extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch);
2019
extern int _PyUnicode_IsCased(Py_UCS4 ch);
2120

Objects/unicodectype.c

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
198198
return ch + ctype->lower;
199199
}
200200

201-
int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res)
201+
int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size)
202202
{
203203
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
204204

@@ -208,19 +208,27 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res)
208208
int i;
209209
for (i = 0; i < n; i++) {
210210
if (res != NULL) {
211+
if (i >= size) {
212+
PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
213+
return -1;
214+
}
211215
res[i] = _PyUnicode_ExtendedCase[index + i];
212216
}
213217
}
214218
return n;
215219
}
216220

217221
if (res != NULL) {
222+
if (0 >= size) {
223+
PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
224+
return -1;
225+
}
218226
res[0] = ch + ctype->lower;
219227
}
220228
return 1;
221229
}
222230

223-
int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res)
231+
int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size)
224232
{
225233
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
226234

@@ -230,18 +238,26 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res)
230238
int i;
231239
for (i = 0; i < n; i++) {
232240
if (res != NULL) {
241+
if (i >= size) {
242+
PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
243+
return -1;
244+
}
233245
res[i] = _PyUnicode_ExtendedCase[index + i];
234246
}
235247
}
236248
return n;
237249
}
238250
if (res != NULL) {
251+
if (0 >= size) {
252+
PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
253+
return -1;
254+
}
239255
res[0] = ch + ctype->title;
240256
}
241257
return 1;
242258
}
243259

244-
int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res)
260+
int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size)
245261
{
246262
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
247263

@@ -251,30 +267,45 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res)
251267
int i;
252268
for (i = 0; i < n; i++) {
253269
if (res != NULL) {
270+
if (i >= size) {
271+
PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
272+
return -1;
273+
}
254274
res[i] = _PyUnicode_ExtendedCase[index + i];
255275
}
256276
}
257277
return n;
258278
}
259279
if (res != NULL) {
280+
if (0 >= size) {
281+
PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
282+
return -1;
283+
}
260284
res[0] = ch + ctype->upper;
261285
}
262286
return 1;
263287
}
264288

265-
int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
289+
int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, int size)
266290
{
267291
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
268292

269293
if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {
270294
int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
271295
int n = (ctype->lower >> 20) & 7;
272296
int i;
273-
for (i = 0; i < n; i++)
274-
res[i] = _PyUnicode_ExtendedCase[index + i];
297+
for (i = 0; i < n; i++) {
298+
if (res != NULL) {
299+
if (i >= size) {
300+
PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
301+
return -1;
302+
}
303+
res[i] = _PyUnicode_ExtendedCase[index + i];
304+
}
305+
}
275306
return n;
276307
}
277-
return PyUnicode_ToLowerFull(ch, res);
308+
return PyUnicode_ToLower(ch, res, size);
278309
}
279310

280311
int _PyUnicode_IsCased(Py_UCS4 ch)

Objects/unicodeobject.c

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10039,14 +10039,14 @@ handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i
1003910039

1004010040
static int
1004110041
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
10042-
Py_UCS4 c, Py_UCS4 *mapped)
10042+
Py_UCS4 c, Py_UCS4 *mapped, int mapped_size)
1004310043
{
1004410044
/* Obscure special case. */
1004510045
if (c == 0x3A3) {
1004610046
mapped[0] = handle_capital_sigma(kind, data, length, i);
1004710047
return 1;
1004810048
}
10049-
return PyUnicode_ToLower(c, mapped);
10049+
return PyUnicode_ToLower(c, mapped, mapped_size);
1005010050
}
1005110051

1005210052
static Py_ssize_t
@@ -10057,14 +10057,14 @@ do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UC
1005710057
Py_UCS4 c, mapped[3];
1005810058

1005910059
c = PyUnicode_READ(kind, data, 0);
10060-
n_res = PyUnicode_ToTitle(c, mapped);
10060+
n_res = PyUnicode_ToTitle(c, mapped, 3);
1006110061
for (j = 0; j < n_res; j++) {
1006210062
*maxchar = Py_MAX(*maxchar, mapped[j]);
1006310063
res[k++] = mapped[j];
1006410064
}
1006510065
for (i = 1; i < length; i++) {
1006610066
c = PyUnicode_READ(kind, data, i);
10067-
n_res = lower_ucs4(kind, data, length, i, c, mapped);
10067+
n_res = lower_ucs4(kind, data, length, i, c, mapped, 3);
1006810068
for (j = 0; j < n_res; j++) {
1006910069
*maxchar = Py_MAX(*maxchar, mapped[j]);
1007010070
res[k++] = mapped[j];
@@ -10081,10 +10081,10 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4
1008110081
Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
1008210082
int n_res, j;
1008310083
if (Py_UNICODE_ISUPPER(c)) {
10084-
n_res = lower_ucs4(kind, data, length, i, c, mapped);
10084+
n_res = lower_ucs4(kind, data, length, i, c, mapped, 3);
1008510085
}
1008610086
else if (Py_UNICODE_ISLOWER(c)) {
10087-
n_res = PyUnicode_ToUpper(c, mapped);
10087+
n_res = PyUnicode_ToUpper(c, mapped, 3);
1008810088
}
1008910089
else {
1009010090
n_res = 1;
@@ -10108,9 +10108,9 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
1010810108
Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
1010910109
int n_res, j;
1011010110
if (lower)
10111-
n_res = lower_ucs4(kind, data, length, i, c, mapped);
10111+
n_res = lower_ucs4(kind, data, length, i, c, mapped, 3);
1011210112
else
10113-
n_res = PyUnicode_ToUpper(c, mapped);
10113+
n_res = PyUnicode_ToUpper(c, mapped, 3);
1011410114
for (j = 0; j < n_res; j++) {
1011510115
*maxchar = Py_MAX(*maxchar, mapped[j]);
1011610116
res[k++] = mapped[j];
@@ -10139,7 +10139,7 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4
1013910139
for (i = 0; i < length; i++) {
1014010140
Py_UCS4 c = PyUnicode_READ(kind, data, i);
1014110141
Py_UCS4 mapped[3];
10142-
int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10142+
int j, n_res = PyUnicode_ToFolded(c, mapped, 3);
1014310143
for (j = 0; j < n_res; j++) {
1014410144
*maxchar = Py_MAX(*maxchar, mapped[j]);
1014510145
res[k++] = mapped[j];
@@ -10161,9 +10161,9 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m
1016110161
int n_res, j;
1016210162

1016310163
if (previous_is_cased)
10164-
n_res = lower_ucs4(kind, data, length, i, c, mapped);
10164+
n_res = lower_ucs4(kind, data, length, i, c, mapped, 3);
1016510165
else
10166-
n_res = PyUnicode_ToTitle(c, mapped);
10166+
n_res = PyUnicode_ToTitle(c, mapped, 3);
1016710167

1016810168
for (j = 0; j < n_res; j++) {
1016910169
*maxchar = Py_MAX(*maxchar, mapped[j]);

0 commit comments

Comments
 (0)