Address feedback; add size parameter and do PyUnicode_ToFolded as well

lysnikolaou · vstinner · commit b8384aebe136 · 2025-09-25T18:00:02.000+02:00
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
@@ -307,36 +307,51 @@ These APIs can be used for fast direct character conversions:
    possible.  This function does not raise exceptions.
 
 
-.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer)
+.. c:function:: Py_ssize_t PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, int size)
 
    Convert *ch* to lower case, store result in *buffer*, which should be
-   able to hold as many characters needed for *ch* to be lower cased
-   (maximum three), and return the number of characters stored.
-   Passing a ``NULL`` buffer returns the buffer size needed.
+   able to hold as many characters as needed for *ch* to be lower cased, and
+   return the number of characters stored. Passing a ``NULL`` buffer returns
+   the buffer size needed. If *buffer* is not ``NULL`` and *size* is too small
+   to hold the result, :exc:`OverflowError` is raised and ``-1`` is returned.
 
    .. versionadded:: next
 
 
-.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer)
+.. c:function:: Py_ssize_t PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, int size)
 
-   Convert *ch* to lower case, store result in *buffer*, which should be
-   able to hold as many characters needed for *ch* to be lower cased
-   (maximum three), and return the number of characters stored.
-   Passing a ``NULL`` buffer returns the buffer size needed.
+   Convert *ch* to upper case, store result in *buffer*, which should be
+   able to hold as many characters as needed for *ch* to be upper cased, and
+   return the number of characters stored. Passing a ``NULL`` buffer returns
+   the buffer size needed. If *buffer* is not ``NULL`` and *size* is too small
+   to hold the result, :exc:`OverflowError` is raised and ``-1`` is returned.
 
    .. versionadded:: next
 
 
-.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer)
+.. c:function:: Py_ssize_t PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, int size)
 
-   Convert *ch* to lower case, store result in *buffer*, which should be
-   able to hold as many characters needed for *ch* to be lower cased
-   (maximum three), and return the number of characters stored.
-   Passing a ``NULL`` buffer returns the buffer size needed.
+   Convert *ch* to title case, store result in *buffer*, which should be
+   able to hold as many characters as needed for *ch* to be title cased, and
+   return the number of characters stored. Passing a ``NULL`` buffer returns
+   the buffer size needed. If *buffer* is not ``NULL`` and *size* is too small
+   to hold the result, :exc:`OverflowError` is raised and ``-1`` is returned.
+
+   .. versionadded:: next
+
+
+.. c:function:: Py_ssize_t PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, int size)
+
+   Foldcase *ch*, store result in *buffer*, which should be
+   able to hold as many characters as needed for *ch* to be foldcased, and
+   return the number of characters stored. Passing a ``NULL`` buffer returns
+   the buffer size needed. If *buffer* is not ``NULL`` and *size* is too small
+   to hold the result, :exc:`OverflowError` is raised and ``-1`` is returned.
 
    .. versionadded:: next
 
 
+
 These APIs can be used to work with surrogates:
 
 .. c:function:: int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch)
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
@@ -735,19 +735,29 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
 
 PyAPI_FUNC(int) PyUnicode_ToLower(
     Py_UCS4 ch,     /* Unicode character */
-    Py_UCS4 *res    /* Output buffer */
+    Py_UCS4 *res,   /* Output buffer */
+    int size        /* Buffer size */
     );
 
 PyAPI_FUNC(int) PyUnicode_ToUpper(
     Py_UCS4 ch,     /* Unicode character */
-    Py_UCS4 *res    /* Output buffer */
+    Py_UCS4 *res,   /* Output buffer */
+    int size        /* Buffer size */
     );
 
 PyAPI_FUNC(int) PyUnicode_ToTitle(
     Py_UCS4 ch,     /* Unicode character */
-    Py_UCS4 *res    /* Output buffer */
+    Py_UCS4 *res,   /* Output buffer */
+    int size        /* Buffer size */
     );
 
+PyAPI_FUNC(int) PyUnicode_ToFolded(
+    Py_UCS4 ch,     /* Unicode character */
+    Py_UCS4 *res,   /* Output buffer */
+    int size        /* Buffer size */
+    );
+
+
 // Helper array used by Py_UNICODE_ISSPACE().
 PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
 
diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h
@@ -15,7 +15,6 @@ extern "C" {
 
 extern int _PyUnicode_IsXidStart(Py_UCS4 ch);
 extern int _PyUnicode_IsXidContinue(Py_UCS4 ch);
-extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res);
 extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch);
 extern int _PyUnicode_IsCased(Py_UCS4 ch);
 
diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c
@@ -198,7 +198,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
     return ch + ctype->lower;
 }
 
-int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res)
+int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size)
 {
     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 
@@ -208,19 +208,27 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res)
         int i;
         for (i = 0; i < n; i++) {
             if (res != NULL) {
+                if (i >= size) {
+                    PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
+                    return -1;
+                }
                 res[i] = _PyUnicode_ExtendedCase[index + i];
             }
         }
         return n;
     }
 
     if (res != NULL) {
+        if (0 >= size) {
+            PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
+            return -1;
+        }
         res[0] = ch + ctype->lower;
     }
     return 1;
 }
 
-int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res)
+int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size)
 {
     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 
@@ -230,18 +238,26 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res)
         int i;
         for (i = 0; i < n; i++) {
             if (res != NULL) {
+                if (i >= size) {
+                    PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
+                    return -1;
+                }
                 res[i] = _PyUnicode_ExtendedCase[index + i];
             }
         }
         return n;
     }
     if (res != NULL) {
+        if (0 >= size) {
+            PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
+            return -1;
+        }
         res[0] = ch + ctype->title;
     }
     return 1;
 }
 
-int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res)
+int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size)
 {
     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 
@@ -251,30 +267,45 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res)
         int i;
         for (i = 0; i < n; i++) {
             if (res != NULL) {
+                if (i >= size) {
+                    PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
+                    return -1;
+                }
                 res[i] = _PyUnicode_ExtendedCase[index + i];
             }
         }
         return n;
     }
     if (res != NULL) {
+        if (0 >= size) {
+            PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
+            return -1;
+        }
         res[0] = ch + ctype->upper;
     }
     return 1;
 }
 
-int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
+int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, int size)
 {
     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 
     if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {
         int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
         int n = (ctype->lower >> 20) & 7;
         int i;
-        for (i = 0; i < n; i++)
-            res[i] = _PyUnicode_ExtendedCase[index + i];
+        for (i = 0; i < n; i++) {
+            if (res != NULL) {
+                if (i >= size) {
+                    PyErr_SetString(PyExc_OverflowError, "output buffer is too small");
+                    return -1;
+                }
+                res[i] = _PyUnicode_ExtendedCase[index + i];
+            }
+        }
         return n;
     }
-    return PyUnicode_ToLowerFull(ch, res);
+    return PyUnicode_ToLower(ch, res, size);
 }
 
 int _PyUnicode_IsCased(Py_UCS4 ch)
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -9996,14 +9996,14 @@ handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i
 
 static int
 lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
-           Py_UCS4 c, Py_UCS4 *mapped)
+           Py_UCS4 c, Py_UCS4 *mapped, int mapped_size)
 {
     /* Obscure special case. */
     if (c == 0x3A3) {
         mapped[0] = handle_capital_sigma(kind, data, length, i);
         return 1;
     }
-    return PyUnicode_ToLower(c, mapped);
+    return PyUnicode_ToLower(c, mapped, mapped_size);
 }
 
 static Py_ssize_t
@@ -10014,14 +10014,14 @@ do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UC
     Py_UCS4 c, mapped[3];
 
     c = PyUnicode_READ(kind, data, 0);
-    n_res = PyUnicode_ToTitle(c, mapped);
+    n_res = PyUnicode_ToTitle(c, mapped, 3);
     for (j = 0; j < n_res; j++) {
         *maxchar = Py_MAX(*maxchar, mapped[j]);
         res[k++] = mapped[j];
     }
     for (i = 1; i < length; i++) {
         c = PyUnicode_READ(kind, data, i);
-        n_res = lower_ucs4(kind, data, length, i, c, mapped);
+        n_res = lower_ucs4(kind, data, length, i, c, mapped, 3);
         for (j = 0; j < n_res; j++) {
             *maxchar = Py_MAX(*maxchar, mapped[j]);
             res[k++] = mapped[j];
@@ -10038,10 +10038,10 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4
         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
         int n_res, j;
         if (Py_UNICODE_ISUPPER(c)) {
-            n_res = lower_ucs4(kind, data, length, i, c, mapped);
+            n_res = lower_ucs4(kind, data, length, i, c, mapped, 3);
         }
         else if (Py_UNICODE_ISLOWER(c)) {
-            n_res = PyUnicode_ToUpper(c, mapped);
+            n_res = PyUnicode_ToUpper(c, mapped, 3);
         }
         else {
             n_res = 1;
@@ -10065,9 +10065,9 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
         int n_res, j;
         if (lower)
-            n_res = lower_ucs4(kind, data, length, i, c, mapped);
+            n_res = lower_ucs4(kind, data, length, i, c, mapped, 3);
         else
-            n_res = PyUnicode_ToUpper(c, mapped);
+            n_res = PyUnicode_ToUpper(c, mapped, 3);
         for (j = 0; j < n_res; j++) {
             *maxchar = Py_MAX(*maxchar, mapped[j]);
             res[k++] = mapped[j];
@@ -10096,7 +10096,7 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4
     for (i = 0; i < length; i++) {
         Py_UCS4 c = PyUnicode_READ(kind, data, i);
         Py_UCS4 mapped[3];
-        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
+        int j, n_res = PyUnicode_ToFolded(c, mapped, 3);
         for (j = 0; j < n_res; j++) {
             *maxchar = Py_MAX(*maxchar, mapped[j]);
             res[k++] = mapped[j];
@@ -10118,9 +10118,9 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m
         int n_res, j;
 
         if (previous_is_cased)
-            n_res = lower_ucs4(kind, data, length, i, c, mapped);
+            n_res = lower_ucs4(kind, data, length, i, c, mapped, 3);
         else
-            n_res = PyUnicode_ToTitle(c, mapped);
+            n_res = PyUnicode_ToTitle(c, mapped, 3);
 
         for (j = 0; j < n_res; j++) {
             *maxchar = Py_MAX(*maxchar, mapped[j]);

Original file line number	Diff line number	Diff line change
`@@ -198,7 +198,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)`
`198`	`198`	`return ch + ctype->lower;`
`199`	`199`	`}`
`200`	`200`
`201`		`-int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res)`
	`201`	`+int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res, int size)`
`202`	`202`	`{`
`203`	`203`	`const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);`
`204`	`204`
`@@ -208,19 +208,27 @@ int PyUnicode_ToLower(Py_UCS4 ch, Py_UCS4 *res)`
`208`	`208`	`int i;`
`209`	`209`	`for (i = 0; i < n; i++) {`
`210`	`210`	`if (res != NULL) {`
	`211`	`+ if (i >= size) {`
	`212`	`+ PyErr_SetString(PyExc_OverflowError, "output buffer is too small");`
	`213`	`+ return -1;`
	`214`	`+ }`
`211`	`215`	`res[i] = _PyUnicode_ExtendedCase[index + i];`
`212`	`216`	`}`
`213`	`217`	`}`
`214`	`218`	`return n;`
`215`	`219`	`}`
`216`	`220`
`217`	`221`	`if (res != NULL) {`
	`222`	`+ if (0 >= size) {`
	`223`	`+ PyErr_SetString(PyExc_OverflowError, "output buffer is too small");`
	`224`	`+ return -1;`
	`225`	`+ }`
`218`	`226`	`res[0] = ch + ctype->lower;`
`219`	`227`	`}`
`220`	`228`	`return 1;`
`221`	`229`	`}`
`222`	`230`
`223`		`-int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res)`
	`231`	`+int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res, int size)`
`224`	`232`	`{`
`225`	`233`	`const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);`
`226`	`234`
`@@ -230,18 +238,26 @@ int PyUnicode_ToTitle(Py_UCS4 ch, Py_UCS4 *res)`
`230`	`238`	`int i;`
`231`	`239`	`for (i = 0; i < n; i++) {`
`232`	`240`	`if (res != NULL) {`
	`241`	`+ if (i >= size) {`
	`242`	`+ PyErr_SetString(PyExc_OverflowError, "output buffer is too small");`
	`243`	`+ return -1;`
	`244`	`+ }`
`233`	`245`	`res[i] = _PyUnicode_ExtendedCase[index + i];`
`234`	`246`	`}`
`235`	`247`	`}`
`236`	`248`	`return n;`
`237`	`249`	`}`
`238`	`250`	`if (res != NULL) {`
	`251`	`+ if (0 >= size) {`
	`252`	`+ PyErr_SetString(PyExc_OverflowError, "output buffer is too small");`
	`253`	`+ return -1;`
	`254`	`+ }`
`239`	`255`	`res[0] = ch + ctype->title;`
`240`	`256`	`}`
`241`	`257`	`return 1;`
`242`	`258`	`}`
`243`	`259`
`244`		`-int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res)`
	`260`	`+int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res, int size)`
`245`	`261`	`{`
`246`	`262`	`const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);`
`247`	`263`
`@@ -251,30 +267,45 @@ int PyUnicode_ToUpper(Py_UCS4 ch, Py_UCS4 *res)`
`251`	`267`	`int i;`
`252`	`268`	`for (i = 0; i < n; i++) {`
`253`	`269`	`if (res != NULL) {`
	`270`	`+ if (i >= size) {`
	`271`	`+ PyErr_SetString(PyExc_OverflowError, "output buffer is too small");`
	`272`	`+ return -1;`
	`273`	`+ }`
`254`	`274`	`res[i] = _PyUnicode_ExtendedCase[index + i];`
`255`	`275`	`}`
`256`	`276`	`}`
`257`	`277`	`return n;`
`258`	`278`	`}`
`259`	`279`	`if (res != NULL) {`
	`280`	`+ if (0 >= size) {`
	`281`	`+ PyErr_SetString(PyExc_OverflowError, "output buffer is too small");`
	`282`	`+ return -1;`
	`283`	`+ }`
`260`	`284`	`res[0] = ch + ctype->upper;`
`261`	`285`	`}`
`262`	`286`	`return 1;`
`263`	`287`	`}`
`264`	`288`
`265`		`-int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)`
	`289`	`+int PyUnicode_ToFolded(Py_UCS4 ch, Py_UCS4 *res, int size)`
`266`	`290`	`{`
`267`	`291`	`const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);`
`268`	`292`
`269`	`293`	`if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {`
`270`	`294`	`int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);`
`271`	`295`	`int n = (ctype->lower >> 20) & 7;`
`272`	`296`	`int i;`
`273`		`- for (i = 0; i < n; i++)`
`274`		`- res[i] = _PyUnicode_ExtendedCase[index + i];`
	`297`	`+ for (i = 0; i < n; i++) {`
	`298`	`+ if (res != NULL) {`
	`299`	`+ if (i >= size) {`
	`300`	`+ PyErr_SetString(PyExc_OverflowError, "output buffer is too small");`
	`301`	`+ return -1;`
	`302`	`+ }`
	`303`	`+ res[i] = _PyUnicode_ExtendedCase[index + i];`
	`304`	`+ }`
	`305`	`+ }`
`275`	`306`	`return n;`
`276`	`307`	`}`
`277`		`- return PyUnicode_ToLowerFull(ch, res);`
	`308`	`+ return PyUnicode_ToLower(ch, res, size);`
`278`	`309`	`}`
`279`	`310`
`280`	`311`	`int _PyUnicode_IsCased(Py_UCS4 ch)`