@@ -6089,61 +6089,73 @@ _PyUnicode_EncodeUTF32(PyObject *str,
6089
6089
const char * errors ,
6090
6090
int byteorder )
6091
6091
{
6092
- int kind ;
6093
- const void * data ;
6094
- Py_ssize_t len ;
6095
- PyObject * v ;
6096
- uint32_t * out ;
6092
+ if (!PyUnicode_Check (str )) {
6093
+ PyErr_BadArgument ();
6094
+ return NULL ;
6095
+ }
6096
+ int kind = PyUnicode_KIND (str );
6097
+ const void * data = PyUnicode_DATA (str );
6098
+ Py_ssize_t len = PyUnicode_GET_LENGTH (str );
6099
+
6100
+ if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0 ))
6101
+ return PyErr_NoMemory ();
6102
+ Py_ssize_t nsize = len + (byteorder == 0 );
6103
+
6097
6104
#if PY_LITTLE_ENDIAN
6098
6105
int native_ordering = byteorder <= 0 ;
6099
6106
#else
6100
6107
int native_ordering = byteorder >= 0 ;
6101
6108
#endif
6102
- const char * encoding ;
6103
- Py_ssize_t nsize , pos ;
6104
- PyObject * errorHandler = NULL ;
6105
- PyObject * exc = NULL ;
6106
- PyObject * rep = NULL ;
6107
6109
6108
- if (!PyUnicode_Check (str )) {
6109
- PyErr_BadArgument ();
6110
- return NULL ;
6110
+ if (kind == PyUnicode_1BYTE_KIND ) {
6111
+ // gh-139156: Don't use PyBytesWriter API here since it has an overhead
6112
+ // on short strings
6113
+ PyObject * v = PyBytes_FromStringAndSize (NULL , nsize * 4 );
6114
+ if (v == NULL ) {
6115
+ return NULL ;
6116
+ }
6117
+
6118
+ /* output buffer is 4-bytes aligned */
6119
+ assert (_Py_IS_ALIGNED (PyBytes_AS_STRING (v ), 4 ));
6120
+ uint32_t * out = (uint32_t * )PyBytes_AS_STRING (v );
6121
+ if (byteorder == 0 ) {
6122
+ * out ++ = 0xFEFF ;
6123
+ }
6124
+ if (len > 0 ) {
6125
+ ucs1lib_utf32_encode ((const Py_UCS1 * )data , len ,
6126
+ & out , native_ordering );
6127
+ }
6128
+ return v ;
6111
6129
}
6112
- kind = PyUnicode_KIND (str );
6113
- data = PyUnicode_DATA (str );
6114
- len = PyUnicode_GET_LENGTH (str );
6115
6130
6116
- if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0 ))
6117
- return PyErr_NoMemory ();
6118
- nsize = len + (byteorder == 0 );
6119
- v = PyBytes_FromStringAndSize (NULL , nsize * 4 );
6120
- if (v == NULL )
6131
+ PyBytesWriter * writer = PyBytesWriter_Create (nsize * 4 );
6132
+ if (writer == NULL ) {
6121
6133
return NULL ;
6134
+ }
6122
6135
6123
6136
/* output buffer is 4-bytes aligned */
6124
- assert (_Py_IS_ALIGNED (PyBytes_AS_STRING ( v ), 4 ));
6125
- out = (uint32_t * )PyBytes_AS_STRING ( v );
6126
- if (byteorder == 0 )
6137
+ assert (_Py_IS_ALIGNED (PyBytesWriter_GetData ( writer ), 4 ));
6138
+ uint32_t * out = (uint32_t * )PyBytesWriter_GetData ( writer );
6139
+ if (byteorder == 0 ) {
6127
6140
* out ++ = 0xFEFF ;
6128
- if (len == 0 )
6129
- goto done ;
6141
+ }
6142
+ if (len == 0 ) {
6143
+ return PyBytesWriter_Finish (writer );
6144
+ }
6130
6145
6146
+ const char * encoding ;
6131
6147
if (byteorder == -1 )
6132
6148
encoding = "utf-32-le" ;
6133
6149
else if (byteorder == 1 )
6134
6150
encoding = "utf-32-be" ;
6135
6151
else
6136
6152
encoding = "utf-32" ;
6137
6153
6138
- if (kind == PyUnicode_1BYTE_KIND ) {
6139
- ucs1lib_utf32_encode ((const Py_UCS1 * )data , len , & out , native_ordering );
6140
- goto done ;
6141
- }
6142
-
6143
- pos = 0 ;
6144
- while (pos < len ) {
6145
- Py_ssize_t newpos , repsize , moreunits ;
6154
+ PyObject * errorHandler = NULL ;
6155
+ PyObject * exc = NULL ;
6156
+ PyObject * rep = NULL ;
6146
6157
6158
+ for (Py_ssize_t pos = 0 ; pos < len ; ) {
6147
6159
if (kind == PyUnicode_2BYTE_KIND ) {
6148
6160
pos += ucs2lib_utf32_encode ((const Py_UCS2 * )data + pos , len - pos ,
6149
6161
& out , native_ordering );
@@ -6156,13 +6168,15 @@ _PyUnicode_EncodeUTF32(PyObject *str,
6156
6168
if (pos == len )
6157
6169
break ;
6158
6170
6171
+ Py_ssize_t newpos ;
6159
6172
rep = unicode_encode_call_errorhandler (
6160
6173
errors , & errorHandler ,
6161
6174
encoding , "surrogates not allowed" ,
6162
6175
str , & exc , pos , pos + 1 , & newpos );
6163
6176
if (!rep )
6164
6177
goto error ;
6165
6178
6179
+ Py_ssize_t repsize , moreunits ;
6166
6180
if (PyBytes_Check (rep )) {
6167
6181
repsize = PyBytes_GET_SIZE (rep );
6168
6182
if (repsize & 3 ) {
@@ -6188,21 +6202,18 @@ _PyUnicode_EncodeUTF32(PyObject *str,
6188
6202
6189
6203
/* four bytes are reserved for each surrogate */
6190
6204
if (moreunits > 0 ) {
6191
- Py_ssize_t outpos = out - (uint32_t * ) PyBytes_AS_STRING (v );
6192
- if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE (v )) / 4 ) {
6193
- /* integer overflow */
6194
- PyErr_NoMemory ();
6205
+ out = PyBytesWriter_GrowAndUpdatePointer (writer , 4 * moreunits , out );
6206
+ if (out == NULL ) {
6195
6207
goto error ;
6196
6208
}
6197
- if (_PyBytes_Resize (& v , PyBytes_GET_SIZE (v ) + 4 * moreunits ) < 0 )
6198
- goto error ;
6199
- out = (uint32_t * ) PyBytes_AS_STRING (v ) + outpos ;
6200
6209
}
6201
6210
6202
6211
if (PyBytes_Check (rep )) {
6203
6212
memcpy (out , PyBytes_AS_STRING (rep ), repsize );
6204
6213
out += repsize / 4 ;
6205
- } else /* rep is unicode */ {
6214
+ }
6215
+ else {
6216
+ /* rep is unicode */
6206
6217
assert (PyUnicode_KIND (rep ) == PyUnicode_1BYTE_KIND );
6207
6218
ucs1lib_utf32_encode (PyUnicode_1BYTE_DATA (rep ), repsize ,
6208
6219
& out , native_ordering );
@@ -6211,21 +6222,19 @@ _PyUnicode_EncodeUTF32(PyObject *str,
6211
6222
Py_CLEAR (rep );
6212
6223
}
6213
6224
6225
+ Py_XDECREF (errorHandler );
6226
+ Py_XDECREF (exc );
6227
+
6214
6228
/* Cut back to size actually needed. This is necessary for, for example,
6215
6229
encoding of a string containing isolated surrogates and the 'ignore'
6216
6230
handler is used. */
6217
- nsize = (unsigned char * ) out - (unsigned char * ) PyBytes_AS_STRING (v );
6218
- if (nsize != PyBytes_GET_SIZE (v ))
6219
- _PyBytes_Resize (& v , nsize );
6220
- Py_XDECREF (errorHandler );
6221
- Py_XDECREF (exc );
6222
- done :
6223
- return v ;
6231
+ return PyBytesWriter_FinishWithPointer (writer , out );
6232
+
6224
6233
error :
6225
6234
Py_XDECREF (rep );
6226
6235
Py_XDECREF (errorHandler );
6227
6236
Py_XDECREF (exc );
6228
- Py_XDECREF ( v );
6237
+ PyBytesWriter_Discard ( writer );
6229
6238
return NULL ;
6230
6239
}
6231
6240
0 commit comments