@@ -6089,61 +6089,73 @@ _PyUnicode_EncodeUTF32(PyObject *str,
60896089 const char * errors ,
60906090 int byteorder )
60916091{
6092- int kind ;
6093- const void * data ;
6094- Py_ssize_t len ;
6095- PyObject * v ;
6096- uint32_t * out ;
6092+ if (!PyUnicode_Check (str )) {
6093+ PyErr_BadArgument ();
6094+ return NULL ;
6095+ }
6096+ int kind = PyUnicode_KIND (str );
6097+ const void * data = PyUnicode_DATA (str );
6098+ Py_ssize_t len = PyUnicode_GET_LENGTH (str );
6099+
6100+ if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0 ))
6101+ return PyErr_NoMemory ();
6102+ Py_ssize_t nsize = len + (byteorder == 0 );
6103+
60976104#if PY_LITTLE_ENDIAN
60986105 int native_ordering = byteorder <= 0 ;
60996106#else
61006107 int native_ordering = byteorder >= 0 ;
61016108#endif
6102- const char * encoding ;
6103- Py_ssize_t nsize , pos ;
6104- PyObject * errorHandler = NULL ;
6105- PyObject * exc = NULL ;
6106- PyObject * rep = NULL ;
61076109
6108- if (!PyUnicode_Check (str )) {
6109- PyErr_BadArgument ();
6110- return NULL ;
6110+ if (kind == PyUnicode_1BYTE_KIND ) {
6111+ // gh-139156: Don't use PyBytesWriter API here since it has an overhead
6112+ // on short strings
6113+ PyObject * v = PyBytes_FromStringAndSize (NULL , nsize * 4 );
6114+ if (v == NULL ) {
6115+ return NULL ;
6116+ }
6117+
6118+ /* output buffer is 4-bytes aligned */
6119+ assert (_Py_IS_ALIGNED (PyBytes_AS_STRING (v ), 4 ));
6120+ uint32_t * out = (uint32_t * )PyBytes_AS_STRING (v );
6121+ if (byteorder == 0 ) {
6122+ * out ++ = 0xFEFF ;
6123+ }
6124+ if (len > 0 ) {
6125+ ucs1lib_utf32_encode ((const Py_UCS1 * )data , len ,
6126+ & out , native_ordering );
6127+ }
6128+ return v ;
61116129 }
6112- kind = PyUnicode_KIND (str );
6113- data = PyUnicode_DATA (str );
6114- len = PyUnicode_GET_LENGTH (str );
61156130
6116- if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0 ))
6117- return PyErr_NoMemory ();
6118- nsize = len + (byteorder == 0 );
6119- v = PyBytes_FromStringAndSize (NULL , nsize * 4 );
6120- if (v == NULL )
6131+ PyBytesWriter * writer = PyBytesWriter_Create (nsize * 4 );
6132+ if (writer == NULL ) {
61216133 return NULL ;
6134+ }
61226135
61236136 /* output buffer is 4-bytes aligned */
6124- assert (_Py_IS_ALIGNED (PyBytes_AS_STRING ( v ), 4 ));
6125- out = (uint32_t * )PyBytes_AS_STRING ( v );
6126- if (byteorder == 0 )
6137+ assert (_Py_IS_ALIGNED (PyBytesWriter_GetData ( writer ), 4 ));
6138+ uint32_t * out = (uint32_t * )PyBytesWriter_GetData ( writer );
6139+ if (byteorder == 0 ) {
61276140 * out ++ = 0xFEFF ;
6128- if (len == 0 )
6129- goto done ;
6141+ }
6142+ if (len == 0 ) {
6143+ return PyBytesWriter_Finish (writer );
6144+ }
61306145
6146+ const char * encoding ;
61316147 if (byteorder == -1 )
61326148 encoding = "utf-32-le" ;
61336149 else if (byteorder == 1 )
61346150 encoding = "utf-32-be" ;
61356151 else
61366152 encoding = "utf-32" ;
61376153
6138- if (kind == PyUnicode_1BYTE_KIND ) {
6139- ucs1lib_utf32_encode ((const Py_UCS1 * )data , len , & out , native_ordering );
6140- goto done ;
6141- }
6142-
6143- pos = 0 ;
6144- while (pos < len ) {
6145- Py_ssize_t newpos , repsize , moreunits ;
6154+ PyObject * errorHandler = NULL ;
6155+ PyObject * exc = NULL ;
6156+ PyObject * rep = NULL ;
61466157
6158+ for (Py_ssize_t pos = 0 ; pos < len ; ) {
61476159 if (kind == PyUnicode_2BYTE_KIND ) {
61486160 pos += ucs2lib_utf32_encode ((const Py_UCS2 * )data + pos , len - pos ,
61496161 & out , native_ordering );
@@ -6156,13 +6168,15 @@ _PyUnicode_EncodeUTF32(PyObject *str,
61566168 if (pos == len )
61576169 break ;
61586170
6171+ Py_ssize_t newpos ;
61596172 rep = unicode_encode_call_errorhandler (
61606173 errors , & errorHandler ,
61616174 encoding , "surrogates not allowed" ,
61626175 str , & exc , pos , pos + 1 , & newpos );
61636176 if (!rep )
61646177 goto error ;
61656178
6179+ Py_ssize_t repsize , moreunits ;
61666180 if (PyBytes_Check (rep )) {
61676181 repsize = PyBytes_GET_SIZE (rep );
61686182 if (repsize & 3 ) {
@@ -6188,21 +6202,18 @@ _PyUnicode_EncodeUTF32(PyObject *str,
61886202
61896203 /* four bytes are reserved for each surrogate */
61906204 if (moreunits > 0 ) {
6191- Py_ssize_t outpos = out - (uint32_t * ) PyBytes_AS_STRING (v );
6192- if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE (v )) / 4 ) {
6193- /* integer overflow */
6194- PyErr_NoMemory ();
6205+ out = PyBytesWriter_GrowAndUpdatePointer (writer , 4 * moreunits , out );
6206+ if (out == NULL ) {
61956207 goto error ;
61966208 }
6197- if (_PyBytes_Resize (& v , PyBytes_GET_SIZE (v ) + 4 * moreunits ) < 0 )
6198- goto error ;
6199- out = (uint32_t * ) PyBytes_AS_STRING (v ) + outpos ;
62006209 }
62016210
62026211 if (PyBytes_Check (rep )) {
62036212 memcpy (out , PyBytes_AS_STRING (rep ), repsize );
62046213 out += repsize / 4 ;
6205- } else /* rep is unicode */ {
6214+ }
6215+ else {
6216+ /* rep is unicode */
62066217 assert (PyUnicode_KIND (rep ) == PyUnicode_1BYTE_KIND );
62076218 ucs1lib_utf32_encode (PyUnicode_1BYTE_DATA (rep ), repsize ,
62086219 & out , native_ordering );
@@ -6211,21 +6222,19 @@ _PyUnicode_EncodeUTF32(PyObject *str,
62116222 Py_CLEAR (rep );
62126223 }
62136224
6225+ Py_XDECREF (errorHandler );
6226+ Py_XDECREF (exc );
6227+
62146228 /* Cut back to the size actually needed. This is necessary, for example,
62156229 when encoding a string containing isolated surrogates with the
62166230 'ignore' error handler. */
6217- nsize = (unsigned char * ) out - (unsigned char * ) PyBytes_AS_STRING (v );
6218- if (nsize != PyBytes_GET_SIZE (v ))
6219- _PyBytes_Resize (& v , nsize );
6220- Py_XDECREF (errorHandler );
6221- Py_XDECREF (exc );
6222- done :
6223- return v ;
6231+ return PyBytesWriter_FinishWithPointer (writer , out );
6232+
62246233 error :
62256234 Py_XDECREF (rep );
62266235 Py_XDECREF (errorHandler );
62276236 Py_XDECREF (exc );
6228- Py_XDECREF ( v );
6237+ PyBytesWriter_Discard ( writer );
62296238 return NULL ;
62306239}
62316240
0 commit comments