Skip to content

Commit 92ba2c9

Browse files
authored
gh-139156: Use PyBytesWriter in UTF-32 encoder (#139157)
Replace PyBytes_FromStringAndSize() and _PyBytes_Resize() with the PyBytesWriter API.
1 parent f0d8583 commit 92ba2c9

File tree

1 file changed

+59
-50
lines changed

1 file changed

+59
-50
lines changed

Objects/unicodeobject.c

Lines changed: 59 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -6089,61 +6089,73 @@ _PyUnicode_EncodeUTF32(PyObject *str,
60896089
const char *errors,
60906090
int byteorder)
60916091
{
6092-
int kind;
6093-
const void *data;
6094-
Py_ssize_t len;
6095-
PyObject *v;
6096-
uint32_t *out;
6092+
if (!PyUnicode_Check(str)) {
6093+
PyErr_BadArgument();
6094+
return NULL;
6095+
}
6096+
int kind = PyUnicode_KIND(str);
6097+
const void *data = PyUnicode_DATA(str);
6098+
Py_ssize_t len = PyUnicode_GET_LENGTH(str);
6099+
6100+
if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
6101+
return PyErr_NoMemory();
6102+
Py_ssize_t nsize = len + (byteorder == 0);
6103+
60976104
#if PY_LITTLE_ENDIAN
60986105
int native_ordering = byteorder <= 0;
60996106
#else
61006107
int native_ordering = byteorder >= 0;
61016108
#endif
6102-
const char *encoding;
6103-
Py_ssize_t nsize, pos;
6104-
PyObject *errorHandler = NULL;
6105-
PyObject *exc = NULL;
6106-
PyObject *rep = NULL;
61076109

6108-
if (!PyUnicode_Check(str)) {
6109-
PyErr_BadArgument();
6110-
return NULL;
6110+
if (kind == PyUnicode_1BYTE_KIND) {
6111+
// gh-139156: Don't use PyBytesWriter API here since it has an overhead
6112+
// on short strings
6113+
PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 4);
6114+
if (v == NULL) {
6115+
return NULL;
6116+
}
6117+
6118+
/* output buffer is 4-byte aligned */
6119+
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
6120+
uint32_t *out = (uint32_t *)PyBytes_AS_STRING(v);
6121+
if (byteorder == 0) {
6122+
*out++ = 0xFEFF;
6123+
}
6124+
if (len > 0) {
6125+
ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
6126+
&out, native_ordering);
6127+
}
6128+
return v;
61116129
}
6112-
kind = PyUnicode_KIND(str);
6113-
data = PyUnicode_DATA(str);
6114-
len = PyUnicode_GET_LENGTH(str);
61156130

6116-
if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
6117-
return PyErr_NoMemory();
6118-
nsize = len + (byteorder == 0);
6119-
v = PyBytes_FromStringAndSize(NULL, nsize * 4);
6120-
if (v == NULL)
6131+
PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
6132+
if (writer == NULL) {
61216133
return NULL;
6134+
}
61226135

61236136
/* output buffer is 4-byte aligned */
6124-
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
6125-
out = (uint32_t *)PyBytes_AS_STRING(v);
6126-
if (byteorder == 0)
6137+
assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
6138+
uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
6139+
if (byteorder == 0) {
61276140
*out++ = 0xFEFF;
6128-
if (len == 0)
6129-
goto done;
6141+
}
6142+
if (len == 0) {
6143+
return PyBytesWriter_Finish(writer);
6144+
}
61306145

6146+
const char *encoding;
61316147
if (byteorder == -1)
61326148
encoding = "utf-32-le";
61336149
else if (byteorder == 1)
61346150
encoding = "utf-32-be";
61356151
else
61366152
encoding = "utf-32";
61376153

6138-
if (kind == PyUnicode_1BYTE_KIND) {
6139-
ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6140-
goto done;
6141-
}
6142-
6143-
pos = 0;
6144-
while (pos < len) {
6145-
Py_ssize_t newpos, repsize, moreunits;
6154+
PyObject *errorHandler = NULL;
6155+
PyObject *exc = NULL;
6156+
PyObject *rep = NULL;
61466157

6158+
for (Py_ssize_t pos = 0; pos < len; ) {
61476159
if (kind == PyUnicode_2BYTE_KIND) {
61486160
pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
61496161
&out, native_ordering);
@@ -6156,13 +6168,15 @@ _PyUnicode_EncodeUTF32(PyObject *str,
61566168
if (pos == len)
61576169
break;
61586170

6171+
Py_ssize_t newpos;
61596172
rep = unicode_encode_call_errorhandler(
61606173
errors, &errorHandler,
61616174
encoding, "surrogates not allowed",
61626175
str, &exc, pos, pos + 1, &newpos);
61636176
if (!rep)
61646177
goto error;
61656178

6179+
Py_ssize_t repsize, moreunits;
61666180
if (PyBytes_Check(rep)) {
61676181
repsize = PyBytes_GET_SIZE(rep);
61686182
if (repsize & 3) {
@@ -6188,21 +6202,18 @@ _PyUnicode_EncodeUTF32(PyObject *str,
61886202

61896203
/* four bytes are reserved for each surrogate */
61906204
if (moreunits > 0) {
6191-
Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
6192-
if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
6193-
/* integer overflow */
6194-
PyErr_NoMemory();
6205+
out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
6206+
if (out == NULL) {
61956207
goto error;
61966208
}
6197-
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
6198-
goto error;
6199-
out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
62006209
}
62016210

62026211
if (PyBytes_Check(rep)) {
62036212
memcpy(out, PyBytes_AS_STRING(rep), repsize);
62046213
out += repsize / 4;
6205-
} else /* rep is unicode */ {
6214+
}
6215+
else {
6216+
/* rep is unicode */
62066217
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
62076218
ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
62086219
&out, native_ordering);
@@ -6211,21 +6222,19 @@ _PyUnicode_EncodeUTF32(PyObject *str,
62116222
Py_CLEAR(rep);
62126223
}
62136224

6225+
Py_XDECREF(errorHandler);
6226+
Py_XDECREF(exc);
6227+
62146228
/* Cut back to the size actually needed. This is necessary, for example,
62156229
when encoding a string containing isolated surrogates and the 'ignore'
62166230
handler is used. */
6217-
nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6218-
if (nsize != PyBytes_GET_SIZE(v))
6219-
_PyBytes_Resize(&v, nsize);
6220-
Py_XDECREF(errorHandler);
6221-
Py_XDECREF(exc);
6222-
done:
6223-
return v;
6231+
return PyBytesWriter_FinishWithPointer(writer, out);
6232+
62246233
error:
62256234
Py_XDECREF(rep);
62266235
Py_XDECREF(errorHandler);
62276236
Py_XDECREF(exc);
6228-
Py_XDECREF(v);
6237+
PyBytesWriter_Discard(writer);
62296238
return NULL;
62306239
}
62316240

0 commit comments

Comments
 (0)