Skip to content

Commit c497694

Browse files
authored
gh-139156: Use PyBytesWriter in UTF-16 encoder (#139233)
Replace PyBytes_FromStringAndSize() and _PyBytes_Resize() with the PyBytesWriter API.
1 parent e578a9e commit c497694

File tree

1 file changed

+52
-52
lines changed

1 file changed

+52
-52
lines changed

Objects/unicodeobject.c

Lines changed: 52 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -6407,32 +6407,15 @@ _PyUnicode_EncodeUTF16(PyObject *str,
64076407
const char *errors,
64086408
int byteorder)
64096409
{
6410-
int kind;
6411-
const void *data;
6412-
Py_ssize_t len;
6413-
PyObject *v;
6414-
unsigned short *out;
6415-
Py_ssize_t pairs;
6416-
#if PY_BIG_ENDIAN
6417-
int native_ordering = byteorder >= 0;
6418-
#else
6419-
int native_ordering = byteorder <= 0;
6420-
#endif
6421-
const char *encoding;
6422-
Py_ssize_t nsize, pos;
6423-
PyObject *errorHandler = NULL;
6424-
PyObject *exc = NULL;
6425-
PyObject *rep = NULL;
6426-
64276410
if (!PyUnicode_Check(str)) {
64286411
PyErr_BadArgument();
64296412
return NULL;
64306413
}
6431-
kind = PyUnicode_KIND(str);
6432-
data = PyUnicode_DATA(str);
6433-
len = PyUnicode_GET_LENGTH(str);
6414+
int kind = PyUnicode_KIND(str);
6415+
const void *data = PyUnicode_DATA(str);
6416+
Py_ssize_t len = PyUnicode_GET_LENGTH(str);
64346417

6435-
pairs = 0;
6418+
Py_ssize_t pairs = 0;
64366419
if (kind == PyUnicode_4BYTE_KIND) {
64376420
const Py_UCS4 *in = (const Py_UCS4 *)data;
64386421
const Py_UCS4 *end = in + len;
@@ -6445,27 +6428,48 @@ _PyUnicode_EncodeUTF16(PyObject *str,
64456428
if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
64466429
return PyErr_NoMemory();
64476430
}
6448-
nsize = len + pairs + (byteorder == 0);
6449-
v = PyBytes_FromStringAndSize(NULL, nsize * 2);
6450-
if (v == NULL) {
6431+
Py_ssize_t nsize = len + pairs + (byteorder == 0);
6432+
6433+
#if PY_BIG_ENDIAN
6434+
int native_ordering = byteorder >= 0;
6435+
#else
6436+
int native_ordering = byteorder <= 0;
6437+
#endif
6438+
6439+
if (kind == PyUnicode_1BYTE_KIND) {
6440+
PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 2);
6441+
if (v == NULL) {
6442+
return NULL;
6443+
}
6444+
6445+
/* output buffer is 2-bytes aligned */
6446+
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
6447+
unsigned short *out = (unsigned short *)PyBytes_AS_STRING(v);
6448+
if (byteorder == 0) {
6449+
*out++ = 0xFEFF;
6450+
}
6451+
if (len > 0) {
6452+
ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6453+
}
6454+
return v;
6455+
}
6456+
6457+
PyBytesWriter *writer = PyBytesWriter_Create(nsize * 2);
6458+
if (writer == NULL) {
64516459
return NULL;
64526460
}
64536461

64546462
/* output buffer is 2-bytes aligned */
6455-
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
6456-
out = (unsigned short *)PyBytes_AS_STRING(v);
6463+
assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
6464+
unsigned short *out = PyBytesWriter_GetData(writer);
64576465
if (byteorder == 0) {
64586466
*out++ = 0xFEFF;
64596467
}
64606468
if (len == 0) {
6461-
goto done;
6462-
}
6463-
6464-
if (kind == PyUnicode_1BYTE_KIND) {
6465-
ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6466-
goto done;
6469+
return PyBytesWriter_Finish(writer);
64676470
}
64686471

6472+
const char *encoding;
64696473
if (byteorder < 0) {
64706474
encoding = "utf-16-le";
64716475
}
@@ -6476,10 +6480,11 @@ _PyUnicode_EncodeUTF16(PyObject *str,
64766480
encoding = "utf-16";
64776481
}
64786482

6479-
pos = 0;
6480-
while (pos < len) {
6481-
Py_ssize_t newpos, repsize, moreunits;
6483+
PyObject *errorHandler = NULL;
6484+
PyObject *exc = NULL;
6485+
PyObject *rep = NULL;
64826486

6487+
for (Py_ssize_t pos = 0; pos < len; ) {
64836488
if (kind == PyUnicode_2BYTE_KIND) {
64846489
pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
64856490
&out, native_ordering);
@@ -6492,13 +6497,15 @@ _PyUnicode_EncodeUTF16(PyObject *str,
64926497
if (pos == len)
64936498
break;
64946499

6500+
Py_ssize_t newpos;
64956501
rep = unicode_encode_call_errorhandler(
64966502
errors, &errorHandler,
64976503
encoding, "surrogates not allowed",
64986504
str, &exc, pos, pos + 1, &newpos);
64996505
if (!rep)
65006506
goto error;
65016507

6508+
Py_ssize_t repsize, moreunits;
65026509
if (PyBytes_Check(rep)) {
65036510
repsize = PyBytes_GET_SIZE(rep);
65046511
if (repsize & 1) {
@@ -6524,21 +6531,17 @@ _PyUnicode_EncodeUTF16(PyObject *str,
65246531

65256532
/* two bytes are reserved for each surrogate */
65266533
if (moreunits > 0) {
6527-
Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
6528-
if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
6529-
/* integer overflow */
6530-
PyErr_NoMemory();
6534+
out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits, out);
6535+
if (out == NULL) {
65316536
goto error;
65326537
}
6533-
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
6534-
goto error;
6535-
out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
65366538
}
65376539

65386540
if (PyBytes_Check(rep)) {
65396541
memcpy(out, PyBytes_AS_STRING(rep), repsize);
65406542
out += repsize / 2;
6541-
} else /* rep is unicode */ {
6543+
} else {
6544+
/* rep is unicode */
65426545
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
65436546
ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
65446547
&out, native_ordering);
@@ -6547,23 +6550,20 @@ _PyUnicode_EncodeUTF16(PyObject *str,
65476550
Py_CLEAR(rep);
65486551
}
65496552

6553+
Py_XDECREF(errorHandler);
6554+
Py_XDECREF(exc);
6555+
65506556
/* Cut back to size actually needed. This is necessary for, for example,
65516557
encoding of a string containing isolated surrogates and the 'ignore' handler
65526558
is used. */
6553-
nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6554-
if (nsize != PyBytes_GET_SIZE(v))
6555-
_PyBytes_Resize(&v, nsize);
6556-
Py_XDECREF(errorHandler);
6557-
Py_XDECREF(exc);
6558-
done:
6559-
return v;
6559+
return PyBytesWriter_FinishWithPointer(writer, out);
6560+
65606561
error:
65616562
Py_XDECREF(rep);
65626563
Py_XDECREF(errorHandler);
65636564
Py_XDECREF(exc);
6564-
Py_XDECREF(v);
6565+
PyBytesWriter_Discard(writer);
65656566
return NULL;
6566-
#undef STORECHAR
65676567
}
65686568

65696569
PyObject *

0 commit comments

Comments
 (0)