Skip to content

Commit d85f2ea

Browse files
committed
gh-129005: Move bytearray to use bytes as a buffer
1. add `tp_new` to guarantee `ob_bytes_head` is always set, often to the empty bytes singleton. 2. `ob_alloc` information is now redundant, added assertion to validate that, would it make sense to deprecate? 3. There's a lot of `bytearray` code very similar to `bytes` code, more could likely be just proxied to the `bytes` now. Here just focusing on the swap as that enables optimizations. 4. If `bytearray` is passed a single-reference bytes it could potentially take "ownership" of it without copying the bytes, for now not implemented. This enables adding `bytearray._detach()` which I plan to do in a separate PR. ```bash # `_io` large file read test ./python -m test -M8g -uall test_largefile -m test.test_largefile.CLargeFileTest.test_large_read # `_pyio` large file read test ./python -m test -M8g -uall test_largefile -m test.test_largefile.PyLargeFileTest.test_large_read ``` On my machine (AMD 64 bit Linux, Optimized build): `_io` takes: ~0.791s and uses ~2GB of RAM `_pyio` current: ~1.073s and uses ~4GB of RAM `_pyio` w/ bytearray._detach: ~0.887s and uses ~2GB of RAM Perf checking no major swings in an optimized build: `./python -E -bb -Wd -m test -uall -M32G test_bytes test_capi.test_bytearray -vvv` before: ~1.4s after: ~1.5s Previous discussion: https://discuss.python.org/t/add-zero-copy-conversion-of-bytearray-to-bytes-by-providing-bytes/79164
1 parent 0ef4ffe commit d85f2ea

File tree

4 files changed

+65
-40
lines changed

4 files changed

+65
-40
lines changed

Include/cpython/bytearrayobject.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ typedef struct {
99
char *ob_bytes; /* Physical backing buffer */
1010
char *ob_start; /* Logical start inside ob_bytes */
1111
Py_ssize_t ob_exports; /* How many buffer exports */
12+
PyObject *ob_bytes_head; /* bytes enabling zero-copy detach. */
1213
} PyByteArrayObject;
1314

1415
PyAPI_DATA(char) _PyByteArray_empty_string[];

Lib/test/test_sys.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1547,7 +1547,7 @@ def test_objecttypes(self):
15471547
samples = [b'', b'u'*100000]
15481548
for sample in samples:
15491549
x = bytearray(sample)
1550-
check(x, vsize('n2Pi') + x.__alloc__())
1550+
check(x, vsize('n2PiP') + x.__alloc__())
15511551
# bytearray_iterator
15521552
check(iter(bytearray()), size('nP'))
15531553
# bytes
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Move :class:`bytearray` to use a buffer convertable to :class:`bytes` without copying.

Objects/bytearrayobject.c

Lines changed: 62 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,20 @@ class bytearray "PyByteArrayObject *" "&PyByteArray_Type"
2121
char _PyByteArray_empty_string[] = "";
2222

2323
/* Helpers */
24+
static void
25+
bytearray_set_bytes(PyByteArrayObject *self, PyObject *bytes, Py_ssize_t size)
26+
{
27+
/* Always point to a valid bytes object */
28+
assert(bytes);
29+
assert(PyBytes_CheckExact(bytes));
30+
self->ob_bytes_head = bytes;
31+
self->ob_bytes = self->ob_start = PyBytes_AS_STRING(bytes);
32+
33+
Py_ssize_t alloc = PyBytes_GET_SIZE(bytes);
34+
alloc = (alloc == 0 ? alloc : alloc + 1);
35+
FT_ATOMIC_STORE_SSIZE_RELAXED(self->ob_alloc, alloc);
36+
Py_SET_SIZE(self, size);
37+
}
2438

2539
static int
2640
_getbytevalue(PyObject* arg, int *value)
@@ -127,7 +141,6 @@ PyObject *
127141
PyByteArray_FromStringAndSize(const char *bytes, Py_ssize_t size)
128142
{
129143
PyByteArrayObject *new;
130-
Py_ssize_t alloc;
131144

132145
if (size < 0) {
133146
PyErr_SetString(PyExc_SystemError,
@@ -141,27 +154,17 @@ PyByteArray_FromStringAndSize(const char *bytes, Py_ssize_t size)
141154
}
142155

143156
new = PyObject_New(PyByteArrayObject, &PyByteArray_Type);
144-
if (new == NULL)
157+
if (new == NULL) {
145158
return NULL;
146-
147-
if (size == 0) {
148-
new->ob_bytes = NULL;
149-
alloc = 0;
150159
}
151-
else {
152-
alloc = size + 1;
153-
new->ob_bytes = PyMem_Malloc(alloc);
154-
if (new->ob_bytes == NULL) {
155-
Py_DECREF(new);
156-
return PyErr_NoMemory();
157-
}
158-
if (bytes != NULL && size > 0)
159-
memcpy(new->ob_bytes, bytes, size);
160-
new->ob_bytes[size] = '\0'; /* Trailing null byte */
160+
161+
PyObject *buffer = PyBytes_FromStringAndSize(bytes, size);
162+
if (buffer == NULL) {
163+
Py_DECREF(new);
164+
return NULL;
161165
}
162-
Py_SET_SIZE(new, size);
163-
new->ob_alloc = alloc;
164-
new->ob_start = new->ob_bytes;
166+
167+
bytearray_set_bytes(new, buffer, size);
165168
new->ob_exports = 0;
166169

167170
return (PyObject *)new;
@@ -189,7 +192,6 @@ static int
189192
bytearray_resize_lock_held(PyObject *self, Py_ssize_t requested_size)
190193
{
191194
_Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(self);
192-
void *sval;
193195
PyByteArrayObject *obj = ((PyByteArrayObject *)self);
194196
/* All computations are done unsigned to avoid integer overflows
195197
(see issue #22335). */
@@ -245,26 +247,27 @@ bytearray_resize_lock_held(PyObject *self, Py_ssize_t requested_size)
245247
}
246248

247249
if (logical_offset > 0) {
248-
sval = PyMem_Malloc(alloc);
249-
if (sval == NULL) {
250-
PyErr_NoMemory();
250+
/* Make a new non-offset buffer */
251+
PyObject *new = PyBytes_FromStringAndSize(NULL, alloc - 1);
252+
if (new == NULL) {
251253
return -1;
252254
}
253-
memcpy(sval, PyByteArray_AS_STRING(self),
255+
memcpy(PyBytes_AS_STRING(new), PyByteArray_AS_STRING(self),
254256
Py_MIN((size_t)requested_size, (size_t)Py_SIZE(self)));
255-
PyMem_Free(obj->ob_bytes);
257+
Py_DECREF(obj->ob_bytes_head);
258+
bytearray_set_bytes(obj, new, size);
259+
obj->ob_bytes[size] = '\0'; /* Trailing null byte */
260+
261+
return 0;
256262
}
257-
else {
258-
sval = PyMem_Realloc(obj->ob_bytes, alloc);
259-
if (sval == NULL) {
260-
PyErr_NoMemory();
261-
return -1;
262-
}
263+
264+
/* Too small, grow allocation */
265+
if (_PyBytes_Resize(&obj->ob_bytes_head, alloc - 1) < 0) {
266+
bytearray_set_bytes(obj, PyBytes_FromStringAndSize(NULL, 0), 0);
267+
return -1;
263268
}
264269

265-
obj->ob_bytes = obj->ob_start = sval;
266-
Py_SET_SIZE(self, size);
267-
FT_ATOMIC_STORE_SSIZE_RELAXED(obj->ob_alloc, alloc);
270+
bytearray_set_bytes(obj, obj->ob_bytes_head, size);
268271
obj->ob_bytes[size] = '\0'; /* Trailing null byte */
269272

270273
return 0;
@@ -870,6 +873,19 @@ bytearray_ass_subscript(PyObject *op, PyObject *index, PyObject *values)
870873
return ret;
871874
}
872875

876+
static PyObject *
877+
bytearray_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds)
878+
{
879+
PyObject *obj = subtype->tp_alloc(subtype, 0);
880+
if (obj == NULL) {
881+
return NULL;
882+
}
883+
884+
PyByteArrayObject *self = _PyByteArray_CAST(obj);
885+
bytearray_set_bytes(self, PyBytes_FromStringAndSize(NULL, 0), 0);
886+
return obj;
887+
}
888+
873889
/*[clinic input]
874890
bytearray.__init__
875891
@@ -1233,9 +1249,7 @@ bytearray_dealloc(PyObject *op)
12331249
"deallocated bytearray object has exported buffers");
12341250
PyErr_Print();
12351251
}
1236-
if (self->ob_bytes != 0) {
1237-
PyMem_Free(self->ob_bytes);
1238-
}
1252+
Py_XDECREF(self->ob_bytes_head);
12391253
Py_TYPE(self)->tp_free((PyObject *)self);
12401254
}
12411255

@@ -2452,7 +2466,16 @@ static PyObject *
24522466
bytearray_alloc(PyObject *op, PyObject *Py_UNUSED(ignored))
24532467
{
24542468
PyByteArrayObject *self = _PyByteArray_CAST(op);
2455-
return PyLong_FromSsize_t(FT_ATOMIC_LOAD_SSIZE_RELAXED(self->ob_alloc));
2469+
2470+
Py_ssize_t size = PyBytes_GET_SIZE(self->ob_bytes_head);
2471+
Py_ssize_t alloc = (size == 0 ? size : size + 1);
2472+
2473+
#ifndef NDEBUG
2474+
/* Validate redundant information is redundant */
2475+
Py_ssize_t ob_alloc = FT_ATOMIC_LOAD_SSIZE_RELAXED(self->ob_alloc);
2476+
assert(alloc == ob_alloc);
2477+
#endif
2478+
return PyLong_FromSsize_t(alloc);
24562479
}
24572480

24582481
/*[clinic input]
@@ -2820,7 +2843,7 @@ PyTypeObject PyByteArray_Type = {
28202843
0, /* tp_dictoffset */
28212844
(initproc)bytearray___init__, /* tp_init */
28222845
PyType_GenericAlloc, /* tp_alloc */
2823-
PyType_GenericNew, /* tp_new */
2846+
bytearray_new, /* tp_new */
28242847
PyObject_Free, /* tp_free */
28252848
.tp_version_tag = _Py_TYPE_VERSION_BYTEARRAY,
28262849
};

0 commit comments

Comments
 (0)