Skip to content

Commit 688ea0c

Browse files
authored
[mypyc] Add minimal BytesWriter class librt (#20297)
The `librt.strings.BytesWriter` class allows building a `bytes` object from `bytes` or `u8` components. Simple example use case:
```
from librt.strings import BytesWriter
w = BytesWriter()
w.write(b'foo')
w.append(ord('!'))
print(w.getvalue())  # b'foo!'
```
It can be used as an alternative to `bytearray` or `BytesIO` in various use cases. The main benefit over stdlib classes is that we can provide (more) efficient mypyc primitives for various operations. The goal is to make it more efficient than either `bytearray` or `BytesIO` in many use cases, eventually. The implementation is based on the implementation of `librt.internal.WriteBuffer`, but it is somewhat different, since it will also support direct indexed item access (a bit like `bytearray`), so just appending to the end is not the only important use case. Direct item access is not implemented yet, but I'll add more features in follow-up PRs. This minimal implementation hasn't been optimized yet, and the API may not be final. It's experimental and only compiled if experimental features are explicitly enabled, so we can still make arbitrary API and ABI changes.
1 parent 14eaeb8 commit 688ea0c

File tree

13 files changed

+628
-2
lines changed

13 files changed

+628
-2
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from typing import final
2+
3+
@final
class BytesWriter:
    # Buffer for building a bytes object from bytes / single-byte parts.
    # Parameter names match the keyword name ("value") that the C
    # implementation of append() and write() accepts, so keyword calls that
    # type check also work at runtime.
    def append(self, /, value: int) -> None: ...
    def write(self, /, value: bytes) -> None: ...
    def getvalue(self) -> bytes: ...

mypyc/build.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ class ModDesc(NamedTuple):
5252

5353
LIBRT_MODULES = [
5454
ModDesc("librt.internal", ["librt_internal.c"], [], []),
55+
ModDesc("librt.strings", ["librt_strings.c"], [], []),
5556
ModDesc(
5657
"librt.base64",
5758
[

mypyc/codegen/emitmodule.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -611,6 +611,8 @@ def generate_c_for_modules(self) -> list[tuple[str, str]]:
611611
ext_declarations.emit_line("#include <librt_internal.h>")
612612
if any("librt.base64" in mod.capsules for mod in self.modules.values()):
613613
ext_declarations.emit_line("#include <librt_base64.h>")
614+
if any("librt.strings" in mod.capsules for mod in self.modules.values()):
615+
ext_declarations.emit_line("#include <librt_strings.h>")
614616

615617
declarations = Emitter(self.context)
616618
declarations.emit_line(f"#ifndef MYPYC_LIBRT_INTERNAL{self.group_suffix}_H")
@@ -1045,6 +1047,10 @@ def emit_module_exec_func(
10451047
emitter.emit_line("if (import_librt_base64() < 0) {")
10461048
emitter.emit_line("return -1;")
10471049
emitter.emit_line("}")
1050+
if "librt.strings" in module.capsules:
1051+
emitter.emit_line("if (import_librt_strings() < 0) {")
1052+
emitter.emit_line("return -1;")
1053+
emitter.emit_line("}")
10481054
emitter.emit_line("PyObject* modname = NULL;")
10491055
if self.multi_phase_init:
10501056
emitter.emit_line(f"{module_static} = module;")

mypyc/ir/rtypes.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -514,7 +514,11 @@ def __hash__(self) -> int:
514514

515515
KNOWN_NATIVE_TYPES: Final = {
516516
name: RPrimitive(name, is_unboxed=False, is_refcounted=True)
517-
for name in ["librt.internal.WriteBuffer", "librt.internal.ReadBuffer"]
517+
for name in [
518+
"librt.internal.WriteBuffer",
519+
"librt.internal.ReadBuffer",
520+
"librt.strings.BytesWriter",
521+
]
518522
}
519523

520524

mypyc/lib-rt/librt_strings.c

Lines changed: 344 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,344 @@
1+
#include "pythoncapi_compat.h"
2+
3+
#define PY_SSIZE_T_CLEAN
4+
#include <Python.h>
5+
#include <stdint.h>
6+
#include "CPy.h"
7+
#include "librt_strings.h"
8+
9+
#define CPY_BOOL_ERROR 2
10+
#define CPY_NONE_ERROR 2
11+
#define CPY_NONE 1
12+
13+
//
14+
// BytesWriter
15+
//
16+
17+
// Length of the default buffer embedded directly in a BytesWriter object
18+
#define WRITER_EMBEDDED_BUF_LEN 512
19+
20+
typedef struct {
21+
PyObject_HEAD
22+
char *buf; // Beginning of the buffer
23+
Py_ssize_t len; // Current length (number of bytes written)
24+
Py_ssize_t capacity; // Total capacity of the buffer
25+
char data[WRITER_EMBEDDED_BUF_LEN]; // Default buffer
26+
} BytesWriterObject;
27+
28+
// Store `v` (converted to `type`) at the current end of the writer `data`
// and advance its length by sizeof(type). The caller must already have
// ensured that the buffer has room for sizeof(type) more bytes.
// Macro arguments are parenthesized so that arbitrary expressions can be
// passed safely.
#define _WRITE(data, type, v) \
    do { \
        *(type *)(((BytesWriterObject *)(data))->buf + ((BytesWriterObject *)(data))->len) = (v); \
        ((BytesWriterObject *)(data))->len += sizeof(type); \
    } while (0)
33+
34+
static PyTypeObject BytesWriterType;
35+
36+
static bool
37+
_grow_buffer(BytesWriterObject *data, Py_ssize_t n) {
38+
Py_ssize_t target = data->len + n;
39+
Py_ssize_t size = data->capacity;
40+
Py_ssize_t old_size = size;
41+
do {
42+
size *= 2;
43+
} while (target >= size);
44+
if (old_size == WRITER_EMBEDDED_BUF_LEN) {
45+
// Move from embedded buffer to heap-allocated buffer
46+
data->buf = PyMem_Malloc(size);
47+
if (data->buf != NULL) {
48+
memcpy(data->buf, data->data, WRITER_EMBEDDED_BUF_LEN);
49+
}
50+
} else {
51+
data->buf = PyMem_Realloc(data->buf, size);
52+
}
53+
if (unlikely(data->buf == NULL)) {
54+
PyErr_NoMemory();
55+
return false;
56+
}
57+
data->capacity = size;
58+
return true;
59+
}
60+
61+
static inline bool
62+
ensure_bytes_writer_size(BytesWriterObject *data, Py_ssize_t n) {
63+
if (likely(data->capacity - data->len >= n)) {
64+
return true;
65+
} else {
66+
return _grow_buffer(data, n);
67+
}
68+
}
69+
70+
static inline void
71+
BytesWriter_init_internal(BytesWriterObject *self) {
72+
self->buf = self->data;
73+
self->len = 0;
74+
self->capacity = WRITER_EMBEDDED_BUF_LEN;
75+
}
76+
77+
static PyObject*
78+
BytesWriter_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
79+
{
80+
if (type != &BytesWriterType) {
81+
PyErr_SetString(PyExc_TypeError, "BytesWriter cannot be subclassed");
82+
return NULL;
83+
}
84+
85+
BytesWriterObject *self = (BytesWriterObject *)type->tp_alloc(type, 0);
86+
if (self != NULL) {
87+
BytesWriter_init_internal(self);
88+
}
89+
return (PyObject *)self;
90+
}
91+
92+
static PyObject *
93+
BytesWriter_internal(void) {
94+
BytesWriterObject *self = (BytesWriterObject *)BytesWriterType.tp_alloc(&BytesWriterType, 0);
95+
if (self == NULL)
96+
return NULL;
97+
BytesWriter_init_internal(self);
98+
return (PyObject *)self;
99+
}
100+
101+
static int
102+
BytesWriter_init(BytesWriterObject *self, PyObject *args, PyObject *kwds)
103+
{
104+
if (!PyArg_ParseTuple(args, "")) {
105+
return -1;
106+
}
107+
108+
if (kwds != NULL && PyDict_Size(kwds) > 0) {
109+
PyErr_SetString(PyExc_TypeError,
110+
"BytesWriter() takes no keyword arguments");
111+
return -1;
112+
}
113+
114+
BytesWriter_init_internal(self);
115+
return 0;
116+
}
117+
118+
static void
119+
BytesWriter_dealloc(BytesWriterObject *self)
120+
{
121+
if (self->buf != self->data) {
122+
PyMem_Free(self->buf);
123+
self->buf = NULL;
124+
}
125+
Py_TYPE(self)->tp_free((PyObject *)self);
126+
}
127+
128+
// C-level getvalue (exported through the capsule API table): return a new
// bytes object holding a copy of everything written so far.
// `self` is not type checked here.
static PyObject*
BytesWriter_getvalue_internal(PyObject *self)
{
    BytesWriterObject *writer = (BytesWriterObject *)self;
    char *start = writer->buf;
    Py_ssize_t count = writer->len;
    return PyBytes_FromStringAndSize(start, count);
}
134+
135+
static PyObject*
136+
BytesWriter_repr(BytesWriterObject *self)
137+
{
138+
PyObject *value = BytesWriter_getvalue_internal((PyObject *)self);
139+
if (value == NULL) {
140+
return NULL;
141+
}
142+
PyObject *value_repr = PyObject_Repr(value);
143+
Py_DECREF(value);
144+
if (value_repr == NULL) {
145+
return NULL;
146+
}
147+
PyObject *result = PyUnicode_FromFormat("BytesWriter(%U)", value_repr);
148+
Py_DECREF(value_repr);
149+
return result;
150+
}
151+
152+
static PyObject*
153+
BytesWriter_getvalue(BytesWriterObject *self, PyObject *Py_UNUSED(ignored))
154+
{
155+
return PyBytes_FromStringAndSize(self->buf, self->len);
156+
}
157+
158+
static PyObject* BytesWriter_append(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames);
159+
static PyObject* BytesWriter_write(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames);
160+
161+
static PyMethodDef BytesWriter_methods[] = {
162+
{"append", (PyCFunction) BytesWriter_append, METH_FASTCALL | METH_KEYWORDS,
163+
PyDoc_STR("Append a single byte to the buffer")
164+
},
165+
{"write", (PyCFunction) BytesWriter_write, METH_FASTCALL | METH_KEYWORDS,
166+
PyDoc_STR("Append bytes to the buffer")
167+
},
168+
{"getvalue", (PyCFunction) BytesWriter_getvalue, METH_NOARGS,
169+
"Return the buffer content as bytes object"
170+
},
171+
{NULL} /* Sentinel */
172+
};
173+
174+
static PyTypeObject BytesWriterType = {
175+
.ob_base = PyVarObject_HEAD_INIT(NULL, 0)
176+
.tp_name = "BytesWriter",
177+
.tp_doc = PyDoc_STR("Memory buffer for building bytes objects from parts"),
178+
.tp_basicsize = sizeof(BytesWriterObject),
179+
.tp_itemsize = 0,
180+
.tp_flags = Py_TPFLAGS_DEFAULT,
181+
.tp_new = BytesWriter_new,
182+
.tp_init = (initproc) BytesWriter_init,
183+
.tp_dealloc = (destructor) BytesWriter_dealloc,
184+
.tp_methods = BytesWriter_methods,
185+
.tp_repr = (reprfunc)BytesWriter_repr,
186+
};
187+
188+
static inline bool
189+
check_bytes_writer(PyObject *data) {
190+
if (unlikely(Py_TYPE(data) != &BytesWriterType)) {
191+
PyErr_Format(
192+
PyExc_TypeError, "data must be a BytesWriter object, got %s", Py_TYPE(data)->tp_name
193+
);
194+
return false;
195+
}
196+
return true;
197+
}
198+
199+
static char
200+
BytesWriter_write_internal(BytesWriterObject *self, PyObject *value) {
201+
const char *data;
202+
Py_ssize_t size;
203+
if (likely(PyBytes_Check(value))) {
204+
data = PyBytes_AS_STRING(value);
205+
size = PyBytes_GET_SIZE(value);
206+
} else {
207+
data = PyByteArray_AS_STRING(value);
208+
size = PyByteArray_GET_SIZE(value);
209+
}
210+
// Write bytes content.
211+
if (!ensure_bytes_writer_size(self, size))
212+
return CPY_NONE_ERROR;
213+
memcpy(self->buf + self->len, data, size);
214+
self->len += size;
215+
return CPY_NONE;
216+
}
217+
218+
// Python-level write(value) method: append a bytes or bytearray object.
// Raises TypeError if `self` is not a BytesWriter or `value` is neither
// bytes nor bytearray. Returns None on success.
static PyObject*
BytesWriter_write(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames) {
    static const char * const kwlist[] = {"value", 0};
    static CPyArg_Parser parser = {"O:write", kwlist, 0};
    PyObject *payload;
    if (unlikely(!CPyArg_ParseStackAndKeywordsSimple(args, nargs, kwnames, &parser, &payload))) {
        return NULL;
    }
    if (!check_bytes_writer(self)) {
        return NULL;
    }

    bool is_supported = PyBytes_Check(payload) || PyByteArray_Check(payload);
    if (unlikely(!is_supported)) {
        PyErr_SetString(PyExc_TypeError, "value must be a bytes or bytearray object");
        return NULL;
    }

    char status = BytesWriter_write_internal((BytesWriterObject *)self, payload);
    if (unlikely(status == CPY_NONE_ERROR)) {
        return NULL;
    }
    Py_RETURN_NONE;
}
239+
240+
static inline char
241+
BytesWriter_append_internal(BytesWriterObject *self, uint8_t value) {
242+
if (!ensure_bytes_writer_size(self, 1))
243+
return CPY_NONE_ERROR;
244+
_WRITE(self, uint8_t, value);
245+
return CPY_NONE;
246+
}
247+
248+
// Python-level append(value) method: append a single byte given as an
// integer. The argument is converted with CPyLong_AsUInt8; if that
// conversion reports an error, a "u8" TypeError is raised via CPy_TypeError.
// Returns None on success.
static PyObject*
BytesWriter_append(PyObject *self, PyObject *const *args, size_t nargs, PyObject *kwnames) {
    static const char * const kwlist[] = {"value", 0};
    static CPyArg_Parser parser = {"O:append", kwlist, 0};
    PyObject *boxed;
    if (unlikely(!CPyArg_ParseStackAndKeywordsSimple(args, nargs, kwnames, &parser, &boxed))) {
        return NULL;
    }
    if (!check_bytes_writer(self)) {
        return NULL;
    }

    uint8_t byte = CPyLong_AsUInt8(boxed);
    // The error sentinel is also a valid u8 value, so an exception must be
    // pending as well for this to count as a failure.
    if (unlikely(byte == CPY_LL_UINT_ERROR && PyErr_Occurred())) {
        CPy_TypeError("u8", boxed);
        return NULL;
    }

    char status = BytesWriter_append_internal((BytesWriterObject *)self, byte);
    if (unlikely(status == CPY_NONE_ERROR)) {
        return NULL;
    }
    Py_RETURN_NONE;
}
270+
271+
static PyTypeObject *
272+
BytesWriter_type_internal(void) {
273+
return &BytesWriterType; // Return borrowed reference
274+
};
275+
276+
static PyMethodDef librt_strings_module_methods[] = {
277+
{NULL, NULL, 0, NULL}
278+
};
279+
280+
#ifdef MYPYC_EXPERIMENTAL

// Version accessors placed in the first two slots of the exported C API
// table; they return the version constants this module was compiled with.

static int
strings_abi_version(void)
{
    return LIBRT_STRINGS_ABI_VERSION;
}

static int
strings_api_version(void)
{
    return LIBRT_STRINGS_API_VERSION;
}

#endif  // MYPYC_EXPERIMENTAL
293+
294+
static int
295+
librt_strings_module_exec(PyObject *m)
296+
{
297+
#ifdef MYPYC_EXPERIMENTAL
298+
if (PyType_Ready(&BytesWriterType) < 0) {
299+
return -1;
300+
}
301+
if (PyModule_AddObjectRef(m, "BytesWriter", (PyObject *) &BytesWriterType) < 0) {
302+
return -1;
303+
}
304+
305+
// Export mypy internal C API, be careful with the order!
306+
static void *librt_strings_api[LIBRT_STRINGS_API_LEN] = {
307+
(void *)strings_abi_version,
308+
(void *)strings_api_version,
309+
(void *)BytesWriter_internal,
310+
(void *)BytesWriter_getvalue_internal,
311+
(void *)BytesWriter_append_internal,
312+
(void *)BytesWriter_write_internal,
313+
(void *)BytesWriter_type_internal,
314+
};
315+
PyObject *c_api_object = PyCapsule_New((void *)librt_strings_api, "librt.strings._C_API", NULL);
316+
if (PyModule_Add(m, "_C_API", c_api_object) < 0) {
317+
return -1;
318+
}
319+
#endif
320+
return 0;
321+
}
322+
323+
static PyModuleDef_Slot librt_strings_module_slots[] = {
324+
{Py_mod_exec, librt_strings_module_exec},
325+
#ifdef Py_MOD_GIL_NOT_USED
326+
{Py_mod_gil, Py_MOD_GIL_NOT_USED},
327+
#endif
328+
{0, NULL}
329+
};
330+
331+
static PyModuleDef librt_strings_module = {
332+
.m_base = PyModuleDef_HEAD_INIT,
333+
.m_name = "strings",
334+
.m_doc = "Utilities for working with str and bytes objects",
335+
.m_size = 0,
336+
.m_methods = librt_strings_module_methods,
337+
.m_slots = librt_strings_module_slots,
338+
};
339+
340+
PyMODINIT_FUNC
341+
PyInit_strings(void)
342+
{
343+
return PyModuleDef_Init(&librt_strings_module);
344+
}

0 commit comments

Comments
 (0)