Skip to content

Commit 7c5bcbb

Browse files
authored
Merge pull request #86 from ngoldbaum/alloc-refactor
Better naming of public symbols and small string optimization
2 parents fb1b5ba + 7972b7b commit 7c5bcbb

File tree

8 files changed: +706 additions, -541 deletions (lines changed)

stringdtype/stringdtype/src/casts.c

Lines changed: 234 additions & 285 deletions
Large diffs are not rendered by default.

stringdtype/stringdtype/src/dtype.c

Lines changed: 84 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -20,19 +20,30 @@ new_stringdtype_instance(PyObject *na_object, int coerce)
2020

2121
Py_XINCREF(na_object);
2222
((StringDTypeObject *)new)->na_object = na_object;
23+
npy_packed_static_string packed_na_name = *NPY_EMPTY_STRING;
24+
npy_packed_static_string packed_default_string = *NPY_EMPTY_STRING;
2325
int hasnull = na_object != NULL;
2426
int has_nan_na = 0;
2527
int has_string_na = 0;
26-
ss default_string = EMPTY_STRING;
2728
if (hasnull) {
2829
// first check for a string
2930
if (PyUnicode_Check(na_object)) {
3031
has_string_na = 1;
3132
Py_ssize_t size = 0;
3233
const char *buf = PyUnicode_AsUTF8AndSize(na_object, &size);
33-
default_string.len = size;
34-
// discards const, how to avoid?
35-
default_string.buf = (char *)buf;
34+
int res = npy_string_newsize(buf, (size_t)size,
35+
&packed_default_string);
36+
if (res == -1) {
37+
PyErr_NoMemory();
38+
Py_DECREF(new);
39+
return NULL;
40+
}
41+
else if (res == -2) {
42+
// this should never happen
43+
assert(0);
44+
Py_DECREF(new);
45+
return NULL;
46+
}
3647
}
3748
else {
3849
// treat as nan-like if != comparison returns a object whose truth
@@ -53,15 +64,50 @@ new_stringdtype_instance(PyObject *na_object, int coerce)
5364
}
5465
Py_DECREF(eq);
5566
}
67+
PyObject *na_pystr = PyObject_Str(na_object);
68+
if (na_pystr == NULL) {
69+
Py_DECREF(new);
70+
return NULL;
71+
}
72+
73+
Py_ssize_t size = 0;
74+
const char *utf8_ptr = PyUnicode_AsUTF8AndSize(na_pystr, &size);
75+
// discard const to initialize buffer
76+
int res = npy_string_newsize(utf8_ptr, (size_t)size, &packed_na_name);
77+
if (res == -1) {
78+
PyErr_NoMemory();
79+
Py_DECREF(new);
80+
return NULL;
81+
}
82+
else if (res == -2) {
83+
// this should never happen
84+
assert(0);
85+
Py_DECREF(new);
86+
return NULL;
87+
}
88+
Py_DECREF(na_pystr);
5689
}
57-
((StringDTypeObject *)new)->has_nan_na = has_nan_na;
58-
((StringDTypeObject *)new)->has_string_na = has_string_na;
59-
((StringDTypeObject *)new)->default_string = default_string;
60-
((StringDTypeObject *)new)->coerce = coerce;
90+
91+
StringDTypeObject *snew = (StringDTypeObject *)new;
92+
93+
snew->has_nan_na = has_nan_na;
94+
snew->has_string_na = has_string_na;
95+
snew->packed_default_string = packed_default_string;
96+
snew->packed_na_name = packed_na_name;
97+
snew->coerce = coerce;
98+
99+
npy_static_string default_string = {0, NULL};
100+
npy_load_string(&snew->packed_default_string, &default_string);
101+
102+
npy_static_string na_name = {0, NULL};
103+
npy_load_string(&snew->packed_na_name, &na_name);
104+
105+
snew->na_name = na_name;
106+
snew->default_string = default_string;
61107

62108
PyArray_Descr *base = (PyArray_Descr *)new;
63-
base->elsize = sizeof(ss);
64-
base->alignment = _Alignof(ss);
109+
base->elsize = sizeof(npy_static_string);
110+
base->alignment = _Alignof(npy_static_string);
65111
base->flags |= NPY_NEEDS_INIT;
66112
base->flags |= NPY_LIST_PICKLE;
67113
base->flags |= NPY_ITEM_REFCOUNT;
@@ -161,20 +207,19 @@ string_discover_descriptor_from_pyobject(PyTypeObject *NPY_UNUSED(cls),
161207
int
162208
stringdtype_setitem(StringDTypeObject *descr, PyObject *obj, char **dataptr)
163209
{
164-
ss *sdata = (ss *)dataptr;
210+
npy_packed_static_string *sdata = (npy_packed_static_string *)dataptr;
165211

166212
// free if dataptr holds preexisting string data,
167-
// ssfree does a NULL check
168-
ssfree(sdata);
213+
// npy_string_free does a NULL check and checks for small strings
214+
npy_string_free(sdata);
169215

170216
// borrow reference
171217
PyObject *na_object = descr->na_object;
172218

173219
// setting NA *must* check pointer equality since NA types might not
174220
// allow equality
175221
if (na_object != NULL && obj == na_object) {
176-
// do nothing, ssfree already NULLed the struct ssdata points to
177-
// so it already contains a NA value
222+
*sdata = *NPY_NULL_STRING;
178223
}
179224
else {
180225
PyObject *val_obj = get_value(obj, descr->coerce);
@@ -190,8 +235,7 @@ stringdtype_setitem(StringDTypeObject *descr, PyObject *obj, char **dataptr)
190235
return -1;
191236
}
192237

193-
// copies contents of val into item_val->buf
194-
int res = ssnewlen(val, length, sdata);
238+
int res = npy_string_newsize(val, length, sdata);
195239

196240
if (res == -1) {
197241
PyErr_NoMemory();
@@ -213,10 +257,11 @@ static PyObject *
213257
stringdtype_getitem(StringDTypeObject *descr, char **dataptr)
214258
{
215259
PyObject *val_obj = NULL;
216-
ss *sdata = (ss *)dataptr;
260+
npy_packed_static_string *psdata = (npy_packed_static_string *)dataptr;
261+
npy_static_string sdata = {0, NULL};
217262
int hasnull = descr->na_object != NULL;
218263

219-
if (ss_isnull(sdata)) {
264+
if (npy_load_string(psdata, &sdata)) {
220265
if (hasnull) {
221266
PyObject *na_object = descr->na_object;
222267
Py_INCREF(na_object);
@@ -227,9 +272,7 @@ stringdtype_getitem(StringDTypeObject *descr, char **dataptr)
227272
}
228273
}
229274
else {
230-
char *data = sdata->buf;
231-
size_t len = sdata->len;
232-
val_obj = PyUnicode_FromStringAndSize(data, len);
275+
val_obj = PyUnicode_FromStringAndSize(sdata.buf, sdata.size);
233276
if (val_obj == NULL) {
234277
return NULL;
235278
}
@@ -254,7 +297,7 @@ stringdtype_getitem(StringDTypeObject *descr, char **dataptr)
254297
npy_bool
255298
nonzero(void *data, void *NPY_UNUSED(arr))
256299
{
257-
return ((ss *)data)->len != 0;
300+
return npy_string_size((npy_packed_static_string *)data) != 0;
258301
}
259302

260303
// Implementation of PyArray_CompareFunc.
@@ -278,11 +321,13 @@ _compare(void *a, void *b, StringDTypeObject *descr)
278321
return 0;
279322
}
280323
}
281-
const ss *default_string = &descr->default_string;
282-
const ss *ss_a = (ss *)a;
283-
const ss *ss_b = (ss *)b;
284-
int a_is_null = ss_isnull(ss_a);
285-
int b_is_null = ss_isnull(ss_b);
324+
npy_static_string *default_string = &descr->default_string;
325+
const npy_packed_static_string *ps_a = (npy_packed_static_string *)a;
326+
npy_static_string s_a = {0, NULL};
327+
int a_is_null = npy_load_string(ps_a, &s_a);
328+
const npy_packed_static_string *ps_b = (npy_packed_static_string *)b;
329+
npy_static_string s_b = {0, NULL};
330+
int b_is_null = npy_load_string(ps_b, &s_b);
286331
if (NPY_UNLIKELY(a_is_null || b_is_null)) {
287332
if (hasnull && !has_string_na) {
288333
if (has_nan_na) {
@@ -303,22 +348,22 @@ _compare(void *a, void *b, StringDTypeObject *descr)
303348
}
304349
else {
305350
if (a_is_null) {
306-
ss_a = default_string;
351+
s_a = *default_string;
307352
}
308353
if (b_is_null) {
309-
ss_b = default_string;
354+
s_b = *default_string;
310355
}
311356
}
312357
}
313-
return sscmp(ss_a, ss_b);
358+
return npy_string_cmp(&s_a, &s_b);
314359
}
315360

316361
// PyArray_ArgFunc
317362
// The max element is the one with the highest unicode code point.
318363
int
319364
argmax(void *data, npy_intp n, npy_intp *max_ind, void *arr)
320365
{
321-
ss *dptr = (ss *)data;
366+
npy_packed_static_string *dptr = (npy_packed_static_string *)data;
322367
*max_ind = 0;
323368
for (int i = 1; i < n; i++) {
324369
if (compare(&dptr[i], &dptr[*max_ind], arr) > 0) {
@@ -333,7 +378,7 @@ argmax(void *data, npy_intp n, npy_intp *max_ind, void *arr)
333378
int
334379
argmin(void *data, npy_intp n, npy_intp *min_ind, void *arr)
335380
{
336-
ss *dptr = (ss *)data;
381+
npy_packed_static_string *dptr = (npy_packed_static_string *)data;
337382
*min_ind = 0;
338383
for (int i = 1; i < n; i++) {
339384
if (compare(&dptr[i], &dptr[*min_ind], arr) < 0) {
@@ -358,8 +403,8 @@ stringdtype_clear_loop(void *NPY_UNUSED(traverse_context),
358403
{
359404
while (size--) {
360405
if (data != NULL) {
361-
ssfree((ss *)data);
362-
memset(data, 0, sizeof(ss));
406+
npy_string_free((npy_packed_static_string *)data);
407+
memset(data, 0, sizeof(npy_packed_static_string));
363408
}
364409
data += stride;
365410
}
@@ -388,9 +433,7 @@ stringdtype_fill_zero_loop(void *NPY_UNUSED(traverse_context),
388433
NpyAuxData *NPY_UNUSED(auxdata))
389434
{
390435
while (size--) {
391-
if (ssnewlen("", 0, (ss *)(data)) < 0) {
392-
return -1;
393-
}
436+
*(npy_packed_static_string *)(data) = *NPY_EMPTY_STRING;
394437
data += stride;
395438
}
396439
return 0;
@@ -538,6 +581,9 @@ stringdtype_new(PyTypeObject *NPY_UNUSED(cls), PyObject *args, PyObject *kwds)
538581
static void
539582
stringdtype_dealloc(StringDTypeObject *self)
540583
{
584+
Py_XDECREF(self->na_object);
585+
npy_string_free(&self->packed_default_string);
586+
npy_string_free(&self->packed_na_name);
541587
PyArrayDescr_Type.tp_dealloc((PyObject *)self);
542588
}
543589

stringdtype/stringdtype/src/dtype.h

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,10 @@ typedef struct {
2525
int coerce;
2626
int has_nan_na;
2727
int has_string_na;
28-
ss default_string;
28+
npy_static_string default_string;
29+
npy_packed_static_string packed_default_string;
30+
npy_static_string na_name;
31+
npy_packed_static_string packed_na_name;
2932
} StringDTypeObject;
3033

3134
typedef struct {

stringdtype/stringdtype/src/main.c

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -58,8 +58,11 @@ _memory_usage(PyObject *NPY_UNUSED(self), PyObject *obj)
5858
npy_intp count = *innersizeptr;
5959

6060
while (count--) {
61-
// +1 byte for the null terminator
62-
memory_usage += ((ss *)in)->len + 1;
61+
size_t size = npy_string_size(((npy_packed_static_string *)in));
62+
// FIXME: add a way for a string to report its heap size usage
63+
if (size > (sizeof(npy_static_string) - 1)) {
64+
memory_usage += size;
65+
}
6366
in += stride;
6467
}
6568

@@ -75,7 +78,7 @@ _memory_usage(PyObject *NPY_UNUSED(self), PyObject *obj)
7578
static PyMethodDef string_methods[] = {
7679
{"_memory_usage", _memory_usage, METH_O,
7780
"get memory usage for an array"},
78-
{NULL},
81+
{NULL, NULL, 0, NULL},
7982
};
8083

8184
static struct PyModuleDef moduledef = {

0 commit comments

Comments
 (0)