Skip to content

Commit e437ff8

Browse files
committed
make the default string dtype not have an NA object
1 parent 30d7812 commit e437ff8

File tree

11 files changed

+655
-385
lines changed

11 files changed

+655
-385
lines changed

stringdtype/meson.build

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ py.install_sources(
4343
[
4444
'stringdtype/__init__.py',
4545
'stringdtype/scalar.py',
46-
'stringdtype/missing.py',
4746
],
4847
subdir: 'stringdtype',
4948
pure: false

stringdtype/stringdtype/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
33
"""
44

5-
from .missing import NA # isort: skip
65
from .scalar import StringScalar # isort: skip
76
from ._main import StringDType, _memory_usage
87

stringdtype/stringdtype/missing.py

Lines changed: 0 additions & 15 deletions
This file was deleted.

stringdtype/stringdtype/src/casts.c

Lines changed: 266 additions & 180 deletions
Large diffs are not rendered by default.

stringdtype/stringdtype/src/dtype.c

Lines changed: 62 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
#include "static_string.h"
55

66
PyTypeObject *StringScalar_Type = NULL;
7-
static PyTypeObject *StringNA_Type = NULL;
8-
PyObject *NA_OBJ = NULL;
97

108
/*
119
* Internal helper to create new instances
@@ -20,7 +18,7 @@ new_stringdtype_instance(PyObject *na_object, int coerce)
2018
return NULL;
2119
}
2220

23-
Py_INCREF(na_object);
21+
Py_XINCREF(na_object);
2422
((StringDTypeObject *)new)->na_object = na_object;
2523
((StringDTypeObject *)new)->coerce = coerce;
2624

@@ -108,7 +106,7 @@ string_discover_descriptor_from_pyobject(PyTypeObject *NPY_UNUSED(cls),
108106
return NULL;
109107
}
110108

111-
PyArray_Descr *ret = (PyArray_Descr *)new_stringdtype_instance(NA_OBJ, 1);
109+
PyArray_Descr *ret = (PyArray_Descr *)new_stringdtype_instance(NULL, 1);
112110
if (ret == NULL) {
113111
return NULL;
114112
}
@@ -131,7 +129,7 @@ stringdtype_setitem(StringDTypeObject *descr, PyObject *obj, char **dataptr)
131129

132130
// setting NA *must* check pointer equality since NA types might not
133131
// allow equality
134-
if (obj == na_object) {
132+
if (na_object != NULL && obj == na_object) {
135133
// do nothing, ssfree already NULLed the struct ssdata points to
136134
// so it already contains an NA value
137135
}
@@ -173,11 +171,17 @@ stringdtype_getitem(StringDTypeObject *descr, char **dataptr)
173171
{
174172
PyObject *val_obj = NULL;
175173
ss *sdata = (ss *)dataptr;
174+
int hasnull = descr->na_object != NULL;
176175

177176
if (ss_isnull(sdata)) {
178-
PyObject *na_object = descr->na_object;
179-
Py_INCREF(na_object);
180-
val_obj = na_object;
177+
if (hasnull) {
178+
PyObject *na_object = descr->na_object;
179+
Py_INCREF(na_object);
180+
val_obj = na_object;
181+
}
182+
else {
183+
val_obj = PyUnicode_FromStringAndSize("", 0);
184+
}
181185
}
182186
else {
183187
char *data = sdata->buf;
@@ -213,17 +217,37 @@ nonzero(void *data, void *NPY_UNUSED(arr))
213217
// Implementation of PyArray_CompareFunc.
214218
// Compares unicode strings by their code points.
215219
int
216-
compare(void *a, void *b, void *NPY_UNUSED(arr))
220+
compare(void *a, void *b, void *arr)
221+
{
222+
StringDTypeObject *descr = (StringDTypeObject *)PyArray_DESCR(arr);
223+
return _compare(a, b, descr);
224+
}
225+
226+
int
227+
_compare(void *a, void *b, StringDTypeObject *descr)
217228
{
218-
ss *ss_a = (ss *)a;
219-
ss *ss_b = (ss *)b;
229+
int hasnull = descr->na_object != NULL;
230+
const ss *ss_a = (ss *)a;
231+
const ss *ss_b = (ss *)b;
220232
int a_is_null = ss_isnull(ss_a);
221233
int b_is_null = ss_isnull(ss_b);
222-
if (a_is_null) {
223-
return 1;
224-
}
225-
else if (b_is_null) {
226-
return -1;
234+
if (NPY_UNLIKELY(a_is_null || b_is_null)) {
235+
if (hasnull) {
236+
if (a_is_null) {
237+
return 1;
238+
}
239+
else if (b_is_null) {
240+
return -1;
241+
}
242+
}
243+
else {
244+
if (a_is_null) {
245+
ss_a = &EMPTY_STRING;
246+
}
247+
if (b_is_null) {
248+
ss_b = &EMPTY_STRING;
249+
}
250+
}
227251
}
228252
return strcmp(ss_a->buf, ss_b->buf);
229253
}
@@ -344,22 +368,18 @@ static PyType_Slot StringDType_Slots[] = {
344368
static PyObject *
345369
stringdtype_new(PyTypeObject *NPY_UNUSED(cls), PyObject *args, PyObject *kwds)
346370
{
347-
static char *kwargs_strs[] = {"size", "na_object", "coerce", NULL};
371+
static char *kwargs_strs[] = {"size", "coerce", "na_object", NULL};
348372

349373
long size = 0;
350374
PyObject *na_object = NULL;
351375
int coerce = 1;
352376

353-
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|lOp:StringDType",
354-
kwargs_strs, &size, &na_object,
355-
&coerce)) {
377+
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|lpO:StringDType",
378+
kwargs_strs, &size, &coerce,
379+
&na_object)) {
356380
return NULL;
357381
}
358382

359-
if (na_object == NULL) {
360-
na_object = NA_OBJ;
361-
}
362-
363383
PyObject *ret = new_stringdtype_instance(na_object, coerce);
364384

365385
return ret;
@@ -379,12 +399,11 @@ stringdtype_repr(StringDTypeObject *self)
379399
PyObject *na_object = self->na_object;
380400
int coerce = self->coerce;
381401

382-
// TODO: handle non-default NA
383-
if (na_object != NA_OBJ && coerce == 0) {
402+
if (na_object != NULL && coerce == 0) {
384403
ret = PyUnicode_FromFormat("StringDType(na_object=%R, coerce=False)",
385404
na_object);
386405
}
387-
else if (na_object != NA_OBJ) {
406+
else if (na_object != NULL) {
388407
ret = PyUnicode_FromFormat("StringDType(na_object=%R)", na_object);
389408
}
390409
else if (coerce == 0) {
@@ -424,9 +443,16 @@ stringdtype__reduce__(StringDTypeObject *self)
424443

425444
PyTuple_SET_ITEM(ret, 0, obj);
426445

427-
PyTuple_SET_ITEM(ret, 1,
428-
Py_BuildValue("(NOi)", PyLong_FromLong(0),
429-
self->na_object, self->coerce));
446+
if (self->na_object != NULL) {
447+
PyTuple_SET_ITEM(ret, 1,
448+
Py_BuildValue("(NiO)", PyLong_FromLong(0),
449+
self->coerce, self->na_object));
450+
}
451+
else {
452+
PyTuple_SET_ITEM(
453+
ret, 1,
454+
Py_BuildValue("(Ni)", PyLong_FromLong(0), self->coerce));
455+
}
430456

431457
PyTuple_SET_ITEM(ret, 2, Py_BuildValue("(l)", PICKLE_VERSION));
432458

@@ -571,11 +597,11 @@ init_string_dtype(void)
571597
return 0;
572598
}
573599

574-
int
575-
init_string_na_object(PyObject *mod)
600+
void
601+
gil_error(PyObject *type, const char *msg)
576602
{
577-
NA_OBJ = PyObject_GetAttrString(mod, "NA");
578-
StringNA_Type = Py_TYPE(NA_OBJ);
579-
Py_INCREF(StringNA_Type);
580-
return 0;
603+
PyGILState_STATE gstate;
604+
gstate = PyGILState_Ensure();
605+
PyErr_SetString(type, msg);
606+
PyGILState_Release(gstate);
581607
}

stringdtype/stringdtype/src/dtype.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ typedef struct {
2929

3030
extern StringDType_type StringDType;
3131
extern PyTypeObject *StringScalar_Type;
32-
extern PyObject *NA_OBJ;
3332

3433
PyObject *
3534
new_stringdtype_instance(PyObject *na_object, int coerce);
@@ -38,14 +37,18 @@ int
3837
init_string_dtype(void);
3938

4039
int
41-
compare(void *, void *, void *);
40+
_compare(void *, void *, StringDTypeObject *);
4241

4342
int
4443
init_string_na_object(PyObject *mod);
4544

4645
int
4746
stringdtype_setitem(StringDTypeObject *descr, PyObject *obj, char **dataptr);
4847

48+
// Set the Python error indicator from code that is running without the GIL held
49+
void
50+
gil_error(PyObject *type, const char *msg);
51+
4952
// from dtypemeta.h, not public in numpy
5053
#define NPY_DTYPE(descr) ((PyArray_DTypeMeta *)Py_TYPE(descr))
5154

stringdtype/stringdtype/src/main.c

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,10 +114,6 @@ PyInit__main(void)
114114

115115
Py_DECREF(mod);
116116

117-
if (init_string_na_object(mod) < 0) {
118-
goto error;
119-
}
120-
121117
if (init_string_dtype() < 0) {
122118
goto error;
123119
}

stringdtype/stringdtype/src/static_string.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#include "static_string.h"
22

3-
static ss EMPTY = {0, ""};
3+
const ss EMPTY_STRING = {0, ""};
44

55
int
66
ssnewlen(const char *init, size_t len, ss *to_init)
@@ -11,7 +11,7 @@ ssnewlen(const char *init, size_t len, ss *to_init)
1111

1212
if (len == 0) {
1313
to_init->len = 0;
14-
to_init->buf = EMPTY.buf;
14+
to_init->buf = EMPTY_STRING.buf;
1515
}
1616

1717
// one extra byte for null terminator
@@ -38,7 +38,7 @@ void
3838
ssfree(ss *str)
3939
{
4040
if (str->buf != NULL) {
41-
if (str->buf != EMPTY.buf) {
41+
if (str->buf != EMPTY_STRING.buf) {
4242
free(str->buf);
4343
}
4444
str->buf = NULL;
@@ -47,7 +47,7 @@ ssfree(ss *str)
4747
}
4848

4949
int
50-
ssdup(ss *in, ss *out)
50+
ssdup(const ss *in, ss *out)
5151
{
5252
if (ss_isnull(in)) {
5353
out->len = 0;
@@ -79,7 +79,7 @@ ssnewemptylen(size_t num_bytes, ss *out)
7979
}
8080

8181
int
82-
ss_isnull(ss *in)
82+
ss_isnull(const ss *in)
8383
{
8484
if (in->len == 0 && in->buf == NULL) {
8585
return 1;

stringdtype/stringdtype/src/static_string.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ typedef struct ss {
99
char *buf;
1010
} ss;
1111

12+
extern const ss EMPTY_STRING;
13+
1214
// Allocates a new buffer for *to_init*, filling with the copied contents of
1315
// *init* and sets *to_init->len* to *len*. Returns -1 if malloc fails and -2
1416
// if *to_init* is not empty. Returns 0 on success.
@@ -24,7 +26,7 @@ ssfree(ss *str);
2426
// *out*. Returns -1 if malloc fails and -2 if *out* is not empty. Returns 0 on
2527
// success.
2628
int
27-
ssdup(ss *in, ss *out);
29+
ssdup(const ss *in, ss *out);
2830

2931
// Allocates a new string buffer for *out* with enough capacity to store
3032
// *num_bytes* of text. The actual allocation is num_bytes + 1 bytes, to
@@ -37,6 +39,6 @@ ssnewemptylen(size_t num_bytes, ss *out);
3739
// Determine if *in* corresponds to a NULL ss struct (e.g. len is zero and buf
3840
// is NULL). Returns 1 if this is the case and zero otherwise. Cannot fail.
3941
int
40-
ss_isnull(ss *in);
42+
ss_isnull(const ss *in);
4143

4244
#endif /*_NPY_STATIC_STRING_H */

0 commit comments

Comments
 (0)