Skip to content

Commit ba1aa5e

Browse files
committed
Initial support for missing data
1 parent 26c270c commit ba1aa5e

File tree

12 files changed

+228
-93
lines changed

12 files changed

+228
-93
lines changed

stringdtype/meson.build

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ srcs = [
3535
py.install_sources(
3636
[
3737
'stringdtype/__init__.py',
38-
'stringdtype/scalar.py'
38+
'stringdtype/scalar.py',
39+
'stringdtype/missing.py',
3940
],
4041
subdir: 'stringdtype'
4142
)

stringdtype/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,6 @@ per-file-ignores = {"__init__.py" = ["F401"]}
3131

3232
[tool.meson-python.args]
3333
dist = []
34-
setup = ["-Ddebug=true", "-Doptimization=2"]
34+
setup = ["-Ddebug=true", "-Doptimization=0"]
3535
compile = []
3636
install = []

stringdtype/stringdtype/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
33
"""
44

5+
from .missing import NA # isort: skip
56
from .scalar import StringScalar # isort: skip
67
from ._main import StringDType, _memory_usage
78

89
__all__ = [
10+
"NA",
911
"StringDType",
1012
"StringScalar",
1113
"_memory_usage",

stringdtype/stringdtype/missing.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
class NAType:
    """Type of the ``NA`` sentinel that marks a missing string value.

    The class is a singleton: every construction (including the ones done
    by ``copy.copy`` / ``copy.deepcopy``) returns the same object, so code
    that recognises the sentinel by identity or default equality keeps
    working for values that have been copied or re-created.
    """

    # The one shared instance, created lazily on first construction.
    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __repr__(self):
        return "stringdtype.NA"


# Module-level sentinel exported as ``stringdtype.NA``.
NA = NAType()

stringdtype/stringdtype/src/casts.c

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -53,15 +53,13 @@ string_to_string(PyArrayMethod_Context *NPY_UNUSED(context),
5353
npy_intp out_stride = strides[1];
5454

5555
ss *s = NULL;
56+
ss *os = NULL;
5657

5758
while (N--) {
58-
// *out* may be reallocated later; *in->buf* may point to a statically
59-
// allocated empty ss struct, so we need to load the string into an
60-
// intermediate buffer *s* to avoid the possibility of freeing static
61-
// data later on.
62-
load_string(in, (ss **)&s);
63-
ssfree((ss *)out);
64-
if (ssdup((ss *)s, (ss *)out) < 0) {
59+
s = (ss *)in;
60+
os = (ss *)out;
61+
ssfree(os);
62+
if (ssdup(s, os) < 0) {
6563
gil_error(PyExc_MemoryError, "ssdup failed");
6664
return -1;
6765
}
@@ -338,9 +336,18 @@ string_to_unicode(PyArrayMethod_Context *context, char *const data[],
338336
ss *s = NULL;
339337

340338
while (N--) {
341-
load_string(in, &s);
342-
unsigned char *this_string = (unsigned char *)(s->buf);
343-
size_t n_bytes = s->len;
339+
s = (ss *)in;
340+
unsigned char *this_string = NULL;
341+
size_t n_bytes;
342+
if (ss_isnull(s)) {
343+
// lossy but not much else we can do
344+
this_string = (unsigned char *)"NA";
345+
n_bytes = 3;
346+
}
347+
else {
348+
this_string = (unsigned char *)(s->buf);
349+
n_bytes = s->len;
350+
}
344351
size_t tot_n_bytes = 0;
345352

346353
for (int i = 0; i < max_out_size; i++) {
@@ -401,7 +408,7 @@ string_to_bool_resolve_descriptors(PyObject *NPY_UNUSED(self),
401408
}
402409

403410
static int
404-
string_to_bool(PyArrayMethod_Context *context, char *const data[],
411+
string_to_bool(PyArrayMethod_Context *NPY_UNUSED(context), char *const data[],
405412
npy_intp const dimensions[], npy_intp const strides[],
406413
NpyAuxData *NPY_UNUSED(auxdata))
407414
{
@@ -415,8 +422,12 @@ string_to_bool(PyArrayMethod_Context *context, char *const data[],
415422
ss *s = NULL;
416423

417424
while (N--) {
418-
load_string(in, &s);
419-
if (s->len == 0) {
425+
s = (ss *)in;
426+
if (ss_isnull(s)) {
427+
// numpy treats NaN as truthy, following python
428+
*out = (npy_bool)1;
429+
}
430+
else if (s->len == 0) {
420431
*out = (npy_bool)0;
421432
}
422433
else {

stringdtype/stringdtype/src/dtype.c

Lines changed: 114 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
#include "static_string.h"
55

66
PyTypeObject *StringScalar_Type = NULL;
7+
static PyTypeObject *StringNA_Type = NULL;
8+
static PyObject *NA_OBJ = NULL;
79

810
/*
911
* Internal helper to create new instances
@@ -80,14 +82,35 @@ string_discover_descriptor_from_pyobject(PyArray_DTypeMeta *NPY_UNUSED(cls),
8082
static PyObject *
8183
get_value(PyObject *scalar)
8284
{
83-
PyObject *ret_bytes = NULL;
85+
PyObject *ret = NULL;
8486
PyTypeObject *scalar_type = Py_TYPE(scalar);
8587
// FIXME: handle bytes too
8688
if ((scalar_type == &PyUnicode_Type) ||
8789
(scalar_type == StringScalar_Type)) {
8890
// attempt to decode as UTF8
89-
ret_bytes = PyUnicode_AsUTF8String(scalar);
90-
if (ret_bytes == NULL) {
91+
ret = PyUnicode_AsUTF8String(scalar);
92+
if (ret == NULL) {
93+
PyErr_SetString(
94+
PyExc_TypeError,
95+
"Can only store UTF8 text in a StringDType array.");
96+
return NULL;
97+
}
98+
}
99+
else if (scalar_type == StringNA_Type) {
100+
ret = scalar;
101+
Py_INCREF(ret);
102+
}
103+
// store np.nan as NA
104+
else if (scalar_type == &PyFloat_Type) {
105+
double scalar_val = PyFloat_AsDouble(scalar);
106+
if ((scalar_val == -1.0) && PyErr_Occurred()) {
107+
return NULL;
108+
}
109+
if (npy_isnan(scalar_val)) {
110+
ret = NA_OBJ;
111+
Py_INCREF(ret);
112+
}
113+
else {
91114
PyErr_SetString(
92115
PyExc_TypeError,
93116
"Can only store UTF8 text in a StringDType array.");
@@ -99,7 +122,7 @@ get_value(PyObject *scalar)
99122
"Can only store String text in a StringDType array.");
100123
return NULL;
101124
}
102-
return ret_bytes;
125+
return ret;
103126
}
104127

105128
// Take a python object `obj` and insert it into the array of dtype `descr` at
@@ -109,58 +132,74 @@ stringdtype_setitem(StringDTypeObject *NPY_UNUSED(descr), PyObject *obj,
109132
char **dataptr)
110133
{
111134
PyObject *val_obj = get_value(obj);
135+
112136
if (val_obj == NULL) {
113137
return -1;
114138
}
115139

116-
char *val = NULL;
117-
Py_ssize_t length = 0;
118-
if (PyBytes_AsStringAndSize(val_obj, &val, &length) == -1) {
119-
return -1;
120-
}
140+
ss *sdata = (ss *)dataptr;
121141

122142
// free if dataptr holds preexisting string data,
123143
// ssfree does a NULL check
124-
ssfree((ss *)dataptr);
144+
ssfree(sdata);
125145

126-
// copies contents of val into item_val->buf
127-
int res = ssnewlen(val, length, (ss *)dataptr);
146+
// RichCompareBool short-circuits to a pointer comparison fast-path
147+
// so no need to do pointer comparison first
148+
int eq_res = PyObject_RichCompareBool(val_obj, NA_OBJ, Py_EQ);
128149

129-
// val_obj must stay alive until here to ensure *val* doesn't get
130-
// deallocated
131-
Py_DECREF(val_obj);
150+
if (eq_res < 0) {
151+
goto error;
152+
}
132153

133-
if (res == -1) {
134-
PyErr_NoMemory();
135-
return -1;
154+
if (eq_res == 1) {
155+
sdata = NULL;
136156
}
137-
else if (res == -2) {
138-
// this should never happen
139-
assert(0);
157+
else {
158+
char *val = NULL;
159+
Py_ssize_t length = 0;
160+
if (PyBytes_AsStringAndSize(val_obj, &val, &length) == -1) {
161+
goto error;
162+
}
163+
164+
// copies contents of val into item_val->buf
165+
int res = ssnewlen(val, length, sdata);
166+
167+
if (res == -1) {
168+
PyErr_NoMemory();
169+
goto error;
170+
}
171+
else if (res == -2) {
172+
// this should never happen
173+
assert(0);
174+
goto error;
175+
}
140176
}
141177

178+
Py_DECREF(val_obj);
142179
return 0;
180+
181+
error:
182+
Py_DECREF(val_obj);
183+
return -1;
143184
}
144185

145186
static PyObject *
146187
stringdtype_getitem(StringDTypeObject *NPY_UNUSED(descr), char **dataptr)
147188
{
148-
char *data;
149-
size_t len;
189+
PyObject *val_obj = NULL;
190+
ss *sdata = (ss *)dataptr;
150191

151-
if (*dataptr == NULL) {
152-
data = "\0";
153-
len = 0;
192+
if (ss_isnull(sdata)) {
193+
Py_INCREF(NA_OBJ);
194+
val_obj = NA_OBJ;
154195
}
155196
else {
156-
data = ((ss *)dataptr)->buf;
157-
len = ((ss *)dataptr)->len;
158-
}
159-
160-
PyObject *val_obj = PyUnicode_FromStringAndSize(data, len);
161-
162-
if (val_obj == NULL) {
163-
return NULL;
197+
char *data = sdata->buf;
198+
size_t len = sdata->len;
199+
val_obj = PyUnicode_FromStringAndSize(data, len);
200+
if (val_obj == NULL) {
201+
return NULL;
202+
}
164203
}
165204

166205
/*
@@ -190,10 +229,22 @@ nonzero(void *data, void *NPY_UNUSED(arr))
190229
int
191230
compare(void *a, void *b, void *NPY_UNUSED(arr))
192231
{
193-
ss *ss_a = NULL;
194-
ss *ss_b = NULL;
195-
load_string(a, &ss_a);
196-
load_string(b, &ss_b);
232+
ss *ss_a = (ss *)a;
233+
ss *ss_b = (ss *)b;
234+
int a_is_null = ss_isnull(ss_a);
235+
int b_is_null = ss_isnull(ss_b);
236+
if (a_is_null || b_is_null) {
237+
// numpy sorts NaNs to the end of the array
238+
// pandas sorts NAs to the end as well
239+
// so we follow that behavior here
240+
if (!b_is_null) {
241+
return 1;
242+
}
243+
else if (!a_is_null) {
244+
return -1;
245+
}
246+
return 0;
247+
}
197248
return strcmp(ss_a->buf, ss_b->buf);
198249
}
199250

@@ -265,6 +316,21 @@ stringdtype_get_clear_loop(void *NPY_UNUSED(traverse_context),
265316
return 0;
266317
}
267318

319+
static int
320+
stringdtype_fill_zero_value(PyArrayObject *arr)
321+
{
322+
char *buf = PyArray_DATA(arr);
323+
npy_intp sz = PyArray_SIZE(arr);
324+
int elsize = PyArray_DESCR(arr)->elsize;
325+
326+
for (npy_intp i = 0; i < sz; i++) {
327+
if (ssnewlen("", 0, (ss *)(buf + i * elsize)) < 0) {
328+
return -1;
329+
}
330+
}
331+
return 0;
332+
}
333+
268334
static PyType_Slot StringDType_Slots[] = {
269335
{NPY_DT_common_instance, &common_instance},
270336
{NPY_DT_common_dtype, &common_dtype},
@@ -278,6 +344,7 @@ static PyType_Slot StringDType_Slots[] = {
278344
{NPY_DT_PyArray_ArrFuncs_argmax, &argmax},
279345
{NPY_DT_PyArray_ArrFuncs_argmin, &argmin},
280346
{NPY_DT_get_clear_loop, &stringdtype_get_clear_loop},
347+
{NPY_DT_fill_zero_value, &stringdtype_fill_zero_value},
281348
{0, NULL}};
282349

283350
static PyObject *
@@ -441,3 +508,12 @@ init_string_dtype(void)
441508

442509
return 0;
443510
}
511+
512+
int
513+
init_string_na_object(PyObject *mod)
514+
{
515+
NA_OBJ = PyObject_GetAttrString(mod, "NA");
516+
StringNA_Type = Py_TYPE(NA_OBJ);
517+
Py_INCREF(StringNA_Type);
518+
return 0;
519+
}

stringdtype/stringdtype/src/dtype.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "numpy/arrayobject.h"
1313
#include "numpy/experimental_dtype_api.h"
1414
#include "numpy/ndarraytypes.h"
15+
#include "numpy/npy_math.h"
1516

1617
typedef struct {
1718
PyArray_Descr base;
@@ -29,6 +30,10 @@ init_string_dtype(void);
2930
int
3031
compare(void *, void *, void *);
3132

33+
int
34+
init_string_na_object(PyObject *mod);
35+
36+
3237
// from dtypemeta.h, not public in numpy
3338
#define NPY_DTYPE(descr) ((PyArray_DTypeMeta *)Py_TYPE(descr))
3439

stringdtype/stringdtype/src/main.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ PyInit__main(void)
9191
if (_import_array() < 0) {
9292
return NULL;
9393
}
94-
if (import_experimental_dtype_api(9) < 0) {
94+
if (import_experimental_dtype_api(10) < 0) {
9595
return NULL;
9696
}
9797

@@ -117,6 +117,10 @@ PyInit__main(void)
117117
goto error;
118118
}
119119

120+
if (init_string_na_object(mod) < 0) {
121+
goto error;
122+
}
123+
120124
Py_INCREF((PyObject *)&StringDType);
121125
if (PyModule_AddObject(m, "StringDType", (PyObject *)&StringDType) < 0) {
122126
Py_DECREF((PyObject *)&StringDType);

0 commit comments

Comments
 (0)