Skip to content

Commit 39be9fe

Browse files
committed
don't store terminating NULLs in the string storage
1 parent 255cfa5 commit 39be9fe

File tree

6 files changed

+94
-47
lines changed

6 files changed

+94
-47
lines changed

stringdtype/stringdtype/src/casts.c

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -235,9 +235,6 @@ unicode_to_string(PyArrayMethod_Context *context, char *const data[],
235235
// reset out_buf to the beginning of the string
236236
out_buf -= out_num_bytes;
237237

238-
// pad string with null character
239-
out_buf[out_num_bytes] = '\0';
240-
241238
in += in_stride;
242239
out += out_stride;
243240
}
@@ -287,8 +284,12 @@ string_to_unicode_resolve_descriptors(PyObject *NPY_UNUSED(self),
287284
// bytes. Does not do any validation or error checking: assumes *c* is valid
288285
// utf-8
289286
size_t
290-
utf8_char_to_ucs4_code(unsigned char *c, Py_UCS4 *code)
287+
utf8_char_to_ucs4_code(unsigned char *c, size_t len, Py_UCS4 *code)
291288
{
289+
if (len == 0) {
290+
*code = (Py_UCS4)0;
291+
return 0;
292+
}
292293
if (c[0] <= 0x7F) {
293294
// 0zzzzzzz -> 0zzzzzzz
294295
*code = (Py_UCS4)(c[0]);
@@ -402,7 +403,8 @@ string_to_unicode(PyArrayMethod_Context *context, char *const data[],
402403

403404
// get code point for character this_string is currently pointing
404405
// to
405-
size_t num_bytes = utf8_char_to_ucs4_code(this_string, &code);
406+
size_t num_bytes =
407+
utf8_char_to_ucs4_code(this_string, n_bytes, &code);
406408

407409
// move to next character
408410
this_string += num_bytes;
@@ -590,8 +592,14 @@ string_to_pylong(char *in, int hasnull)
590592
}
591593
s = &EMPTY_STRING;
592594
}
595+
PyObject *val_obj = PyUnicode_FromStringAndSize(s->buf, s->len);
596+
if (val_obj == NULL) {
597+
return NULL;
598+
}
593599
// interpret as an integer in base 10
594-
return PyLong_FromString(s->buf, NULL, 10);
600+
PyObject *pylong_value = PyLong_FromUnicodeObject(val_obj, 10);
601+
Py_DECREF(val_obj);
602+
return pylong_value;
595603
}
596604

597605
static npy_longlong

stringdtype/stringdtype/src/casts.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,4 @@
1414
PyArrayMethod_Spec **
1515
get_casts();
1616

17-
size_t
18-
utf8_char_to_ucs4_code(unsigned char *, Py_UCS4 *);
19-
2017
#endif /* _NPY_CASTS_H */

stringdtype/stringdtype/src/dtype.c

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,21 +25,7 @@ new_stringdtype_instance(PyObject *na_object, int coerce)
2525
int has_string_na = 0;
2626
ss default_string = EMPTY_STRING;
2727
if (hasnull) {
28-
double na_float = PyFloat_AsDouble(na_object);
29-
if (na_float == -1.0 && PyErr_Occurred()) {
30-
// not a float, still treat as nan if PyObject_IsTrue raises
31-
// (e.g. pandas.NA)
32-
PyErr_Clear();
33-
int is_truthy = PyObject_IsTrue(na_object);
34-
if (is_truthy == -1) {
35-
PyErr_Clear();
36-
has_nan_na = 1;
37-
}
38-
}
39-
else if (npy_isnan(na_float)) {
40-
has_nan_na = 1;
41-
}
42-
28+
// first check for a string
4329
if (PyUnicode_Check(na_object)) {
4430
has_string_na = 1;
4531
Py_ssize_t size = 0;
@@ -48,6 +34,25 @@ new_stringdtype_instance(PyObject *na_object, int coerce)
4834
// discards const, how to avoid?
4935
default_string.buf = (char *)buf;
5036
}
37+
else {
38+
// treat as nan-like if != comparison returns an object whose truth
39+
// value raises an error (pd.NA) or a truthy value (e.g. a
40+
// NaN-like object)
41+
PyObject *eq = PyObject_RichCompare(na_object, na_object, Py_NE);
42+
if (eq == NULL) {
43+
Py_DECREF(new);
44+
return NULL;
45+
}
46+
int is_truthy = PyObject_IsTrue(na_object);
47+
if (is_truthy == -1) {
48+
PyErr_Clear();
49+
has_nan_na = 1;
50+
}
51+
else if (is_truthy == 1) {
52+
has_nan_na = 1;
53+
}
54+
Py_DECREF(eq);
55+
}
5156
}
5257
((StringDTypeObject *)new)->has_nan_na = has_nan_na;
5358
((StringDTypeObject *)new)->has_string_na = has_string_na;
@@ -60,6 +65,9 @@ new_stringdtype_instance(PyObject *na_object, int coerce)
6065
base->flags |= NPY_NEEDS_INIT;
6166
base->flags |= NPY_LIST_PICKLE;
6267
base->flags |= NPY_ITEM_REFCOUNT;
68+
// this is only because of error propagation in sorting, once this dtype
69+
// lives inside numpy we can relax this and patch the sorting code
70+
// directly.
6371
if (hasnull && !(has_string_na && has_nan_na)) {
6472
base->flags |= NPY_NEEDS_PYAPI;
6573
}
@@ -302,7 +310,7 @@ _compare(void *a, void *b, StringDTypeObject *descr)
302310
}
303311
}
304312
}
305-
return strcmp(ss_a->buf, ss_b->buf);
313+
return sscmp(ss_a, ss_b);
306314
}
307315

308316
// PyArray_ArgFunc

stringdtype/stringdtype/src/static_string.c

Lines changed: 47 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#include "static_string.h"
22

3-
const ss EMPTY_STRING = {0, ""};
3+
// defined this way so it has an in-memory representation that is distinct
4+
// from NULL, allowing us to use NULL to represent a sentinel value
5+
const ss EMPTY_STRING = {0, "\0"};
46

57
int
68
ssnewlen(const char *init, size_t len, ss *to_init)
@@ -10,24 +12,19 @@ ssnewlen(const char *init, size_t len, ss *to_init)
1012
}
1113

1214
if (len == 0) {
13-
to_init->len = 0;
14-
to_init->buf = EMPTY_STRING.buf;
15+
*to_init = EMPTY_STRING;
16+
return 0;
1517
}
1618

17-
// one extra byte for null terminator
18-
char *ret_buf = (char *)malloc(sizeof(char) * (len + 1));
19+
char *ret_buf = (char *)malloc(sizeof(char) * len);
1920

2021
if (ret_buf == NULL) {
2122
return -1;
2223
}
2324

2425
to_init->len = len;
2526

26-
if (len > 0) {
27-
memcpy(ret_buf, init, len);
28-
}
29-
30-
ret_buf[len] = '\0';
27+
memcpy(ret_buf, init, len);
3128

3229
to_init->buf = ret_buf;
3330

@@ -37,10 +34,8 @@ ssnewlen(const char *init, size_t len, ss *to_init)
3734
void
3835
ssfree(ss *str)
3936
{
40-
if (str->buf != NULL) {
41-
if (str->buf != EMPTY_STRING.buf) {
42-
free(str->buf);
43-
}
37+
if (str->buf != NULL && str->buf != EMPTY_STRING.buf) {
38+
free(str->buf);
4439
str->buf = NULL;
4540
}
4641
str->len = 0;
@@ -66,18 +61,44 @@ ssnewemptylen(size_t num_bytes, ss *out)
6661
return -2;
6762
}
6863

69-
char *buf = (char *)malloc(sizeof(char) * (num_bytes + 1));
64+
out->len = num_bytes;
65+
66+
if (num_bytes == 0) {
67+
*out = EMPTY_STRING;
68+
return 0;
69+
}
70+
71+
char *buf = (char *)malloc(sizeof(char) * num_bytes);
7072

7173
if (buf == NULL) {
7274
return -1;
7375
}
7476

7577
out->buf = buf;
76-
out->len = num_bytes;
7778

7879
return 0;
7980
}
8081

82+
// same semantics as strcmp
83+
int
84+
sscmp(const ss *s1, const ss *s2)
85+
{
86+
size_t minlen = s1->len < s2->len ? s1->len : s2->len;
87+
88+
int cmp = strncmp(s1->buf, s2->buf, minlen);
89+
90+
if (cmp == 0) {
91+
if (s1->len > minlen) {
92+
return 1;
93+
}
94+
if (s2->len > minlen) {
95+
return -1;
96+
}
97+
}
98+
99+
return cmp;
100+
}
101+
81102
int
82103
ss_isnull(const ss *in)
83104
{
@@ -86,3 +107,13 @@ ss_isnull(const ss *in)
86107
}
87108
return 0;
88109
}
110+
111+
const char *
112+
ss_data(const ss *in, const char *default_str)
113+
{
114+
if (ss_isnull(in)) {
115+
return default_str;
116+
}
117+
118+
return in->buf;
119+
}

stringdtype/stringdtype/src/static_string.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,9 @@ ssnewemptylen(size_t num_bytes, ss *out);
4141
int
4242
ss_isnull(const ss *in);
4343

44+
// Compare two strings. Has the same semantics as strcmp passed null-terminated
45+
// C strings with the content of *s1* and *s2*.
46+
int
47+
sscmp(const ss *s1, const ss *s2);
48+
4449
#endif /*_NPY_STATIC_STRING_H */

stringdtype/stringdtype/src/umath.c

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@ multiply_resolve_descriptors(
7676
for (size_t i = 0; i < (size_t)factor; i++) { \
7777
memcpy(os->buf + i * is->len, is->buf, is->len); \
7878
} \
79-
os->buf[newlen] = '\0'; \
8079
\
8180
sin += s_stride; \
8281
iin += i_stride; \
@@ -246,7 +245,6 @@ add_strided_loop(PyArrayMethod_Context *context, char *const data[],
246245

247246
memcpy(os->buf, s1->buf, s1->len);
248247
memcpy(os->buf + s1->len, s2->buf, s2->len);
249-
os->buf[newlen] = '\0';
250248

251249
next_step:
252250
in1 += in1_stride;
@@ -508,7 +506,7 @@ string_greater_strided_loop(PyArrayMethod_Context *context, char *const data[],
508506
}
509507
}
510508
}
511-
if (strcmp(s1->buf, s2->buf) > 0) {
509+
if (sscmp(s1, s2) > 0) {
512510
*out = (npy_bool)1;
513511
}
514512
else {
@@ -571,7 +569,7 @@ string_greater_equal_strided_loop(PyArrayMethod_Context *context,
571569
}
572570
}
573571
}
574-
if (strcmp(s1->buf, s2->buf) >= 0) {
572+
if (sscmp(s1, s2) >= 0) {
575573
*out = (npy_bool)1;
576574
}
577575
else {
@@ -632,7 +630,7 @@ string_less_strided_loop(PyArrayMethod_Context *context, char *const data[],
632630
}
633631
}
634632
}
635-
if (strcmp(s1->buf, s2->buf) < 0) {
633+
if (sscmp(s1, s2) < 0) {
636634
*out = (npy_bool)1;
637635
}
638636
else {
@@ -694,7 +692,7 @@ string_less_equal_strided_loop(PyArrayMethod_Context *context,
694692
}
695693
}
696694
}
697-
if (strcmp(s1->buf, s2->buf) <= 0) {
695+
if (sscmp(s1, s2) <= 0) {
698696
*out = (npy_bool)1;
699697
}
700698
else {

0 commit comments

Comments
 (0)