Skip to content

Commit 7499253

Browse files
committed
store string data in a struct along with length
1 parent f007c97 commit 7499253

File tree

6 files changed

+104
-25
lines changed

6 files changed

+104
-25
lines changed

stringdtype/meson.build

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ srcs = [
2626
'stringdtype/src/casts.h',
2727
'stringdtype/src/dtype.c',
2828
'stringdtype/src/main.c',
29+
'stringdtype/src/static_string.c',
30+
'stringdtype/src/static_string.h',
2931
'stringdtype/src/umath.c',
3032
'stringdtype/src/umath.h',
3133
]

stringdtype/stringdtype/src/casts.c

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "casts.h"
22

33
#include "dtype.h"
4+
#include "static_string.h"
45

56
static NPY_CASTING
67
string_to_string_resolve_descriptors(PyObject *NPY_UNUSED(self),
@@ -35,17 +36,15 @@ string_to_string(PyArrayMethod_Context *context, char *const data[],
3536
NpyAuxData *NPY_UNUSED(auxdata))
3637
{
3738
npy_intp N = dimensions[0];
38-
char **in = (char **)data[0];
39-
char **out = (char **)data[1];
39+
ss **in = (ss **)data[0];
40+
ss **out = (ss **)data[1];
4041
// strides are in bytes but pointer offsets are in pointer widths, so
4142
// divide by the element size (one pointer width) to get the pointer offset
4243
npy_intp in_stride = strides[0] / context->descriptors[0]->elsize;
4344
npy_intp out_stride = strides[1] / context->descriptors[1]->elsize;
4445

4546
while (N--) {
46-
size_t length = strlen(*in);
47-
out[0] = (char *)malloc((sizeof(char) * length) + 1);
48-
strncpy(*out, *in, length + 1);
47+
out[0] = ssdup(in[0]);
4948
in += in_stride;
5049
out += out_stride;
5150
}
@@ -189,7 +188,7 @@ unicode_to_string(PyArrayMethod_Context *context, char *const data[],
189188

190189
npy_intp N = dimensions[0];
191190
Py_UCS4 *in = (Py_UCS4 *)data[0];
192-
char **out = (char **)data[1];
191+
ss **out = (ss **)data[1];
193192

194193
// 4 bytes per UCS4 character
195194
npy_intp in_stride = strides[0] / 4;
@@ -210,8 +209,8 @@ unicode_to_string(PyArrayMethod_Context *context, char *const data[],
210209
PyGILState_Release(gstate);
211210
return -1;
212211
}
213-
// one extra byte for null terminator
214-
char *out_buf = malloc((out_num_bytes + 1) * sizeof(char));
212+
ss *out_ss = ssnewempty(out_num_bytes);
213+
char *out_buf = out_ss->buf;
215214
for (int i = 0; i < num_codepoints; i++) {
216215
// get code point
217216
Py_UCS4 code = in[i];
@@ -237,7 +236,7 @@ unicode_to_string(PyArrayMethod_Context *context, char *const data[],
237236
out_buf[out_num_bytes] = '\0';
238237

239238
// set out to the address of the beginning of the string
240-
out[0] = out_buf;
239+
out[0] = out_ss;
241240

242241
in += in_stride;
243242
out += out_stride;
@@ -318,7 +317,7 @@ string_to_unicode(PyArrayMethod_Context *context, char *const data[],
318317
NpyAuxData *NPY_UNUSED(auxdata))
319318
{
320319
npy_intp N = dimensions[0];
321-
char **in = (char **)data[0];
320+
ss **in = (ss **)data[0];
322321
Py_UCS4 *out = (Py_UCS4 *)data[1];
323322
// strides are in bytes but pointer offsets are in pointer widths, so
324323
// divide by the element size (one pointer width) to get the pointer offset
@@ -329,7 +328,9 @@ string_to_unicode(PyArrayMethod_Context *context, char *const data[],
329328
long max_out_size = (context->descriptors[1]->elsize) / 4;
330329

331330
while (N--) {
332-
unsigned char *this_string = (unsigned char *)*in;
331+
unsigned char *this_string = (unsigned char *)((*in)->buf);
332+
size_t n_bytes = (*in)->len;
333+
size_t tot_n_bytes = 0;
333334

334335
for (int i = 0; i < max_out_size; i++) {
335336
Py_UCS4 code;
@@ -340,16 +341,13 @@ string_to_unicode(PyArrayMethod_Context *context, char *const data[],
340341

341342
// move to next character
342343
this_string += num_bytes;
344+
tot_n_bytes += num_bytes;
343345

344346
// set output codepoint
345347
out[i] = code;
346348

347-
// check if this is the null terminator
348-
if (code == 0) {
349-
// fill all remaining characters (if any) with zero
350-
for (int j = i + 1; j < max_out_size; j++) {
351-
out[j] = 0;
352-
}
349+
// stop if we've exhausted the input string
350+
if (tot_n_bytes >= n_bytes) {
353351
break;
354352
}
355353
}

stringdtype/stringdtype/src/dtype.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "dtype.h"
22

33
#include "casts.h"
4+
#include "static_string.h"
45

56
PyTypeObject *StringScalar_Type = NULL;
67

@@ -15,8 +16,8 @@ new_stringdtype_instance(void)
1516
if (new == NULL) {
1617
return NULL;
1718
}
18-
new->base.elsize = sizeof(char *);
19-
new->base.alignment = _Alignof(char *);
19+
new->base.elsize = sizeof(ss *);
20+
new->base.alignment = _Alignof(ss *);
2021

2122
return new;
2223
}
@@ -113,16 +114,15 @@ stringdtype_setitem(StringDTypeObject *descr, PyObject *obj, char **dataptr)
113114
return -1;
114115
}
115116

116-
*dataptr = malloc(sizeof(char) * length + 1);
117-
strncpy(*dataptr, val, length + 1);
117+
*dataptr = (char *)ssnewlen(val, length);
118118
Py_DECREF(val_obj);
119119
return 0;
120120
}
121121

122122
static PyObject *
123123
stringdtype_getitem(StringDTypeObject *descr, char **dataptr)
124124
{
125-
PyObject *val_obj = PyUnicode_FromString(*dataptr);
125+
PyObject *val_obj = PyUnicode_FromString(((ss *)*dataptr)->buf);
126126

127127
if (val_obj == NULL) {
128128
return NULL;
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#include "static_string.h"
2+
3+
// allocates a new ss string of length len, filling with the contents of init
4+
ss *
5+
ssnewlen(const char *init, size_t len)
6+
{
7+
// one extra byte for null terminator
8+
ss *ret = (ss *)malloc(sizeof(ss) + sizeof(char) * (len + 1));
9+
10+
if (ret == NULL) {
11+
return NULL;
12+
}
13+
14+
ret->len = len;
15+
16+
if (len > 0) {
17+
memcpy(ret->buf, init, len);
18+
}
19+
20+
ret->buf[len] = '\0';
21+
22+
return ret;
23+
}
24+
25+
// returns a new heap-allocated copy of input string *s*
26+
ss *
27+
ssdup(const ss *s)
28+
{
29+
return ssnewlen(s->buf, s->len);
30+
}
31+
32+
// returns a new, empty string of length len
33+
// does not do any initialization, the caller must
34+
// initialize and null-terminate the string
35+
ss *
36+
ssnewempty(size_t len)
37+
{
38+
ss *ret = (ss *)malloc(sizeof(ss) + sizeof(char) * (len + 1));
39+
ret->len = len;
40+
return ret;
41+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// Length-prefixed string type and its allocation helpers.
//
// The guard name has no leading underscore: identifiers that begin with an
// underscore followed by an uppercase letter are reserved for the
// implementation.
#ifndef NPY_STATIC_STRING_H
#define NPY_STATIC_STRING_H

// standard headers (size_t, malloc, memcpy) use the angle-bracket form
#include <stdlib.h>
#include <string.h>

// A string stored in a single allocation: the length followed directly by
// the character data.
typedef struct ss {
    // number of bytes in buf, not counting the null terminator
    size_t len;
    // flexible array member holding the string data; the allocation helpers
    // below reserve len + 1 bytes so there is room for a null terminator
    char buf[];
} ss;

// allocates a new ss string of length len, filling with the contents of init
// returns NULL if the allocation fails
ss *
ssnewlen(const char *init, size_t len);

// returns a new heap-allocated copy of input string *s*
ss *
ssdup(const ss *s);

// returns a new string with room for len bytes plus a null terminator
// does not do any initialization of the buffer, the caller must
// initialize and null-terminate the string
ss *
ssnewempty(size_t len);

#endif /* NPY_STATIC_STRING_H */

stringdtype/stringdtype/src/umath.c

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "numpy/ufuncobject.h"
1010

1111
#include "dtype.h"
12+
#include "static_string.h"
1213
#include "string.h"
1314
#include "umath.h"
1415

@@ -19,8 +20,8 @@ string_equal_strided_loop(PyArrayMethod_Context *context, char *const data[],
1920
NpyAuxData *NPY_UNUSED(auxdata))
2021
{
2122
npy_intp N = dimensions[0];
22-
char **in1 = (char **)data[0];
23-
char **in2 = (char **)data[1];
23+
ss **in1 = (ss **)data[0];
24+
ss **in2 = (ss **)data[1];
2425
npy_bool *out = (npy_bool *)data[2];
2526
// strides are in bytes but pointer offsets are in pointer widths, so
2627
// divide by the element size (one pointer width) to get the pointer offset
@@ -29,7 +30,18 @@ string_equal_strided_loop(PyArrayMethod_Context *context, char *const data[],
2930
npy_intp out_stride = strides[2];
3031

3132
while (N--) {
32-
if (strcmp(*in1, *in2) == 0) {
33+
size_t len1 = (*in1)->len;
34+
size_t len2 = (*in2)->len;
35+
size_t maxlen;
36+
37+
if (len1 > len2) {
38+
maxlen = len1;
39+
}
40+
else {
41+
maxlen = len2;
42+
}
43+
44+
if (strncmp((*in1)->buf, (*in2)->buf, maxlen) == 0) {
3345
*out = (npy_bool)1;
3446
}
3547
else {

0 commit comments

Comments
 (0)