Skip to content

Commit e663f0f

Browse files
authored
Merge pull request #93 from ngoldbaum/slim-memory-usage
Add a medium length string mode
2 parents 83e43f4 + 77f8c3c commit e663f0f

File tree

3 files changed

+63
-21
lines changed

3 files changed

+63
-21
lines changed

stringdtype/stringdtype/src/static_string.c

Lines changed: 51 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,14 @@ typedef union _npy_static_string_u {
4444
#define NPY_STRING_SHORT 0x40 // 0100 0000
4545
#define NPY_STRING_ARENA_FREED 0x20 // 0010 0000
4646
#define NPY_STRING_ON_HEAP 0x10 // 0001 0000
47+
#define NPY_STRING_MEDIUM 0x08 // 0000 1000
48+
#define NPY_STRING_FLAG_MASK 0xF8 // 1111 1000
4749

4850
// short string sizes fit in a 4-bit integer
4951
#define NPY_SHORT_STRING_SIZE_MASK 0x0F // 0000 1111
5052
#define NPY_SHORT_STRING_MAX_SIZE \
51-
(sizeof(npy_static_string) - 1) // 15 or 7 depending on arch
53+
(sizeof(npy_static_string) - 1) // 15 or 7 depending on arch
54+
#define NPY_MEDIUM_STRING_MAX_SIZE 0xFF // 255
5255

5356
// Since this has no flags set, technically this is a heap-allocated string
5457
// with size zero. Practically, that doesn't matter because we always do size
@@ -86,8 +89,7 @@ struct npy_string_allocator {
8689
void
8790
set_vstring_size(_npy_static_string_u *str, size_t size)
8891
{
89-
unsigned char *flags = &str->direct_buffer.flags_and_size;
90-
unsigned char current_flags = *flags & ~NPY_SHORT_STRING_SIZE_MASK;
92+
unsigned char current_flags = str->direct_buffer.flags_and_size;
9193
str->vstring.size = size;
9294
str->direct_buffer.flags_and_size = current_flags;
9395
}
@@ -110,9 +112,13 @@ npy_string_arena_malloc(npy_string_arena *arena, npy_string_realloc_func r,
110112
size_t size)
111113
{
112114
// extra header bytes store the allocation size: 1 byte if medium, else a size_t
113-
size_t string_storage_size = size + sizeof(size_t);
114-
// expand size to nearest multiple of 8 bytes to ensure 64 bit alignment
115-
string_storage_size += (8 - string_storage_size % 8);
115+
size_t string_storage_size;
116+
if (size <= NPY_MEDIUM_STRING_MAX_SIZE) {
117+
string_storage_size = size + sizeof(unsigned char);
118+
}
119+
else {
120+
string_storage_size = size + sizeof(size_t);
121+
}
116122
if ((arena->size - arena->cursor) <= string_storage_size) {
117123
// realloc the buffer so there is enough room
118124
// first guess is to double the size of the buffer
@@ -130,7 +136,7 @@ npy_string_arena_malloc(npy_string_arena *arena, npy_string_realloc_func r,
130136
// doubling the current size isn't enough
131137
newsize = 2 * (arena->cursor + size);
132138
}
133-
// realloc passed a NULL pointer acts like malloc
139+
// passing a NULL buffer to realloc is the same as malloc
134140
char *newbuf = r(arena->buffer, newsize);
135141
if (newbuf == NULL) {
136142
return NULL;
@@ -139,9 +145,18 @@ npy_string_arena_malloc(npy_string_arena *arena, npy_string_realloc_func r,
139145
arena->buffer = newbuf;
140146
arena->size = newsize;
141147
}
142-
size_t *size_loc = (size_t *)&arena->buffer[arena->cursor];
143-
*size_loc = size;
144-
char *ret = &arena->buffer[arena->cursor + sizeof(size_t)];
148+
char *ret;
149+
if (size <= NPY_MEDIUM_STRING_MAX_SIZE) {
150+
unsigned char *size_loc =
151+
(unsigned char *)&arena->buffer[arena->cursor];
152+
*size_loc = size;
153+
ret = &arena->buffer[arena->cursor + sizeof(char)];
154+
}
155+
else {
156+
char *size_ptr = (char *)&arena->buffer[arena->cursor];
157+
memcpy(size_ptr, &size, sizeof(size_t));
158+
ret = &arena->buffer[arena->cursor + sizeof(size_t)];
159+
}
145160
arena->cursor += string_storage_size;
146161
return ret;
147162
}
@@ -207,6 +222,15 @@ is_short_string(const npy_packed_static_string *s)
207222
return has_short_flag && !has_on_heap_flag;
208223
}
209224

225+
int
226+
is_medium_string(const _npy_static_string_u *s)
227+
{
228+
unsigned char high_byte = s->direct_buffer.flags_and_size;
229+
int has_short_flag = (high_byte & NPY_STRING_SHORT);
230+
int has_medium_flag = (high_byte & NPY_STRING_MEDIUM);
231+
return (!has_short_flag && has_medium_flag);
232+
}
233+
210234
int
211235
npy_string_isnull(const npy_packed_static_string *s)
212236
{
@@ -286,10 +310,19 @@ heap_or_arena_allocate(npy_string_allocator *allocator,
286310
if (buf == NULL) {
287311
return NULL;
288312
}
289-
size_t alloc_size = *((size_t *)(buf - 1));
313+
size_t alloc_size;
314+
if (is_medium_string(to_init_u)) {
315+
// stored in a char so direct access is OK
316+
alloc_size = (size_t) * (buf - 1);
317+
}
318+
else {
319+
// not necessarily memory-aligned, so need to use memcpy
320+
size_t *size_loc = (size_t *)((uintptr_t)buf - sizeof(size_t));
321+
memcpy(&alloc_size, size_loc, sizeof(size_t));
322+
}
290323
if (size <= alloc_size) {
291324
// we have room!
292-
*flags = NPY_STRING_ARENA_FREED;
325+
*flags &= ~NPY_STRING_ARENA_FREED;
293326
return buf;
294327
}
295328
else {
@@ -316,8 +349,12 @@ heap_or_arena_allocate(npy_string_allocator *allocator,
316349
if (arena == NULL) {
317350
return NULL;
318351
}
319-
return npy_string_arena_malloc(arena, allocator->realloc,
320-
sizeof(char) * size);
352+
char *ret = npy_string_arena_malloc(arena, allocator->realloc,
353+
sizeof(char) * size);
354+
if (size < NPY_MEDIUM_STRING_MAX_SIZE) {
355+
*flags |= NPY_STRING_MEDIUM;
356+
}
357+
return ret;
321358
}
322359

323360
int

stringdtype/tests/test_char.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,12 @@
44

55
from stringdtype import StringDType
66

7-
TEST_DATA = ["hello", "Ae¢☃€ 😊", "entry\nwith\nnewlines", "entry\twith\ttabs"]
7+
TEST_DATA = [
8+
"hello" * 10,
9+
"Ae¢☃€ 😊" * 100,
10+
"entry\nwith\nnewlines",
11+
"entry\twith\ttabs",
12+
]
813

914

1015
@pytest.fixture
@@ -94,11 +99,11 @@ def test_binary(string_array, unicode_array, function_name, args):
9499

95100

96101
def test_strip(string_array, unicode_array):
97-
rjs = np.char.rjust(string_array, 25)
98-
rju = np.char.rjust(unicode_array, 25)
102+
rjs = np.char.rjust(string_array, 1000)
103+
rju = np.char.rjust(unicode_array, 1000)
99104

100-
ljs = np.char.ljust(string_array, 25)
101-
lju = np.char.ljust(unicode_array, 25)
105+
ljs = np.char.ljust(string_array, 1000)
106+
lju = np.char.ljust(unicode_array, 1000)
102107

103108
assert_array_equal(
104109
np.char.lstrip(rjs),

stringdtype/tests/test_stringdtype.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
@pytest.fixture
1919
def string_list():
20-
return ["abc", "def", "ghi" * 10, "A¢☃€ 😊", "Abc", "DEF"]
20+
return ["abc", "def", "ghi" * 10, "A¢☃€ 😊" * 100, "Abc" * 1000, "DEF"]
2121

2222

2323
pd_param = pytest.param(
@@ -121,7 +121,7 @@ def test_array_creation_utf8(dtype, data):
121121
def test_array_creation_scalars(string_list):
122122
arr = np.array([StringScalar(s) for s in string_list])
123123
assert (
124-
str(arr)
124+
str(arr).replace("\n", "")
125125
== "[" + " ".join(["'" + str(s) + "'" for s in string_list]) + "]"
126126
)
127127
assert arr.dtype == StringDType()

0 commit comments

Comments
 (0)