Commit 29087f0

treat free blocks of each chunk as separate list
* they're still allocated together, but start/end of each chunk is tracked, and allocate/free iterate over sub-ranges
* exhaust freed blocks of all chunks before considering their last blocks with unallocated space
* start with 0 chunks/blocks and create chunks as needed
* allow the last chunk to grow beyond max size
1 parent 57381c5 commit 29087f0
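
For orientation before the diff: below is a minimal, standalone sketch (not the real allocator) of the indexing scheme this commit introduces. All free blocks of all chunks stay in a single array, sorted by chunk and offset; free_blocks_begin[c] is the index of chunk c's first free block and free_blocks_begin[c + 1] is one past its last, so free_blocks_begin[n_chunks] is the total number of free blocks. The struct, the names (tallocr_sketch, chunk_range) and the demo values are illustrative only.

// Sketch of per-chunk free-block sub-ranges stored in one shared array (illustrative, not ggml code).
#include <stdio.h>

#define MAX_CHUNKS      16
#define MAX_FREE_BLOCKS 256

struct block { int chunk; size_t offset; size_t size; };

struct tallocr_sketch {
    int n_chunks;
    int free_blocks_begin[MAX_CHUNKS + 1];     // end of chunk c == begin of chunk c+1
    struct block free_blocks[MAX_FREE_BLOCKS]; // all chunks' free blocks, sorted by (chunk, offset)
};

// return the [begin, end) indices of the free blocks belonging to one chunk
static void chunk_range(const struct tallocr_sketch * a, int chunk, int * begin, int * end) {
    *begin = a->free_blocks_begin[chunk];
    *end   = a->free_blocks_begin[chunk + 1];
}

int main(void) {
    struct tallocr_sketch a = {
        2,                                     // n_chunks
        {0, 2, 3},                             // chunk 0 owns blocks [0,2), chunk 1 owns [2,3)
        {{0, 0, 8}, {0, 24, 8}, {1, 16, 16}},  // {chunk, offset, size}
    };
    for (int c = 0; c < a.n_chunks; c++) {
        int begin, end;
        chunk_range(&a, c, &begin, &end);
        printf("chunk %d has %d free block(s)\n", c, end - begin);
    }
    printf("total free blocks: %d\n", a.free_blocks_begin[a.n_chunks]);
    return 0;
}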

File tree

2 files changed: +160 -79 lines changed

ggml/src/ggml-alloc.c

Lines changed: 107 additions & 75 deletions
@@ -116,8 +116,8 @@ struct free_block {
 
 struct ggml_dyn_tallocr {
     size_t alignment;
-    int n_free_blocks;
     int n_chunks;
+    int free_blocks_begin[GGML_VBUFFER_MAX_CHUNKS + 1]; // end[chunk] == begin[chunk+1]
     struct free_block free_blocks[MAX_FREE_BLOCKS];
     size_t max_size[GGML_VBUFFER_MAX_CHUNKS];
     size_t max_chunk_size;
@@ -130,14 +130,31 @@ struct ggml_dyn_tallocr {
 #endif
 };
 
-// allocations are split into n chunks of size max_size[i]. tensor allocations may not cross chunk boundaries.
-static void ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, struct free_block * block, size_t min_size) {
-    GGML_ASSERT(alloc->n_chunks >= 1);
-    block->addr.chunk = alloc->n_chunks;
-    block->addr.offset = 0;
-    block->size = MAX(min_size, alloc->max_chunk_size);
-    alloc->n_chunks++;
-    GGML_ASSERT(alloc->n_chunks <= GGML_VBUFFER_MAX_CHUNKS);
+struct free_block_range {
+    int begin;
+    int end;
+    int size;
+};
+
+static struct free_block_range ggml_dyn_tallocr_free_block_range(const struct ggml_dyn_tallocr * alloc, int chunk) {
+    struct free_block_range range;
+    range.begin = alloc->free_blocks_begin[chunk];
+    range.end = alloc->free_blocks_begin[chunk + 1];
+    range.size = range.end - range.begin;
+    return range;
+}
+
+void ggml_dyn_tallocr_remove_block(struct ggml_dyn_tallocr * alloc, int idx) {
+    int chunk = alloc->free_blocks[idx].addr.chunk;
+    // shift all elements after idx by 1 to the left, overwriting the element at idx
+    int n_free_blocks = alloc->free_blocks_begin[alloc->n_chunks];
+    for (int i = idx; i < n_free_blocks; i++) {
+        alloc->free_blocks[i] = alloc->free_blocks[i + 1];
+    }
+    // adjust first element index of all chunks after the current one
+    for (int c = chunk + 1; c < alloc->n_chunks + 1; c++) {
+        alloc->free_blocks_begin[c]--;
+    }
 }
 
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -167,31 +184,62 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
 
+    int best_fit_block = -1;
     size_t max_avail = 0;
 
     // find the best fitting free block besides the last block
-    int best_fit_block = -1;
-    size_t best_fit_size = SIZE_MAX;
-    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
-        struct free_block * block = &alloc->free_blocks[i];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size >= size && block->size <= best_fit_size) {
-            best_fit_block = i;
-            best_fit_size = block->size;
+    for (int c = 0; c < alloc->n_chunks; ++c) {
+        struct free_block_range blocks = ggml_dyn_tallocr_free_block_range(alloc, c);
+        size_t best_fit_size = SIZE_MAX;
+        for (int i = blocks.begin; i < blocks.end - 1; i++) {
+            struct free_block * block = &alloc->free_blocks[i];
+            max_avail = MAX(max_avail, block->size);
+            if (block->size >= size && block->size <= best_fit_size) {
+                best_fit_block = i;
+                best_fit_size = block->size;
+            }
+        }
+    }
+
+    if (best_fit_block == -1) {
+        // no suitable block found, try the last block (ie. growing a chunks size)
+        for (int c = 0; c < alloc->n_chunks; ++c) {
+            struct free_block_range blocks = ggml_dyn_tallocr_free_block_range(alloc, c);
+            if (blocks.size > 0) {
+                struct free_block * block = &alloc->free_blocks[blocks.end - 1];
+                max_avail = MAX(max_avail, block->size);
+                if (block->size >= size) {
+                    best_fit_block = blocks.end - 1;
+                    break;
+                }
+            }
         }
     }
 
     if (best_fit_block == -1) {
-        // the last block represents memory still available in an existing chunk
-        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size < size) {
-            // not enough space in existing chunk, start the next one
-            GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
-            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[alloc->n_free_blocks], size);
-            alloc->n_free_blocks++;
+        // none of the existing chunks have enough space left
+        if (alloc->n_chunks < GGML_VBUFFER_MAX_CHUNKS) {
+            // add a new chunk by creating a block of unclaimed space after the last chunk
+            int i = alloc->free_blocks_begin[alloc->n_chunks];
+            alloc->free_blocks[i].addr.chunk = alloc->n_chunks;
+            alloc->free_blocks[i].addr.offset = 0;
+            // available space in a chunk is limited to max_chunk_size, but can be higher if:
+            // 1. a single tensor exceeds the maximum, and cannot fit any other way
+            // 2. we are running out of chunks
+            // backends will either manage to allocate the larger size, or report an error.
+            alloc->free_blocks[i].size = MAX(size, alloc->max_chunk_size);
+            if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
+                alloc->free_blocks[i].size = SIZE_MAX/2;
+            }
+            alloc->free_blocks_begin[alloc->n_chunks + 1] = i + 1;
+            alloc->n_chunks++;
+            best_fit_block = i;
+        } else {
+            // since the last chunk always has virtually endless memory, this should never happen
+            GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+                __func__, size, max_avail);
+            GGML_ABORT("graph allocation: failed to reserve memory");
         }
-        best_fit_block = alloc->n_free_blocks - 1;
     }
 
     struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -200,15 +248,7 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
     block->size -= size;
     if (block->size == 0) {
         // remove block if empty
-        alloc->n_free_blocks--;
-        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
-            alloc->free_blocks[j] = alloc->free_blocks[j+1];
-        }
-        // if there are no remaining blocks all memory in current chunk was used up -> start the next one
-        if (alloc->n_free_blocks == 0) {
-            alloc->n_free_blocks = 1;
-            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[0], 0);
-        }
+        ggml_dyn_tallocr_remove_block(alloc, best_fit_block);
     }
 
     AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);
@@ -255,31 +295,27 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
 static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);
 
-    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, addr.chunk, addr.offset, size, alloc->n_free_blocks);
+    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
+        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->free_blocks_begin[alloc->n_chunks]);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, addr, tensor);
 #endif
 
+    struct free_block_range blocks = ggml_dyn_tallocr_free_block_range(alloc, addr.chunk);
+
     // see if we can merge with an existing block
-    for (int i = 0; i < alloc->n_free_blocks; i++) {
+    for (int i = blocks.begin; i < blocks.end; i++) {
         struct free_block * block = &alloc->free_blocks[i];
-        // can only merge with blocks within the same chunk
-        if (addr.chunk != block->addr.chunk) {
-            continue;
-        }
         // check if ptr is at the end of the block
         if (block->addr.offset + block->size == addr.offset) {
             block->size += size;
-            // check if we can merge with the next block (within the same chunk)
-            if (i < alloc->n_free_blocks - 1) {
+            // check if we can merge with the next block
+            if (i < blocks.end - 1) {
                 struct free_block * next = &alloc->free_blocks[i+1];
-                if (block->addr.offset + block->size == next->addr.offset && block->addr.chunk == next->addr.chunk) {
+                if (block->addr.offset + block->size == next->addr.offset) {
                     block->size += next->size;
-                    alloc->n_free_blocks--;
-                    for (int j = i+1; j < alloc->n_free_blocks; j++) {
-                        alloc->free_blocks[j] = alloc->free_blocks[j+1];
-                    }
+                    ggml_dyn_tallocr_remove_block(alloc, i+1);
                 }
             }
             return;
@@ -288,50 +324,46 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct
         if (addr.offset + size == block->addr.offset) {
             block->addr.offset = addr.offset;
             block->size += size;
-            // check if we can merge with the previous block (within the same chunk)
-            if (i > 0) {
+            // check if we can merge with the previous block
+            if (i > blocks.begin) {
                 struct free_block * prev = &alloc->free_blocks[i-1];
-                if (prev->addr.offset + prev->size == block->addr.offset && prev->addr.chunk == block->addr.chunk) {
+                if (prev->addr.offset + prev->size == block->addr.offset) {
                     prev->size += block->size;
-                    alloc->n_free_blocks--;
-                    for (int j = i; j < alloc->n_free_blocks; j++) {
-                        alloc->free_blocks[j] = alloc->free_blocks[j+1];
-                    }
+                    ggml_dyn_tallocr_remove_block(alloc, i);
                 }
             }
             return;
        }
    }
    // otherwise, add a new block
-    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+    int n_free_blocks = alloc->free_blocks_begin[alloc->n_chunks];
+    GGML_ASSERT(n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
-    int insert_pos = 0;
-    while (insert_pos < alloc->n_free_blocks && ggml_buffer_address_less(alloc->free_blocks[insert_pos].addr, addr)) {
+    int insert_pos = blocks.begin;
+    while (insert_pos < blocks.end && alloc->free_blocks[insert_pos].addr.offset < addr.offset) {
        insert_pos++;
    }
    // shift all blocks from insert_pos onward to make room for the new block
-    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
+    for (int i = n_free_blocks; i > insert_pos; i--) {
        alloc->free_blocks[i] = alloc->free_blocks[i-1];
    }
    // insert the new block
    alloc->free_blocks[insert_pos].addr = addr;
    alloc->free_blocks[insert_pos].size = size;
-    alloc->n_free_blocks++;
+    for (int c = addr.chunk + 1; c < alloc->n_chunks + 1; c++) {
+        alloc->free_blocks_begin[c]++;
+    }
 
    GGML_UNUSED(tensor);
 }
 
 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
-    alloc->n_free_blocks = 1;
-    alloc->n_chunks = 1;
-    alloc->free_blocks[0].addr.chunk = 0;
-    alloc->free_blocks[0].addr.offset = 0;
-    alloc->free_blocks[0].size = alloc->max_chunk_size;
-    memset(alloc->max_size, 0, sizeof(alloc->max_size));
-
-    if (alloc->free_blocks[0].size == SIZE_MAX) {
-        alloc->free_blocks[0].size = SIZE_MAX/2; // avoid overflows
+    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) {
+        alloc->free_blocks_begin[i] = 0;
+        alloc->max_size[i] = 0;
    }
+    alloc->free_blocks_begin[GGML_VBUFFER_MAX_CHUNKS] = 0;
+    alloc->n_chunks = 0;
 
 #ifdef GGML_ALLOCATOR_DEBUG
    for (int i = 0; i < 1024; i++) {
@@ -344,12 +376,12 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t m
     struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
 
     *alloc = (struct ggml_dyn_tallocr) {
-        /*.alignment      = */ alignment,
-        /*.n_free_blocks  = */ 0,
-        /*.n_chunks       = */ 0,
-        /*.free_blocks    = */ {{{0}, 0}},
-        /*.max_size       = */ {0},
-        /*.max_chunk_size = */ max_buffer_size,
+        /*.alignment         = */ alignment,
+        /*.n_chunks          = */ 0,
+        /*.free_blocks_begin = */ {0},
+        /*.free_blocks       = */ {{{0}, 0}},
+        /*.max_size          = */ {0},
+        /*.max_chunk_size    = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ {{0}},
 #endif
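
A toy model of the search order the hunks above implement, with made-up block sizes (this is not the real allocator and omits chunk creation): pass 1 best-fits among the freed blocks of every chunk, i.e. all but the last block of each chunk's sub-range; only if none fits does pass 2 fall back to a chunk's last block, which stands in for the chunk's still-unallocated remainder. Names like find_block and the two-chunk layout below are illustrative only.

// Toy model of "exhaust freed blocks of all chunks before growing a chunk" (illustrative only).
#include <stdio.h>
#include <stdint.h>

#define N_CHUNKS 2

// free block sizes per chunk; the last entry of each chunk plays the role of the unclaimed remainder
static size_t blocks[N_CHUNKS][3] = {
    {8, 4, 16},   // chunk 0: freed blocks of 8 and 4 bytes, 16 bytes never allocated
    {12, 0, 0},   // chunk 1: only its unclaimed remainder of 12 bytes
};
static int n_blocks[N_CHUNKS] = {3, 1};

static int find_block(size_t size, int * out_chunk) {
    int best_c = -1, best_i = -1;
    size_t best = SIZE_MAX;
    // pass 1: best fit among the freed blocks of all chunks (everything but the last block)
    for (int c = 0; c < N_CHUNKS; c++) {
        for (int i = 0; i < n_blocks[c] - 1; i++) {
            if (blocks[c][i] >= size && blocks[c][i] <= best) {
                best = blocks[c][i]; best_c = c; best_i = i;
            }
        }
    }
    // pass 2: first chunk whose remainder (last block) is large enough
    if (best_i == -1) {
        for (int c = 0; c < N_CHUNKS && best_i == -1; c++) {
            int last = n_blocks[c] - 1;
            if (blocks[c][last] >= size) { best_c = c; best_i = last; }
        }
    }
    *out_chunk = best_c;
    return best_i;
}

int main(void) {
    int chunk, idx;
    idx = find_block(4, &chunk);   // fits the freed 4-byte block of chunk 0
    printf("4 bytes  -> chunk %d, block %d\n", chunk, idx);
    idx = find_block(10, &chunk);  // no freed block is large enough -> chunk 0's 16-byte remainder
    printf("10 bytes -> chunk %d, block %d\n", chunk, idx);
    return 0;
}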

tests/test-alloc.cpp

Lines changed: 53 additions & 4 deletions
@@ -16,6 +16,7 @@ uint8_t * const alloc_base = (uint8_t *) 16;
 
 struct dummy_backend_context {
     size_t max_buffer_size = 64;
+    size_t alignment = 8;
 
     ggml_backend_buffer_i buffer_interface;
     std::vector<ggml_backend_buffer_t> buffers;
@@ -42,8 +43,9 @@ static ggml_backend_buffer_t dummy_backend_buffer_type_alloc_buffer(ggml_backend
     return buffer;
 }
 
-static size_t dummy_backend_buffer_type_get_alignment(ggml_backend_buffer_type_t) {
-    return 8;
+static size_t dummy_backend_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
+    return ctx->alignment;
 }
 
 static size_t dummy_backend_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
@@ -88,9 +90,10 @@ struct dummy_backend {
     ggml_backend_buffer_type buffer_type;
 };
 
-static dummy_backend dummy_backend_init(size_t max_buffer_size) {
+static dummy_backend dummy_backend_init(size_t max_buffer_size, size_t alignment = 8) {
     dummy_backend b{};
     b.context = std::make_unique<dummy_backend_context>();
+    b.context->alignment = alignment;
     b.context->max_buffer_size = max_buffer_size;
 
     b.context->buffer_interface.free_buffer = dummy_backend_buffer_free_buffer;
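
A brief usage note on the new parameter (a sketch mirroring the call in test_prefer_already_allocated_memory further down): a test can now request a non-default alignment from the dummy backend, while omitting the argument keeps the previous default of 8.

// dummy backend whose buffer type reports 4-byte alignment instead of the default 8
dummy_backend backend = dummy_backend_init(/*max_buffer_size=*/32, /*alignment=*/4);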
@@ -121,7 +124,7 @@ struct test_context_with_graph {
 
 static test_context_with_graph make_context() {
     ggml_init_params params{};
-    params.mem_size = 32 * ggml_tensor_overhead() + ggml_graph_overhead();
+    params.mem_size = 48 * ggml_tensor_overhead() + ggml_graph_overhead();
     params.no_alloc = true;
 
     ggml_context * ctx = ggml_init(params);
@@ -319,6 +322,32 @@ static void test_tensor_larger_than_max_size() {
     GGML_ASSERT(backend.context->allocated_total() == 24);
 }
 
+// This test assumes a max of 16 buffer chunks, and tries to allocate tensors that would
+// require more. Expectation is that the last buffer should grow to fit everything,
+// leaving it to the backend to error out if it can't allocate that much.
+static void test_not_enough_chunks() {
+    const int max_chunks = 16;
+    const int max_size = 8;
+
+    dummy_backend backend = dummy_backend_init(max_size);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[max_chunks + 1];
+    for (int i = 0; i < max_chunks + 1; ++i) {
+        x[i] = make_input_with_size(ctx, max_size);
+    }
+    ggml_tensor * acc = x[0];
+    for (int i = 0; i < max_chunks; ++i) {
+        acc = ggml_add(ctx, acc, x[i + 1]);
+    }
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, acc, &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    GGML_ASSERT(backend.context->allocated_total() > max_chunks * max_size);
+}
+
 // Fill up leftover unallocated space of a chunk after allocating a large tensor that
 // requires a new chunk.
 static void test_fill_leftover_space() {
@@ -405,6 +434,24 @@ static void test_merge_free_block(size_t max_buffer_size) {
     GGML_ASSERT(backend.context->allocated_total() <= 32 + 32 + 24);
 }
 
+// Check that previously allocated but freed memory is preferred over allocating
+// additional memory, even if the remaining space in a chunk would match tensor size better
+static void test_prefer_already_allocated_memory() {
+    dummy_backend backend = dummy_backend_init(32, /*align*/ 4);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[3];
+    x[0] = make_input_with_size(ctx, 24); // [24b][8b unused]
+    x[1] = ggml_mean(ctx, x[0]);          // [24b free][4b][4b unused]
+    x[2] = ggml_mean(ctx, x[1]);          // should be allocated in the 24b block
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    GGML_ASSERT(backend.context->allocated_total() <= 28);
+}
+
 // test for allocating on multiple devices with some tensors in the graph
 // allocated externally (not by gallocr).
 static void test_multiple_buffer_types() {
@@ -512,11 +559,13 @@ int main() {
     run("test_max_size_too_many_tensors", test_max_size_too_many_tensors);
     run("test_max_size_tensor_too_large", test_max_size_tensor_too_large);
     run("test_tensor_larger_than_max_size", test_tensor_larger_than_max_size);
+    run("test_not_enough_chunks", test_not_enough_chunks);
     run("test_fill_leftover_space", test_fill_leftover_space);
     run("test_view_inplace", test_view_inplace);
     run("test_reuse_and_free", test_reuse_and_free);
     run("test_merge_free_block(32)", []() { test_merge_free_block(32); });
     run("test_merge_free_block(SIZE_MAX)", []() { test_merge_free_block(SIZE_MAX); });
+    run("test_prefer_already_allocated_memory", test_prefer_already_allocated_memory);
     run("test_multiple_buffer_types", test_multiple_buffer_types);
     run("test_buffer_size_zero", test_buffer_size_zero);
     return 0;
