Commit 29087f0

treat free blocks of each chunk as separate list
* they're still allocated together, but start/end of each chunk is tracked, and allocate/free iterate over sub-ranges
* exhaust freed blocks of all chunks before considering their last blocks with unallocated space
* start with 0 chunks/blocks and create chunks as needed
* allow the last chunk to grow beyond max size
1 parent 57381c5 commit 29087f0
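
For orientation before the diff: below is a minimal, standalone sketch (not the real allocator) of the indexing scheme this commit introduces. All free blocks of all chunks stay in a single array, sorted by chunk and offset; free_blocks_begin[c] is the index of chunk c's first free block and free_blocks_begin[c + 1] is one past its last, so free_blocks_begin[n_chunks] is the total number of free blocks. The struct, the names (tallocr_sketch, chunk_range) and the demo values are illustrative only.

// Sketch of per-chunk free-block sub-ranges stored in one shared array (illustrative, not ggml code).
#include <stdio.h>

#define MAX_CHUNKS      16
#define MAX_FREE_BLOCKS 256

struct block { int chunk; size_t offset; size_t size; };

struct tallocr_sketch {
    int n_chunks;
    int free_blocks_begin[MAX_CHUNKS + 1];     // end of chunk c == begin of chunk c+1
    struct block free_blocks[MAX_FREE_BLOCKS]; // all chunks' free blocks, sorted by (chunk, offset)
};

// return the [begin, end) indices of the free blocks belonging to one chunk
static void chunk_range(const struct tallocr_sketch * a, int chunk, int * begin, int * end) {
    *begin = a->free_blocks_begin[chunk];
    *end   = a->free_blocks_begin[chunk + 1];
}

int main(void) {
    struct tallocr_sketch a = {
        2,                                     // n_chunks
        {0, 2, 3},                             // chunk 0 owns blocks [0,2), chunk 1 owns [2,3)
        {{0, 0, 8}, {0, 24, 8}, {1, 16, 16}},  // {chunk, offset, size}
    };
    for (int c = 0; c < a.n_chunks; c++) {
        int begin, end;
        chunk_range(&a, c, &begin, &end);
        printf("chunk %d has %d free block(s)\n", c, end - begin);
    }
    printf("total free blocks: %d\n", a.free_blocks_begin[a.n_chunks]);
    return 0;
}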

File tree

2 files changed: +160 -79 lines changed

ggml/src/ggml-alloc.c

Lines changed: 107 additions & 75 deletions
@@ -116,8 +116,8 @@ struct free_block {
 
 struct ggml_dyn_tallocr {
     size_t alignment;
-    int n_free_blocks;
     int n_chunks;
+    int free_blocks_begin[GGML_VBUFFER_MAX_CHUNKS + 1]; // end[chunk] == begin[chunk+1]
     struct free_block free_blocks[MAX_FREE_BLOCKS];
     size_t max_size[GGML_VBUFFER_MAX_CHUNKS];
     size_t max_chunk_size;
@@ -130,14 +130,31 @@ struct ggml_dyn_tallocr {
 #endif
 };
 
-// allocations are split into n chunks of size max_size[i]. tensor allocations may not cross chunk boundaries.
-static void ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, struct free_block * block, size_t min_size) {
-    GGML_ASSERT(alloc->n_chunks >= 1);
-    block->addr.chunk = alloc->n_chunks;
-    block->addr.offset = 0;
-    block->size = MAX(min_size, alloc->max_chunk_size);
-    alloc->n_chunks++;
-    GGML_ASSERT(alloc->n_chunks <= GGML_VBUFFER_MAX_CHUNKS);
+struct free_block_range {
+    int begin;
+    int end;
+    int size;
+};
+
+static struct free_block_range ggml_dyn_tallocr_free_block_range(const struct ggml_dyn_tallocr * alloc, int chunk) {
+    struct free_block_range range;
+    range.begin = alloc->free_blocks_begin[chunk];
+    range.end = alloc->free_blocks_begin[chunk + 1];
+    range.size = range.end - range.begin;
+    return range;
+}
+
+void ggml_dyn_tallocr_remove_block(struct ggml_dyn_tallocr * alloc, int idx) {
+    int chunk = alloc->free_blocks[idx].addr.chunk;
+    // shift all elements after idx by 1 to the left, overwriting the element at idx
+    int n_free_blocks = alloc->free_blocks_begin[alloc->n_chunks];
+    for (int i = idx; i < n_free_blocks; i++) {
+        alloc->free_blocks[i] = alloc->free_blocks[i + 1];
+    }
+    // adjust first element index of all chunks after the current one
+    for (int c = chunk + 1; c < alloc->n_chunks + 1; c++) {
+        alloc->free_blocks_begin[c]--;
+    }
 }
 
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -167,31 +184,62 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
 
+    int best_fit_block = -1;
     size_t max_avail = 0;
 
     // find the best fitting free block besides the last block
-    int best_fit_block = -1;
-    size_t best_fit_size = SIZE_MAX;
-    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
-        struct free_block * block = &alloc->free_blocks[i];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size >= size && block->size <= best_fit_size) {
-            best_fit_block = i;
-            best_fit_size = block->size;
+    for (int c = 0; c < alloc->n_chunks; ++c) {
+        struct free_block_range blocks = ggml_dyn_tallocr_free_block_range(alloc, c);
+        size_t best_fit_size = SIZE_MAX;
+        for (int i = blocks.begin; i < blocks.end - 1; i++) {
+            struct free_block * block = &alloc->free_blocks[i];
+            max_avail = MAX(max_avail, block->size);
+            if (block->size >= size && block->size <= best_fit_size) {
+                best_fit_block = i;
+                best_fit_size = block->size;
+            }
+        }
+    }
+
+    if (best_fit_block == -1) {
+        // no suitable block found, try the last block (ie. growing a chunks size)
+        for (int c = 0; c < alloc->n_chunks; ++c) {
+            struct free_block_range blocks = ggml_dyn_tallocr_free_block_range(alloc, c);
+            if (blocks.size > 0) {
+                struct free_block * block = &alloc->free_blocks[blocks.end - 1];
+                max_avail = MAX(max_avail, block->size);
+                if (block->size >= size) {
+                    best_fit_block = blocks.end - 1;
+                    break;
+                }
+            }
         }
     }
 
     if (best_fit_block == -1) {
-        // the last block represents memory still available in an existing chunk
-        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size < size) {
-            // not enough space in existing chunk, start the next one
-            GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
-            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[alloc->n_free_blocks], size);
-            alloc->n_free_blocks++;
+        // none of the existing chunks have enough space left
+        if (alloc->n_chunks < GGML_VBUFFER_MAX_CHUNKS) {
+            // add a new chunk by creating a block of unclaimed space after the last chunk
+            int i = alloc->free_blocks_begin[alloc->n_chunks];
+            alloc->free_blocks[i].addr.chunk = alloc->n_chunks;
+            alloc->free_blocks[i].addr.offset = 0;
+            // available space in a chunk is limited to max_chunk_size, but can be higher if:
+            // 1. a single tensor exceeds the maximum, and cannot fit any other way
+            // 2. we are running out of chunks
+            // backends will either manage to allocate the larger size, or report an error.
+            alloc->free_blocks[i].size = MAX(size, alloc->max_chunk_size);
+            if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
+                alloc->free_blocks[i].size = SIZE_MAX/2;
+            }
+            alloc->free_blocks_begin[alloc->n_chunks + 1] = i + 1;
+            alloc->n_chunks++;
+            best_fit_block = i;
+        } else {
+            // since the last chunk always has virtually endless memory, this should never happen
+            GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+                __func__, size, max_avail);
+            GGML_ABORT("graph allocation: failed to reserve memory");
         }
-        best_fit_block = alloc->n_free_blocks - 1;
     }
 
     struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -200,15 +248,7 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
     block->size -= size;
     if (block->size == 0) {
         // remove block if empty
-        alloc->n_free_blocks--;
-        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
-            alloc->free_blocks[j] = alloc->free_blocks[j+1];
-        }
-        // if there are no remaining blocks all memory in current chunk was used up -> start the next one
-        if (alloc->n_free_blocks == 0) {
-            alloc->n_free_blocks = 1;
-            ggml_dyn_tallocr_new_chunk(alloc, &alloc->free_blocks[0], 0);
-        }
+        ggml_dyn_tallocr_remove_block(alloc, best_fit_block);
     }
 
     AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);
@@ -255,31 +295,27 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
 static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);
 
-    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, addr.chunk, addr.offset, size, alloc->n_free_blocks);
+    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
+        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->free_blocks_begin[alloc->n_chunks]);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, addr, tensor);
 #endif
 
+    struct free_block_range blocks = ggml_dyn_tallocr_free_block_range(alloc, addr.chunk);
+
     // see if we can merge with an existing block
-    for (int i = 0; i < alloc->n_free_blocks; i++) {
+    for (int i = blocks.begin; i < blocks.end; i++) {
         struct free_block * block = &alloc->free_blocks[i];
-        // can only merge with blocks within the same chunk
-        if (addr.chunk != block->addr.chunk) {
-            continue;
-        }
         // check if ptr is at the end of the block
         if (block->addr.offset + block->size == addr.offset) {
             block->size += size;
-            // check if we can merge with the next block (within the same chunk)
-            if (i < alloc->n_free_blocks - 1) {
+            // check if we can merge with the next block
+            if (i < blocks.end - 1) {
                 struct free_block * next = &alloc->free_blocks[i+1];
-                if (block->addr.offset + block->size == next->addr.offset && block->addr.chunk == next->addr.chunk) {
+                if (block->addr.offset + block->size == next->addr.offset) {
                     block->size += next->size;
-                    alloc->n_free_blocks--;
-                    for (int j = i+1; j < alloc->n_free_blocks; j++) {
-                        alloc->free_blocks[j] = alloc->free_blocks[j+1];
-                    }
+                    ggml_dyn_tallocr_remove_block(alloc, i+1);
                 }
             }
             return;
@@ -288,50 +324,46 @@ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct
         if (addr.offset + size == block->addr.offset) {
             block->addr.offset = addr.offset;
             block->size += size;
-            // check if we can merge with the previous block (within the same chunk)
-            if (i > 0) {
+            // check if we can merge with the previous block
+            if (i > blocks.begin) {
                 struct free_block * prev = &alloc->free_blocks[i-1];
-                if (prev->addr.offset + prev->size == block->addr.offset && prev->addr.chunk == block->addr.chunk) {
+                if (prev->addr.offset + prev->size == block->addr.offset) {
                     prev->size += block->size;
-                    alloc->n_free_blocks--;
-                    for (int j = i; j < alloc->n_free_blocks; j++) {
-                        alloc->free_blocks[j] = alloc->free_blocks[j+1];
-                    }
+                    ggml_dyn_tallocr_remove_block(alloc, i);
                 }
             }
             return;
        }
    }
    // otherwise, add a new block
-    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+    int n_free_blocks = alloc->free_blocks_begin[alloc->n_chunks];
+    GGML_ASSERT(n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
-    int insert_pos = 0;
-    while (insert_pos < alloc->n_free_blocks && ggml_buffer_address_less(alloc->free_blocks[insert_pos].addr, addr)) {
+    int insert_pos = blocks.begin;
+    while (insert_pos < blocks.end && alloc->free_blocks[insert_pos].addr.offset < addr.offset) {
        insert_pos++;
    }
    // shift all blocks from insert_pos onward to make room for the new block
-    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
+    for (int i = n_free_blocks; i > insert_pos; i--) {
        alloc->free_blocks[i] = alloc->free_blocks[i-1];
    }
    // insert the new block
    alloc->free_blocks[insert_pos].addr = addr;
    alloc->free_blocks[insert_pos].size = size;
-    alloc->n_free_blocks++;
+    for (int c = addr.chunk + 1; c < alloc->n_chunks + 1; c++) {
+        alloc->free_blocks_begin[c]++;
+    }
 
    GGML_UNUSED(tensor);
 }
 
 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
-    alloc->n_free_blocks = 1;
-    alloc->n_chunks = 1;
-    alloc->free_blocks[0].addr.chunk = 0;
-    alloc->free_blocks[0].addr.offset = 0;
-    alloc->free_blocks[0].size = alloc->max_chunk_size;
-    memset(alloc->max_size, 0, sizeof(alloc->max_size));
-
-    if (alloc->free_blocks[0].size == SIZE_MAX) {
-        alloc->free_blocks[0].size = SIZE_MAX/2; // avoid overflows
+    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) {
+        alloc->free_blocks_begin[i] = 0;
+        alloc->max_size[i] = 0;
    }
+    alloc->free_blocks_begin[GGML_VBUFFER_MAX_CHUNKS] = 0;
+    alloc->n_chunks = 0;
 
 #ifdef GGML_ALLOCATOR_DEBUG
    for (int i = 0; i < 1024; i++) {
@@ -344,12 +376,12 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t m
     struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
 
     *alloc = (struct ggml_dyn_tallocr) {
-        /*.alignment      = */ alignment,
-        /*.n_free_blocks  = */ 0,
-        /*.n_chunks       = */ 0,
-        /*.free_blocks    = */ {{{0}, 0}},
-        /*.max_size       = */ {0},
-        /*.max_chunk_size = */ max_buffer_size,
+        /*.alignment         = */ alignment,
+        /*.n_chunks          = */ 0,
+        /*.free_blocks_begin = */ {0},
+        /*.free_blocks       = */ {{{0}, 0}},
+        /*.max_size          = */ {0},
+        /*.max_chunk_size    = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ {{0}},
 #endif
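
A toy model of the search order the hunks above implement, with made-up block sizes (this is not the real allocator and omits chunk creation): pass 1 best-fits among the freed blocks of every chunk, i.e. all but the last block of each chunk's sub-range; only if none fits does pass 2 fall back to a chunk's last block, which stands in for the chunk's still-unallocated remainder. Names like find_block and the two-chunk layout below are illustrative only.

// Toy model of "exhaust freed blocks of all chunks before growing a chunk" (illustrative only).
#include <stdio.h>
#include <stdint.h>

#define N_CHUNKS 2

// free block sizes per chunk; the last entry of each chunk plays the role of the unclaimed remainder
static size_t blocks[N_CHUNKS][3] = {
    {8, 4, 16},   // chunk 0: freed blocks of 8 and 4 bytes, 16 bytes never allocated
    {12, 0, 0},   // chunk 1: only its unclaimed remainder of 12 bytes
};
static int n_blocks[N_CHUNKS] = {3, 1};

static int find_block(size_t size, int * out_chunk) {
    int best_c = -1, best_i = -1;
    size_t best = SIZE_MAX;
    // pass 1: best fit among the freed blocks of all chunks (everything but the last block)
    for (int c = 0; c < N_CHUNKS; c++) {
        for (int i = 0; i < n_blocks[c] - 1; i++) {
            if (blocks[c][i] >= size && blocks[c][i] <= best) {
                best = blocks[c][i]; best_c = c; best_i = i;
            }
        }
    }
    // pass 2: first chunk whose remainder (last block) is large enough
    if (best_i == -1) {
        for (int c = 0; c < N_CHUNKS && best_i == -1; c++) {
            int last = n_blocks[c] - 1;
            if (blocks[c][last] >= size) { best_c = c; best_i = last; }
        }
    }
    *out_chunk = best_c;
    return best_i;
}

int main(void) {
    int chunk, idx;
    idx = find_block(4, &chunk);   // fits the freed 4-byte block of chunk 0
    printf("4 bytes  -> chunk %d, block %d\n", chunk, idx);
    idx = find_block(10, &chunk);  // no freed block is large enough -> chunk 0's 16-byte remainder
    printf("10 bytes -> chunk %d, block %d\n", chunk, idx);
    return 0;
}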

tests/test-alloc.cpp

Lines changed: 53 additions & 4 deletions
@@ -16,6 +16,7 @@ uint8_t * const alloc_base = (uint8_t *) 16;
 
 struct dummy_backend_context {
     size_t max_buffer_size = 64;
+    size_t alignment = 8;
 
     ggml_backend_buffer_i buffer_interface;
     std::vector<ggml_backend_buffer_t> buffers;
@@ -42,8 +43,9 @@ static ggml_backend_buffer_t dummy_backend_buffer_type_alloc_buffer(ggml_backend
     return buffer;
 }
 
-static size_t dummy_backend_buffer_type_get_alignment(ggml_backend_buffer_type_t) {
-    return 8;
+static size_t dummy_backend_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
+    return ctx->alignment;
 }
 
 static size_t dummy_backend_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
@@ -88,9 +90,10 @@ struct dummy_backend {
     ggml_backend_buffer_type buffer_type;
 };
 
-static dummy_backend dummy_backend_init(size_t max_buffer_size) {
+static dummy_backend dummy_backend_init(size_t max_buffer_size, size_t alignment = 8) {
     dummy_backend b{};
     b.context = std::make_unique<dummy_backend_context>();
+    b.context->alignment = alignment;
     b.context->max_buffer_size = max_buffer_size;
 
     b.context->buffer_interface.free_buffer = dummy_backend_buffer_free_buffer;
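
A brief usage note on the new parameter (a sketch mirroring the call in test_prefer_already_allocated_memory further down): a test can now request a non-default alignment from the dummy backend, while omitting the argument keeps the previous default of 8.

// dummy backend whose buffer type reports 4-byte alignment instead of the default 8
dummy_backend backend = dummy_backend_init(/*max_buffer_size=*/32, /*alignment=*/4);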
@@ -121,7 +124,7 @@ struct test_context_with_graph {
 
 static test_context_with_graph make_context() {
     ggml_init_params params{};
-    params.mem_size = 32 * ggml_tensor_overhead() + ggml_graph_overhead();
+    params.mem_size = 48 * ggml_tensor_overhead() + ggml_graph_overhead();
     params.no_alloc = true;
 
     ggml_context * ctx = ggml_init(params);
@@ -319,6 +322,32 @@ static void test_tensor_larger_than_max_size() {
     GGML_ASSERT(backend.context->allocated_total() == 24);
 }
 
+// This test assumes a max of 16 buffer chunks, and tries to allocate tensors that would
+// require more. Expectation is that the last buffer should grow to fit everything,
+// leaving it to the backend to error out if it can't allocate that much.
+static void test_not_enough_chunks() {
+    const int max_chunks = 16;
+    const int max_size = 8;
+
+    dummy_backend backend = dummy_backend_init(max_size);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[max_chunks + 1];
+    for (int i = 0; i < max_chunks + 1; ++i) {
+        x[i] = make_input_with_size(ctx, max_size);
+    }
+    ggml_tensor * acc = x[0];
+    for (int i = 0; i < max_chunks; ++i) {
+        acc = ggml_add(ctx, acc, x[i + 1]);
+    }
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, acc, &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    GGML_ASSERT(backend.context->allocated_total() > max_chunks * max_size);
+}
+
 // Fill up leftover unallocated space of a chunk after allocating a large tensor that
 // requires a new chunk.
 static void test_fill_leftover_space() {
@@ -405,6 +434,24 @@ static void test_merge_free_block(size_t max_buffer_size) {
     GGML_ASSERT(backend.context->allocated_total() <= 32 + 32 + 24);
 }
 
+// Check that previously allocated but freed memory is preferred over allocating
+// additional memory, even if the remaining space in a chunk would match tensor size better
+static void test_prefer_already_allocated_memory() {
+    dummy_backend backend = dummy_backend_init(32, /*align*/ 4);
+    auto [ctx, graph, ctx_ptr] = make_context();
+
+    ggml_tensor * x[3];
+    x[0] = make_input_with_size(ctx, 24); // [24b][8b unused]
+    x[1] = ggml_mean(ctx, x[0]);          // [24b free][4b][4b unused]
+    x[2] = ggml_mean(ctx, x[1]);          // should be allocated in the 24b block
+    assign_names(ctx);
+
+    ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type);
+    check_all_allocated(graph);
+    check_no_overlap(graph);
+    GGML_ASSERT(backend.context->allocated_total() <= 28);
+}
+
 // test for allocating on multiple devices with some tensors in the graph
 // allocated externally (not by gallocr).
 static void test_multiple_buffer_types() {
@@ -512,11 +559,13 @@ int main() {
     run("test_max_size_too_many_tensors", test_max_size_too_many_tensors);
     run("test_max_size_tensor_too_large", test_max_size_tensor_too_large);
     run("test_tensor_larger_than_max_size", test_tensor_larger_than_max_size);
+    run("test_not_enough_chunks", test_not_enough_chunks);
     run("test_fill_leftover_space", test_fill_leftover_space);
     run("test_view_inplace", test_view_inplace);
     run("test_reuse_and_free", test_reuse_and_free);
     run("test_merge_free_block(32)", []() { test_merge_free_block(32); });
     run("test_merge_free_block(SIZE_MAX)", []() { test_merge_free_block(SIZE_MAX); });
+    run("test_prefer_already_allocated_memory", test_prefer_already_allocated_memory);
     run("test_multiple_buffer_types", test_multiple_buffer_types);
     run("test_buffer_size_zero", test_buffer_size_zero);
     return 0;
