metal : remove mem pool implementation

ggerganov · ggerganov · commit 594834661cae · 2025-09-13T17:26:37.000+03:00
ggml-ci
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
@@ -622,171 +622,9 @@ @interface ggml_metal_heap_ptr : NSObject
 @implementation ggml_metal_heap_ptr
 @end
 
-//
-// ggml_metal_mem_pool [TAG_MEM_POOL_REMOVE]
-//
-
-struct ggml_metal_mem_pool {
-    id<MTLDevice> device;
-
-    int n_heaps; // total number of heaps ever created (including those that were removed)
-
-    NSMutableArray * heaps;
-    NSMutableArray * heaps_to_remove;
-};
-
-static struct ggml_metal_mem_pool * ggml_metal_mem_pool_init(void) {
-    struct ggml_metal_mem_pool * mem_pool = calloc(1, sizeof(struct ggml_metal_mem_pool));
-
-    mem_pool->n_heaps = 0;
-
-    mem_pool->heaps           = [[NSMutableArray alloc] init];
-    mem_pool->heaps_to_remove = [[NSMutableArray alloc] init];
-
-    return mem_pool;
-}
-
-static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) {
-    GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu (total = %d)\n", __func__, [mem_pool->heaps count], mem_pool->n_heaps);
-
-    size_t size_all = 0;
-    size_t size_cur = 0;
-
-    for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
-        GGML_LOG_DEBUG("%s:   heap: %p\n",                __func__, (void *) ptr.data);
-        GGML_LOG_DEBUG("%s:     n_alloc:  %" PRId64 "\n", __func__, ptr.data->n_alloc);
-        GGML_LOG_DEBUG("%s:     n_unused: %d\n",          __func__, ptr.data->n_unused);
-        GGML_LOG_DEBUG("%s:     size:     %.2f MiB\n",    __func__, [ptr.data->obj size] / 1024.0 / 1024.0);
-        GGML_LOG_DEBUG("%s:     bufs:     %zu\n",         __func__, [ptr.data->bufs count]);
-
-        if ([ptr.data->bufs count] > 0) {
-            size_cur += [ptr.data->obj size];
-        }
-        size_all += [ptr.data->obj size];
-
-        ggml_metal_heap_free(ptr.data);
-        [ptr release];
-    }
-    [mem_pool->heaps           release];
-    [mem_pool->heaps_to_remove release];
-
-    if (size_all > 0) {
-        GGML_LOG_DEBUG("%s:   size_all: %.2f MiB\n", __func__, size_all / 1024.0 / 1024.0);
-        GGML_LOG_DEBUG("%s:   size_cur: %.2f MiB\n", __func__, size_cur / 1024.0 / 1024.0);
-    }
-
-    free(mem_pool);
-}
-
-static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) {
-    for (NSUInteger i = 0; i < [mem_pool->heaps count]; i++) {
-        ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:i];
-
-        struct ggml_metal_heap * heap = ptr.data;
-        ggml_metal_heap_reset(heap);
-
-        // if the heap hasn't been used for a while, remove it
-        if (heap->n_unused >= 128) {
-            [mem_pool->heaps_to_remove addObject:@(i)];
-        }
-    }
-
-    if (mem_pool->heaps_to_remove.count > 0) {
-        // remove in reverse order
-        for (NSUInteger i = [mem_pool->heaps_to_remove count] - 1; ; --i) {
-            NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue];
-            ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index];
-
-            struct ggml_metal_heap * heap = ptr.data;
-            ggml_metal_heap_free(heap);
-
-            [mem_pool->heaps removeObjectAtIndex:index];
-            [ptr release];
-
-            if (i == 0) {
-                break;
-            }
-        }
-
-        [mem_pool->heaps_to_remove removeAllObjects];
-    }
-}
-
-static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) {
-    for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
-        ptr.data->offs = 0;
-    }
-}
-
-static id<MTLBuffer> ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) {
-    const size_t alignment = 256;
-
-    const size_t size_aligned = GGML_PAD(size, alignment);
-
-    // try one of the existing heaps
-    for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) {
-        struct ggml_metal_heap * heap = ptr.data;
-        if (heap->offs + size_aligned <= [heap->obj size]) {
-            // if this is the first buffer in the heap for the current command buffer, tell the OS that
-            //   it cannot free the memory used by the heap
-            // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc
-            if ([heap->bufs count] == 0) {
-                [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile];
-            }
-
-            id<MTLBuffer> buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs];
-            if (buf == nil) {
-                GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned);
-                return nil;
-            }
-
-            heap->n_alloc++;
-            heap->offs += size_aligned;
-
-            [heap->bufs addObject:buf];
-
-            return buf;
-        }
-    }
-
-    // create a new heap that can fit this buffer
-    ggml_metal_heap_ptr * heap_ptr = [ggml_metal_heap_ptr new];
-
-    struct ggml_metal_heap * heap = ggml_metal_heap_init(mem_pool->device, size_aligned);
-    if (heap == NULL) {
-        GGML_LOG_ERROR("%s: error: failed to create heap of size %zu\n", __func__, size_aligned);
-        return NULL;
-    }
-
-    //GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]);
-
-    heap_ptr.data = heap;
-    ggml_metal_heap_reset(heap);
-
-    [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile];
-    id<MTLBuffer> buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs];
-    if (buf == nil) {
-        GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned);
-        return NULL;
-    }
-
-    heap->n_alloc++;
-    heap->offs += size_aligned;
-
-    [heap->bufs addObject:buf];
-
-    [mem_pool->heaps addObject:heap_ptr];
-    mem_pool->n_heaps++;
-
-    return buf;
-}
-
 struct ggml_metal_command_buffer {
     id<MTLCommandBuffer> obj;
 
-    // each command buffer has a memory pool from which it can allocate temporary buffers during the compute
-    struct ggml_metal_mem_pool * mem_pool;
-
     // used to enable concurrent execution of ops in the command buffers
     struct ggml_mem_ranges * mem_ranges;
 };
@@ -1103,9 +941,6 @@ @implementation GGMLMetalClass
     for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
         ctx->cmd_bufs[i].obj = nil;
 
-        ctx->cmd_bufs[i].mem_pool = ggml_metal_mem_pool_init();
-        ctx->cmd_bufs[i].mem_pool->device = device;
-
         if (ctx_dev->use_concurrency) {
             ctx->cmd_bufs[i].mem_ranges = ggml_mem_ranges_init(ctx_dev->debug_graph);
         }
@@ -1760,8 +1595,6 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
             [ctx->cmd_bufs[i].obj release];
         }
 
-        ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool);
-
         if (ctx->cmd_bufs[i].mem_ranges) {
             ggml_mem_ranges_free(ctx->cmd_bufs[i].mem_ranges);
         }
@@ -2126,8 +1959,6 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
 
     id<MTLComputeCommandEncoder> encoder;
 
-    struct ggml_metal_mem_pool * mem_pool;
-
     struct ggml_mem_ranges * mem_ranges;
 };
 
@@ -2164,8 +1995,6 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in
 
     id<MTLComputeCommandEncoder> encoder = ctx_enc->encoder;
 
-    struct ggml_metal_mem_pool * mem_pool = ctx_enc->mem_pool;
-
     struct ggml_backend_metal_context        * ctx     = backend->context;
     struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
 
@@ -2206,8 +2035,6 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in
         GGML_ABORT("unsupported op");
     }
 
-    ggml_metal_mem_pool_clear(mem_pool);
-
     const int64_t  ne00 = src0 ? src0->ne[0] : 0;
     const int64_t  ne01 = src0 ? src0->ne[1] : 0;
     const int64_t  ne02 = src0 ? src0->ne[2] : 0;
@@ -5819,12 +5646,7 @@ static enum ggml_status ggml_metal_graph_compute(
         // the main thread commits the first few commands immediately
         // cmd_buf[n_cb]
         {
-            // cannot use commandBufferWithUnretainedReferences because the buffers from the memory pool can get destroyed
-            // TODO: when the memory pools are removed, we can again use commandBufferWithUnretainedReferences
-            //       https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2334215009
-            // [TAG_MEM_POOL_REMOVE]
-            //id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
-            id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+            id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
             [cmd_buf retain];
 
             if (ctx->cmd_bufs[n_cb].obj) {
@@ -5843,8 +5665,7 @@ static enum ggml_status ggml_metal_graph_compute(
         // prepare the rest of the command buffers asynchronously (optional)
         // cmd_buf[0.. n_cb)
         for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-            //id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
-            id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+            id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
             [cmd_buf retain];
 
             if (ctx->cmd_bufs[cb_idx].obj) {
@@ -6674,11 +6495,8 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
         const int n_nodes_per_cb = ctx->n_nodes_per_cb;
 
         id<MTLCommandBuffer>         cmd_buf    = ctx->cmd_bufs[cb_idx].obj;
-        struct ggml_metal_mem_pool * mem_pool   = ctx->cmd_bufs[cb_idx].mem_pool;
         struct ggml_mem_ranges     * mem_ranges = ctx->cmd_bufs[cb_idx].mem_ranges;
 
-        ggml_metal_mem_pool_reset(mem_pool);
-
         if (mem_ranges) {
             ggml_mem_ranges_reset(mem_ranges);
         }
@@ -6706,7 +6524,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
         struct ggml_metal_encode_context ctx_enc = {
             /*.backend    =*/ backend,
             /*.encoder    =*/ encoder,
-            /*.mem_pool   =*/ mem_pool,
             /*.mem_ranges =*/ mem_ranges,
         };