From 97b96c1ad308eeab4a0bdfd98a952e588799b98b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 6 Sep 2025 11:53:51 +0300 Subject: [PATCH 1/9] metal : make the backend async ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 310 ++++++++++++++++++++++++++----- 1 file changed, 259 insertions(+), 51 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index e76fb712631e1..9626dd3bd5698 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -803,6 +803,12 @@ static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { // n_cb command buffers + 1 used by the main thread struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; + // extra command buffers for things like getting, setting and copying tensors + NSMutableArray * cmd_bufs_ext; + + id cmd_buf_last; + id cmd_buf_ext_last; + // abort ggml_metal_graph_compute if callback returns true ggml_abort_callback abort_callback; void * abort_callback_data; @@ -1073,6 +1079,11 @@ @implementation GGMLMetalClass ctx->cmd_bufs[i].mem_pool->device = device; } + ctx->cmd_bufs_ext = [[NSMutableArray alloc] init]; + + ctx->cmd_buf_last = nil; + ctx->cmd_buf_ext_last = nil; + #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, device.recommendedMaxWorkingSetSize / 1e6); @@ -1666,11 +1677,15 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { [ctx->queue release]; for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { - // ctx->cmd_bufs[i].obj is auto released + if (ctx->cmd_bufs[i].obj) { + [ctx->cmd_bufs[i].obj release]; + } ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); } + [ctx->cmd_bufs_ext release]; + dispatch_release(ctx->d_queue); free(ctx); @@ -5778,81 +5793,110 @@ static enum ggml_status ggml_metal_graph_compute( } } + // wait for any previous processing + if (ctx->cmd_buf_last) { + [ctx->cmd_buf_last waitUntilCompleted]; + ctx->cmd_buf_last = nil; + } + // the main thread commits the first few commands immediately // cmd_buf[n_cb] { - id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf retain]; + + if (ctx->cmd_bufs[n_cb].obj) { + [ctx->cmd_bufs[n_cb].obj release]; + } ctx->cmd_bufs[n_cb].obj = cmd_buf; [cmd_buf enqueue]; + ctx->cmd_buf_last = cmd_buf; + ctx->encode_async(n_cb); } // prepare the rest of the command buffers asynchronously // cmd_buf[0.. 
n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf retain]; + + if (ctx->cmd_bufs[cb_idx].obj) { + [ctx->cmd_bufs[cb_idx].obj release]; + } ctx->cmd_bufs[cb_idx].obj = cmd_buf; // always enqueue the first two command buffers // enqueue all of the command buffers if we don't need to abort if (cb_idx < 2 || ctx->abort_callback == NULL) { [cmd_buf enqueue]; + ctx->cmd_buf_last = cmd_buf; } } dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async); - // wait for completion and check status of each command buffer - // needed to detect if the device ran out-of-memory for example (#1881) - { - id cmd_buf = ctx->cmd_bufs[n_cb].obj; - [cmd_buf waitUntilCompleted]; - - MTLCommandBufferStatus status = [cmd_buf status]; - if (status != MTLCommandBufferStatusCompleted) { - GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); - if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); - } + // for debugging: block until graph is computed + //[ctx->cmd_buf_last waitUntilCompleted]; - return GGML_STATUS_FAILED; + // enter here only when capturing in order to wait for all computation to finish + // otherwise, we leave the graph to compute asynchronously + if (!should_capture && ctx->capture_started) { + // wait for completion and check status of each command buffer + // needed to detect if the device ran out-of-memory for example (#1881) + { + id cmd_buf = ctx->cmd_bufs[n_cb].obj; + [cmd_buf waitUntilCompleted]; + + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + } + + return GGML_STATUS_FAILED; + } } - } - for (int i = 0; i < n_cb; ++i) { - id cmd_buf = ctx->cmd_bufs[i].obj; - [cmd_buf waitUntilCompleted]; + for (int i = 0; i < n_cb; ++i) { + id cmd_buf = ctx->cmd_bufs[i].obj; + [cmd_buf waitUntilCompleted]; + + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + } - MTLCommandBufferStatus status = [cmd_buf status]; - if (status != MTLCommandBufferStatusCompleted) { - GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); - if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + return GGML_STATUS_FAILED; } - return GGML_STATUS_FAILED; - } + id next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil); + if (!next_buffer) { + continue; + } - id next_buffer = (i + 1 < n_cb ? 
ctx->cmd_bufs[i + 1].obj : nil); - if (!next_buffer) { - continue; - } + const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); + if (next_queued) { + continue; + } - const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); - if (next_queued) { - continue; - } + if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { + GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i); + return GGML_STATUS_ABORTED; + } - if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { - GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i); - return GGML_STATUS_ABORTED; + [next_buffer commit]; } - [next_buffer commit]; - } + if (ctx->cmd_buf_last) { + [ctx->cmd_buf_last waitUntilCompleted]; + ctx->cmd_buf_last = nil; + } - if (!should_capture && ctx->capture_started) { [ctx->capture_scope endScope]; [[MTLCaptureManager sharedCaptureManager] stopCapture]; } @@ -6034,7 +6078,7 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty } static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - return true; + return false; GGML_UNUSED(buft); } @@ -6184,6 +6228,154 @@ static void ggml_backend_metal_free(ggml_backend_t backend) { free(backend); } +static void ggml_backend_metal_synchronize(ggml_backend_t backend) { + struct ggml_backend_metal_context * ctx = backend->context; + + if (ctx->cmd_buf_last) { + [ctx->cmd_buf_last waitUntilCompleted]; + ctx->cmd_buf_last = nil; + } + + if (ctx->cmd_buf_ext_last) { + [ctx->cmd_buf_ext_last waitUntilCompleted]; + ctx->cmd_buf_ext_last = nil; + } + + for (size_t i = 0; i < ctx->cmd_bufs_ext.count; ++i) { + id cmd_buf = ctx->cmd_bufs_ext[i]; + + // check status and assert that the command buffer completed successfully + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_ERROR("%s: error: command buffer %d failed with status %d\n", __func__, (int) i, (int) status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + } + GGML_ABORT("fatal error"); + } + + //printf("releasing buffer %d\n", (int) i); + [cmd_buf release]; + } + [ctx->cmd_bufs_ext removeAllObjects]; +} + +static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + struct ggml_backend_metal_context * ctx = backend->context; + struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; + + ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; + + struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *)buf->context; + + @autoreleasepool { + id device = ctx_dev->mtl_device; + + id buf_src = [device newBufferWithBytes:data + length:size + options:MTLResourceStorageModeShared]; + + size_t tensor_offset = (uintptr_t)tensor->data + offset; + + // find which buffer contains this tensor + for (int i = 0; i < buf_ctx->n_buffers; i++) { + if (tensor_offset >= (uintptr_t) buf_ctx->buffers[i].data && + tensor_offset < (uintptr_t) buf_ctx->buffers[i].data + buf_ctx->buffers[i].size) { + + const size_t buf_dst_offset = tensor_offset - (uintptr_t) buf_ctx->buffers[i].data; + + id buf_dst = buf_ctx->buffers[i].metal; + + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf enqueue]; + + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder copyFromBuffer:buf_src + sourceOffset:0 + toBuffer:buf_dst + destinationOffset:buf_dst_offset + size:size]; + + [encoder endEncoding]; + [cmd_buf commit]; + //[cmd_buf waitUntilCompleted]; + + [ctx->cmd_bufs_ext addObject:cmd_buf]; + ctx->cmd_buf_ext_last = cmd_buf; + + [cmd_buf retain]; + return; + } + } + + GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name); + } +} + +static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + struct ggml_backend_metal_context * ctx = backend->context; + struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; + + ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + + struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *)buf->context; + + @autoreleasepool { + id device = ctx_dev->mtl_device; + + id buf_dst = [device newBufferWithBytesNoCopy:data + length:size + options:MTLResourceStorageModeShared + deallocator:nil]; + + const size_t tensor_offset = (uintptr_t)tensor->data + offset; + + // find which buffer contains this tensor data + for (int i = 0; i < buf_ctx->n_buffers; i++) { + if (tensor_offset >= (uintptr_t) buf_ctx->buffers[i].data && + tensor_offset < (uintptr_t) buf_ctx->buffers[i].data + buf_ctx->buffers[i].size) { + + const size_t buf_src_offset = tensor_offset - (uintptr_t) buf_ctx->buffers[i].data; + + id buf_src = buf_ctx->buffers[i].metal; + + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf enqueue]; + + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder copyFromBuffer:buf_src + sourceOffset:buf_src_offset + toBuffer:buf_dst + destinationOffset:0 + size:size]; + + [encoder endEncoding]; + [cmd_buf commit]; + //[cmd_buf waitUntilCompleted]; + + [ctx->cmd_bufs_ext addObject:cmd_buf]; + ctx->cmd_buf_ext_last = cmd_buf; + + [cmd_buf retain]; + return; + } + } + + GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name); + } +} + +static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) { + return false; + + GGML_UNUSED(backend_src); + GGML_UNUSED(backend_dst); + GGML_UNUSED(src); + GGML_UNUSED(dst); +} + static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return ggml_metal_graph_compute(backend, cgraph); } @@ -6214,7 +6406,10 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; - id 
cmd_buf = ctx->cmd_bufs[cb_idx].obj; + id cmd_buf = ctx->cmd_bufs[cb_idx].obj; + struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; + + ggml_metal_mem_pool_reset(mem_pool); id encoder = [cmd_buf computeCommandEncoder]; @@ -6228,9 +6423,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const bool should_capture = ctx->capture_next_compute; - struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; - ggml_metal_mem_pool_reset(mem_pool); - for (int idx = node_start; idx < node_end;) { if (should_capture) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; @@ -6264,15 +6456,19 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { static struct ggml_backend_i ggml_backend_metal_i = { /* .get_name = */ ggml_backend_metal_name, /* .free = */ ggml_backend_metal_free, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, + /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async, + /* .cpy_tensor_async = */ ggml_backend_metal_cpy_tensor_async, // only needed for multi-GPU setups + /* .synchronize = */ ggml_backend_metal_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, + + // the events API is needed only for multi-GPU setups, so likely no need to implement it for Metal + // in any case, these docs seem relevant if we ever decide to implement it: + // https://developer.apple.com/documentation/metal/mtlcommandbuffer#Synchronizing-Passes-with-Events /* .event_record = */ NULL, /* .event_wait = */ NULL, /* .optimize_graph = */ NULL, @@ -6514,8 +6710,20 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml GGML_UNUSED(dev); } +static int64_t get_op_batch_size(const struct ggml_tensor * op) { + switch (op->op) { + case GGML_OP_MUL_MAT_ID: + return op->ne[1]; + default: + return ggml_nrows(op); + } +} + static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - return false; + const int min_batch_size = 32; + + return get_op_batch_size(op) >= min_batch_size; + //return false; GGML_UNUSED(dev); GGML_UNUSED(op); From c5637cf39cdf3be7ce0bfb98d85a011724069079 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 8 Sep 2025 19:26:18 +0300 Subject: [PATCH 2/9] cont : add comments, extend op offload, clean up ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 84 ++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 9626dd3bd5698..958b14b180fa6 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -1684,6 +1684,7 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); } + [ctx->cmd_bufs_ext removeAllObjects]; [ctx->cmd_bufs_ext release]; dispatch_release(ctx->d_queue); @@ -5793,30 +5794,40 @@ static enum ggml_status ggml_metal_graph_compute( } } - // wait for any previous processing - if (ctx->cmd_buf_last) { - [ctx->cmd_buf_last waitUntilCompleted]; - ctx->cmd_buf_last = nil; - } - // the main thread commits the first few commands immediately // 
cmd_buf[n_cb] { - id cmd_buf = [ctx->queue commandBuffer]; - [cmd_buf retain]; - + // first wait for any previous command buffer to be completed + // note: this checks only that the first part of the previous graph has been computed + // the rest of the graph might still be computing, but it is OK to start queuing the beginning of the + // new graph if (ctx->cmd_bufs[n_cb].obj) { + [ctx->cmd_bufs[n_cb].obj waitUntilCompleted]; [ctx->cmd_bufs[n_cb].obj release]; } + + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf retain]; + ctx->cmd_bufs[n_cb].obj = cmd_buf; [cmd_buf enqueue]; - ctx->cmd_buf_last = cmd_buf; ctx->encode_async(n_cb); } - // prepare the rest of the command buffers asynchronously + // here we guarantee the full previous graph has finished computing + // but note that we have already enqueued the first part of the new graph so it can start processing, while + // we continue to encode the rest of the graph + if (ctx->cmd_buf_last) { + [ctx->cmd_buf_last waitUntilCompleted]; + ctx->cmd_buf_last = nil; + } + + // remember the command buffer for the next iteration + ctx->cmd_buf_last = ctx->cmd_bufs[n_cb].obj; + + // prepare the rest of the command buffers asynchronously (optional) // cmd_buf[0.. n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { id cmd_buf = [ctx->queue commandBuffer]; @@ -5831,6 +5842,9 @@ static enum ggml_status ggml_metal_graph_compute( // enqueue all of the command buffers if we don't need to abort if (cb_idx < 2 || ctx->abort_callback == NULL) { [cmd_buf enqueue]; + + // update the pointer to the last queued command buffer + // this is needed to implement synchronize() ctx->cmd_buf_last = cmd_buf; } } @@ -6078,6 +6092,12 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty } static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + // TODO: not sure why, but without setting this to `false`, op offloading does not work correctly + // to reproduce, do the following: + // + // build with: cmake -DGGML_BLAS=OFF -DGGML_METAL=ON + // run: ./bin/llama-cli -m ggml-model-mxfp4.gguf -p "$(printf 'hello %.0s' {1..100})" --n-cpu-moe 10 + // return false; GGML_UNUSED(buft); @@ -6231,33 +6251,37 @@ static void ggml_backend_metal_free(ggml_backend_t backend) { static void ggml_backend_metal_synchronize(ggml_backend_t backend) { struct ggml_backend_metal_context * ctx = backend->context; + // wait for the computation of the graph to finish if (ctx->cmd_buf_last) { [ctx->cmd_buf_last waitUntilCompleted]; ctx->cmd_buf_last = nil; } + // wait for any pending async get/set operations if (ctx->cmd_buf_ext_last) { [ctx->cmd_buf_ext_last waitUntilCompleted]; ctx->cmd_buf_ext_last = nil; } - for (size_t i = 0; i < ctx->cmd_bufs_ext.count; ++i) { - id cmd_buf = ctx->cmd_bufs_ext[i]; + // release any completed command buffers + if (ctx->cmd_bufs_ext.count > 0) { + for (size_t i = 0; i < ctx->cmd_bufs_ext.count; ++i) { + id cmd_buf = ctx->cmd_bufs_ext[i]; - // check status and assert that the command buffer completed successfully - MTLCommandBufferStatus status = [cmd_buf status]; - if (status != MTLCommandBufferStatusCompleted) { - GGML_LOG_ERROR("%s: error: command buffer %d failed with status %d\n", __func__, (int) i, (int) status); - if (status == MTLCommandBufferStatusError) { - GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_ERROR("%s: error: command buffer %d failed
with status %d\n", __func__, (int) i, (int) status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + } + GGML_ABORT("fatal error"); } - GGML_ABORT("fatal error"); + + [cmd_buf release]; } - //printf("releasing buffer %d\n", (int) i); - [cmd_buf release]; + [ctx->cmd_bufs_ext removeAllObjects]; } - [ctx->cmd_bufs_ext removeAllObjects]; } static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -6271,13 +6295,14 @@ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, st @autoreleasepool { id device = ctx_dev->mtl_device; + // wrap the source data into a Metal buffer id buf_src = [device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared]; size_t tensor_offset = (uintptr_t)tensor->data + offset; - // find which buffer contains this tensor + // find which Metal buffer contains this tensor - we will copy into that buffer for (int i = 0; i < buf_ctx->n_buffers; i++) { if (tensor_offset >= (uintptr_t) buf_ctx->buffers[i].data && tensor_offset < (uintptr_t) buf_ctx->buffers[i].data + buf_ctx->buffers[i].size) { @@ -6286,6 +6311,8 @@ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, st id buf_dst = buf_ctx->buffers[i].metal; + // queue the copy operation into the queue of the Metal context + // this will be queued at the end, after any currently ongoing GPU operations id cmd_buf = [ctx->queue commandBuffer]; [cmd_buf enqueue]; @@ -6299,8 +6326,11 @@ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, st [encoder endEncoding]; [cmd_buf commit]; + + // do not wait here for completion //[cmd_buf waitUntilCompleted]; + // instead, remember a reference to the command buffer and wait for it later if needed [ctx->cmd_bufs_ext addObject:cmd_buf]; ctx->cmd_buf_ext_last = cmd_buf; @@ -6712,6 +6742,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml static int64_t get_op_batch_size(const struct ggml_tensor * op) { switch (op->op) { + case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: return op->ne[1]; default: @@ -6722,8 +6753,9 @@ static int64_t get_op_batch_size(const struct ggml_tensor * op) { static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { const int min_batch_size = 32; - return get_op_batch_size(op) >= min_batch_size; - //return false; + return (op->op == GGML_OP_MUL_MAT || + op->op == GGML_OP_MUL_MAT_ID) && + get_op_batch_size(op) >= min_batch_size; GGML_UNUSED(dev); GGML_UNUSED(op); From bdff7729b1716284ea3234a87d52930cf2fdce80 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 8 Sep 2025 21:01:25 +0300 Subject: [PATCH 3/9] metal : fix batch size for MUL_MAT_ID --- ggml/src/ggml-metal/ggml-metal.m | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 958b14b180fa6..e6e4557f2cde4 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -6743,8 +6743,9 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml static int64_t get_op_batch_size(const struct ggml_tensor * op) { switch (op->op) { case GGML_OP_MUL_MAT: - case GGML_OP_MUL_MAT_ID: return op->ne[1]; + case GGML_OP_MUL_MAT_ID: + return op->ne[2]; default: return ggml_nrows(op); } From d91ba85d04928ad13954d7d29b07295f1c65b7d4 Mon Sep 17 00:00:00 2001 
From: Georgi Gerganov Date: Tue, 9 Sep 2025 09:28:41 +0300 Subject: [PATCH 4/9] metal : remove deprecated ggml_backend_metal_buffer_from_ptr --- ggml/include/ggml-metal.h | 4 -- ggml/src/ggml-metal/ggml-metal.m | 89 -------------------------------- 2 files changed, 93 deletions(-) diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h index a610694423483..abeea094d0820 100644 --- a/ggml/include/ggml-metal.h +++ b/ggml/include/ggml-metal.h @@ -43,10 +43,6 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void); GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend); -GGML_DEPRECATED( - GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size), - "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713"); - GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index e6e4557f2cde4..75b26820f7c8c 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -6143,95 +6143,6 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) return &ggml_backend_buffer_from_ptr_type_metal; } -// TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr -ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { - struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context)); - - ctx->all_data = data; - ctx->all_size = size; - ctx->owned = false; - ctx->n_buffers = 0; - - const size_t size_page = sysconf(_SC_PAGESIZE); - - // page-align the data ptr - { - const uintptr_t offs = (uintptr_t) data % size_page; - data = (void *) ((char *) data - offs); - size += offs; - } - - size_t size_aligned = size; - if ((size_aligned % size_page) != 0) { - size_aligned += (size_page - (size_aligned % size_page)); - } - - struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main; - - GGML_ASSERT(ctx_dev->mtl_device != nil); - - id device = ctx_dev->mtl_device; - - // the buffer fits into the max buffer size allowed by the device - if (size_aligned <= device.maxBufferLength) { - ctx->buffers[ctx->n_buffers].data = data; - ctx->buffers[ctx->n_buffers].size = size; - ctx->buffers[ctx->n_buffers].metal = nil; - - if (size_aligned > 0) { - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; - - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); - return false; - } - } - - ggml_backend_metal_log_allocated_size(device, size_aligned); - - ++ctx->n_buffers; - } else { - // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into - // one of the views - const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case - const size_t size_step = device.maxBufferLength - size_ovlp; - const size_t size_view = device.maxBufferLength; - - for (size_t i = 0; i < size; i += size_step) { - const size_t size_step_aligned = (i + size_view <= size) ? 
size_view : (size_aligned - i); - - ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); - ctx->buffers[ctx->n_buffers].size = size_step_aligned; - ctx->buffers[ctx->n_buffers].metal = nil; - - if (size_step_aligned > 0) { - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; - - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); - return false; - } - } - - ggml_backend_metal_log_allocated_size(device, size_step_aligned); - - if (i + size_step < size) { - GGML_LOG_INFO("\n"); - } - - ++ctx->n_buffers; - } - } - - if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) { - GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__); - free(ctx); - return NULL; - } - - return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size); -} - // backend static const char * ggml_backend_metal_name(ggml_backend_t backend) { From 85aaf52b7ebf4e1e4cbe6ba33793a222b89356c7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 Sep 2025 11:45:09 +0300 Subject: [PATCH 5/9] metal : create only metal buffers, no wrapping of host memory ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 286 +++++++++++++++++++++---------- 1 file changed, 194 insertions(+), 92 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 75b26820f7c8c..9dab47a987a81 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -44,9 +44,10 @@ // note: assumes single GPU device - the default one // TODO: support multiple GPU devices static struct ggml_backend_metal_device_context { - id mtl_device; - int mtl_device_ref_count; - id mtl_library; + id mtl_device; + int mtl_device_ref_count; + id mtl_queue; + id mtl_library; NSLock * mtl_lock; @@ -68,6 +69,7 @@ } g_ggml_ctx_dev_main = { /*.mtl_device =*/ nil, /*.mtl_device_ref_count =*/ 0, + /*.mtl_queue =*/ nil, /*.mtl_library =*/ nil, /*.mtl_lock =*/ nil, /*.has_simdgroup_reduction =*/ false, @@ -94,6 +96,9 @@ ctx->mtl_device = MTLCreateSystemDefaultDevice(); if (ctx->mtl_device) { + ctx->mtl_queue = [ctx->mtl_device newCommandQueue]; + [ctx->mtl_queue retain]; + ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; @@ -161,6 +166,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte ctx->mtl_library = nil; } + if (ctx->mtl_queue) { + [ctx->mtl_queue release]; + ctx->mtl_queue = nil; + } + if (ctx->mtl_device) { [ctx->mtl_device release]; ctx->mtl_device = nil; @@ -1005,7 +1015,7 @@ @implementation GGMLMetalClass GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); ctx->device = device; - ctx->queue = [device newCommandQueue]; + ctx->queue = ctx_dev->mtl_queue; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -1704,7 +1714,6 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { struct ggml_backend_metal_buffer_context { void * all_data; size_t all_size; - bool owned; // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap int n_buffers; @@ -1712,6 +1721,9 @@ static void 
ggml_metal_free(struct ggml_backend_metal_context * ctx) { // optional MTLResidencySet id rset; + + id device; + id queue; }; // rset init @@ -1777,7 +1789,7 @@ static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer // -static id ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) { +static id ggml_metal_get_buffer(const struct ggml_tensor * t, size_t * offs) { //GGML_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = ggml_nbytes(t); @@ -5932,14 +5944,6 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) ggml_backend_metal_buffer_rset_free(ctx); - if (ctx->owned) { -#if TARGET_OS_OSX - vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size); -#else - free(ctx->all_data); -#endif - } - free(ctx); } @@ -5950,25 +5954,112 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) } static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { +#if 1 + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + @autoreleasepool { + id cmd_buf = [ctx->queue commandBuffer]; + id encoder = [cmd_buf blitCommandEncoder]; + [cmd_buf enqueue]; + + size_t buf_dst_offset = 0; + id buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset); + + buf_dst_offset += offset; + + [encoder fillBuffer:buf_dst + range:NSMakeRange(buf_dst_offset, buf_dst_offset + size) + value:value]; + + [encoder endEncoding]; + + [cmd_buf commit]; + [cmd_buf waitUntilCompleted]; + } +#else memset((char *)tensor->data + offset, value, size); +#endif GGML_UNUSED(buffer); } static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +#if 1 + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + @autoreleasepool { + id cmd_buf = [ctx->queue commandBuffer]; + id encoder = [cmd_buf blitCommandEncoder]; + [cmd_buf enqueue]; + + // TODO: is this an extra copy? can we avoid it? 
+ id buf_src = [ctx->device newBufferWithBytes:data + length:size + options:MTLResourceStorageModeShared]; + + size_t buf_dst_offset = 0; + id buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset); + + buf_dst_offset += offset; + + [encoder copyFromBuffer:buf_src + sourceOffset:0 + toBuffer:buf_dst + destinationOffset:buf_dst_offset + size:size]; + + [encoder endEncoding]; + + [cmd_buf commit]; + [cmd_buf waitUntilCompleted]; + } +#else memcpy((char *)tensor->data + offset, data, size); +#endif GGML_UNUSED(buffer); } static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +#if 1 + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + @autoreleasepool { + id cmd_buf = [ctx->queue commandBuffer]; + id encoder = [cmd_buf blitCommandEncoder]; + [cmd_buf enqueue]; + + size_t buf_src_offset = 0; + id buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset); + + buf_src_offset += offset; + + id buf_dst = [ctx->device newBufferWithBytesNoCopy:data + length:size + options:MTLResourceStorageModeShared + deallocator:nil]; + + [encoder copyFromBuffer:buf_src + sourceOffset:buf_src_offset + toBuffer:buf_dst + destinationOffset:0 + size:size]; + + [encoder endEncoding]; + + [cmd_buf commit]; + [cmd_buf waitUntilCompleted]; + } +#else memcpy(data, (const char *)tensor->data + offset, size); +#endif GGML_UNUSED(buffer); } static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { if (ggml_backend_buffer_is_host(src->buffer)) { + GGML_ASSERT(false && "TODO"); memcpy(dst->data, src->data, ggml_nbytes(src)); return true; } @@ -5980,7 +6071,22 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; - memset(ctx->all_data, value, ctx->all_size); + @autoreleasepool { + id cmd_buf = [ctx->queue commandBuffer]; + id encoder = [cmd_buf blitCommandEncoder]; + [cmd_buf enqueue]; + + [encoder fillBuffer:ctx->buffers[0].metal + range:NSMakeRange(0, ctx->buffers[0].size) + value:value]; + + [encoder endEncoding]; + + [cmd_buf commit]; + [cmd_buf waitUntilCompleted]; + } + + //memset(ctx->all_data, value, ctx->all_size); } static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { @@ -6044,9 +6150,21 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba id device = ctx_dev->mtl_device; +#if 1 + // TODO: tmp hack + static void * p_base = (void *) 0x000000400ULL; + + ctx->all_data = p_base; + + p_base = (void *) ((uintptr_t) p_base + size_aligned); +#else ctx->all_data = ggml_metal_host_malloc(size_aligned); +#endif ctx->all_size = size_aligned; - ctx->owned = true; + + ctx->device = device; + ctx->queue = ctx_dev->mtl_queue; + ctx->n_buffers = 1; if (ctx->all_data != NULL) { @@ -6055,10 +6173,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba ctx->buffers[0].metal = nil; if (size_aligned > 0) { +#if 1 + ctx->buffers[0].metal = [device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate]; +#else ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; +#endif } } @@ -6092,13 
+6214,7 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty } static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - // TODO: not sure why, but without setting this to `false`, op offloading does not work correctly - // to reproduce, do the following: - // - // build with: cmake -DGGML_BLAS=OFF -DGGML_METAL=ON - // run: ./bin/llama-cli -m ggml-model-mxfp4.gguf -p "$(printf 'hello %.0s' {1..100})" --n-cpu-moe 10 - // - return false; + return true; GGML_UNUSED(buft); } @@ -6111,7 +6227,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_metal_buffer_type_is_host, + /* .is_host = */ NULL, }, /* .device = */ &g_ggml_backend_metal_device, /* .context = */ NULL, @@ -6130,7 +6246,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = { /* .iface = */ { /* .get_name = */ ggml_backend_metal_buffer_from_ptr_type_get_name, - /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer, + /* .alloc_buffer = */ NULL, /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -6199,10 +6315,6 @@ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, st struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; - ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; - - struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *)buf->context; - @autoreleasepool { id device = ctx_dev->mtl_device; @@ -6211,46 +6323,39 @@ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, st length:size options:MTLResourceStorageModeShared]; - size_t tensor_offset = (uintptr_t)tensor->data + offset; - - // find which Metal buffer contains this tensor - we will copy into that buffer - for (int i = 0; i < buf_ctx->n_buffers; i++) { - if (tensor_offset >= (uintptr_t) buf_ctx->buffers[i].data && - tensor_offset < (uintptr_t) buf_ctx->buffers[i].data + buf_ctx->buffers[i].size) { + size_t buf_dst_offset = 0; + id buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset); - const size_t buf_dst_offset = tensor_offset - (uintptr_t) buf_ctx->buffers[i].data; + if (buf_dst == nil) { + GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name); + } - id buf_dst = buf_ctx->buffers[i].metal; + buf_dst_offset += offset; - // queue the copy operation into the queue of the Metal context - // this will be queued at the end, after any currently ongoing GPU operations - id cmd_buf = [ctx->queue commandBuffer]; - [cmd_buf enqueue]; + // queue the copy operation into the queue of the Metal context + // this will be queued at the end, after any currently ongoing GPU operations + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf enqueue]; - id encoder = [cmd_buf blitCommandEncoder]; + id encoder = [cmd_buf blitCommandEncoder]; - [encoder copyFromBuffer:buf_src - sourceOffset:0 - toBuffer:buf_dst - destinationOffset:buf_dst_offset + [encoder copyFromBuffer:buf_src + sourceOffset:0 + toBuffer:buf_dst + destinationOffset:buf_dst_offset size:size]; - [encoder endEncoding]; - [cmd_buf commit]; - - // do not wait here for completion - //[cmd_buf waitUntilCompleted]; + [encoder endEncoding]; + [cmd_buf commit]; - // instead, remember a reference to the command buffer and wait for it later if needed - [ctx->cmd_bufs_ext addObject:cmd_buf]; - ctx->cmd_buf_ext_last = cmd_buf; + // do not wait here for completion + //[cmd_buf waitUntilCompleted]; - [cmd_buf retain]; - return; - } - } + // instead, remember a reference to the command buffer and wait for it later if needed + [ctx->cmd_bufs_ext addObject:cmd_buf]; + ctx->cmd_buf_ext_last = cmd_buf; - GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name); + [cmd_buf retain]; } } @@ -6258,10 +6363,6 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const st struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; - ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; - - struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *)buf->context; - @autoreleasepool { id device = ctx_dev->mtl_device; @@ -6270,41 +6371,39 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const st options:MTLResourceStorageModeShared deallocator:nil]; - const size_t tensor_offset = (uintptr_t)tensor->data + offset; + size_t buf_src_offset = 0; + id buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset); - // find which buffer contains this tensor data - for (int i = 0; i < buf_ctx->n_buffers; i++) { - if (tensor_offset >= (uintptr_t) buf_ctx->buffers[i].data && - tensor_offset < (uintptr_t) buf_ctx->buffers[i].data + buf_ctx->buffers[i].size) { + if (buf_src == nil) { + GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name); + } - const size_t buf_src_offset = tensor_offset - (uintptr_t) buf_ctx->buffers[i].data; + buf_src_offset += offset; - id buf_src = buf_ctx->buffers[i].metal; + // queue the copy operation into the queue of the Metal context + // this will be queued at the end, after any currently ongoing GPU operations + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf enqueue]; - id cmd_buf = [ctx->queue commandBuffer]; - [cmd_buf enqueue]; + id encoder = [cmd_buf blitCommandEncoder]; - id encoder = [cmd_buf blitCommandEncoder]; + [encoder copyFromBuffer:buf_src + sourceOffset:buf_src_offset + toBuffer:buf_dst + destinationOffset:0 + size:size]; - [encoder copyFromBuffer:buf_src - sourceOffset:buf_src_offset - toBuffer:buf_dst - destinationOffset:0 - size:size]; + [encoder endEncoding]; + [cmd_buf commit]; - [encoder endEncoding]; - [cmd_buf commit]; - //[cmd_buf waitUntilCompleted]; + // do not wait here for completion + //[cmd_buf waitUntilCompleted]; - [ctx->cmd_bufs_ext addObject:cmd_buf]; - ctx->cmd_buf_ext_last = cmd_buf; + // instead, remember a reference to the command buffer and wait for it later if needed + [ctx->cmd_bufs_ext addObject:cmd_buf]; + ctx->cmd_buf_ext_last = cmd_buf; - [cmd_buf retain]; - return; - } - } - - GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name); + [cmd_buf retain]; } } @@ -6513,8 +6612,8 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct g props->type = ggml_backend_metal_device_get_type(dev); ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = (struct ggml_backend_dev_caps) { - /* .async = */ false, - /* .host_buffer = */ false, + /* .async = */ true, + /* .host_buffer = */ true, /* .buffer_from_host_ptr = */ true, /* .events = */ false, }; @@ -6554,7 +6653,7 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back ctx->all_data = ptr; ctx->all_size = size; - ctx->owned = false; + ctx->n_buffers = 0; const size_t size_page = sysconf(_SC_PAGESIZE); @@ -6577,6 +6676,9 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back id device = ctx_dev->mtl_device; + ctx->device = device; + ctx->queue = ctx_dev->mtl_queue; + // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { ctx->buffers[ctx->n_buffers].data = ptr; From 7fc2b3d5038544019a32ccc2a55d317dd1ff9adf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 Sep 2025 14:28:34 +0300 Subject: [PATCH 6/9] metal : restore .alloc_buffer for buffer_from_ptr_type ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 9dab47a987a81..fbaeedc463ff3 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -6246,7 +6246,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = { /* .iface = */ { /* .get_name = */ ggml_backend_metal_buffer_from_ptr_type_get_name, - /* .alloc_buffer = */ NULL, + /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes From f288225d42fbbbc513ff94a777f79bc60938d51e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 Sep 2025 14:29:54 +0300 Subject: [PATCH 7/9] metal : remove broken implementation of GGML_OP_SET ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 76 ---------------------------- ggml/src/ggml-metal/ggml-metal.metal | 32 ------------ 2 files changed, 108 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index fbaeedc463ff3..c6027489a422d 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -477,8 +477,6 @@ - (void) dealloc { GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, - GGML_METAL_KERNEL_TYPE_SET_I32, - GGML_METAL_KERNEL_TYPE_SET_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F16, GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, @@ -1411,8 +1409,6 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_F32, set_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_I32, set_i32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, cpy_f32_bf16, use_bfloat); @@ -2012,16 +2008,6 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex return false; }; } - case GGML_OP_SET: - { - switch (op->src[0]->type) { - case GGML_TYPE_F32: - case GGML_TYPE_I32: - return true; - default: - return false; - }; - } case GGML_OP_DIAG_MASK_INF: case GGML_OP_GET_ROWS: { @@ -5597,68 +5583,6 @@ static int ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nrptg - 1)/nrptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, nrptg, 1)]; } break; - case GGML_OP_SET: - { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); - - // src0 and dst as viewed during set - const size_t dst_nb0 = ggml_element_size(src0); - - const size_t dst_nb1 = ((int32_t *) dst->op_params)[0]; - const size_t dst_nb2 = ((int32_t *) dst->op_params)[1]; - const size_t dst_nb3 = ((int32_t *) dst->op_params)[2]; - const size_t offset = ((int32_t *) dst->op_params)[3]; - const bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - - if (!inplace) { - memcpy(((char *) dst->data), ((char *) src0->data), 
ggml_nbytes(dst)); - } - - const int im0 = (ne10 == 0 ? 0 : ne10-1); - const int im1 = (ne11 == 0 ? 0 : ne11-1); - const int im2 = (ne12 == 0 ? 0 : ne12-1); - const int im3 = (ne13 == 0 ? 0 : ne13-1); - - GGML_ASSERT(offset + im0*dst_nb0 + im1*dst_nb1 + im2*dst_nb2 + im3*dst_nb3 <= ggml_nbytes(dst)); - - id pipeline = nil; - - switch (src0t) { - case GGML_TYPE_F32: - GGML_ASSERT(nb10 == sizeof(float)); - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_F32].pipeline; break; - case GGML_TYPE_I32: - GGML_ASSERT(nb10 == sizeof(int32_t)); - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_I32].pipeline; break; - default: GGML_ABORT("fatal error"); - } - - ggml_metal_kargs_set args = { - /*.ne10 =*/ ne10, - /*.ne11 =*/ ne11, - /*.ne12 =*/ ne12, - /*.nb10 =*/ nb10, - /*.nb11 =*/ nb11, - /*.nb12 =*/ nb12, - /*.nb13 =*/ nb13, - /*.nb1 =*/ dst_nb1, - /*.nb2 =*/ dst_nb2, - /*.nb3 =*/ dst_nb3, - /*.offs =*/ offset, - /*.inplace =*/ inplace, - }; - - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne10); - - [encoder setComputePipelineState:pipeline]; - [encoder setBytes:&args length:sizeof(args) atIndex:0]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; case GGML_OP_POOL_2D: { GGML_ASSERT(ggml_is_contiguous(src0)); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 77be3c5c9d8be..157d0cc6d0b25 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -5571,38 +5571,6 @@ kernel void kernel_flash_attn_ext_vec_reduce( #undef DV } -template -kernel void kernel_set( - constant ggml_metal_kargs_set & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int i13 = tgpig[2]; - const int i12 = tgpig[1]; - const int i11 = tgpig[0]; - - const int64_t n = i13*args.ne12*args.ne11*args.ne10 + i12*args.ne11*args.ne10 + i11*args.ne10; - - const int64_t i3 = n / (args.ne12*args.ne11*args.ne10); - const int64_t i2 = (n - i3*args.ne12*args.ne11*args.ne10) / (args.ne11*args.ne10); - const int64_t i1 = (n - i3*args.ne12*args.ne11*args.ne10 - i2*args.ne11*args.ne10) / args.ne10; - - device T * dst_data = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + args.offs); - - for (int64_t i10 = tpitg.x; i10 < args.ne10; i10 += ntg.x) { - device const T * src = (device T *) (src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10); - dst_data[i10] = (T) src[0]; - } -} - -typedef decltype(kernel_set) kernel_set_t; - -template [[host_name("kernel_set_f32")]] kernel kernel_set_t kernel_set; -template [[host_name("kernel_set_i32")]] kernel kernel_set_t kernel_set; - template kernel void kernel_cpy( constant ggml_metal_kargs_cpy & args, From 0926cb492dba83ca70150c6189d42b8fc97eae49 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 Sep 2025 14:58:59 +0300 Subject: [PATCH 8/9] metal : clean-up loose ends, ready for tests ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 64 +++++++++++++++++--------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index c6027489a422d..4a67046c97743 100644 --- 
a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -44,10 +44,9 @@ // note: assumes single GPU device - the default one // TODO: support multiple GPU devices static struct ggml_backend_metal_device_context { - id mtl_device; - int mtl_device_ref_count; - id mtl_queue; - id mtl_library; + id mtl_device; + int mtl_device_ref_count; + id mtl_library; NSLock * mtl_lock; @@ -69,7 +68,6 @@ } g_ggml_ctx_dev_main = { /*.mtl_device =*/ nil, /*.mtl_device_ref_count =*/ 0, - /*.mtl_queue =*/ nil, /*.mtl_library =*/ nil, /*.mtl_lock =*/ nil, /*.has_simdgroup_reduction =*/ false, @@ -96,9 +94,6 @@ ctx->mtl_device = MTLCreateSystemDefaultDevice(); if (ctx->mtl_device) { - ctx->mtl_queue = [ctx->mtl_device newCommandQueue]; - [ctx->mtl_queue retain]; - ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; @@ -166,11 +161,6 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte ctx->mtl_library = nil; } - if (ctx->mtl_queue) { - [ctx->mtl_queue release]; - ctx->mtl_queue = nil; - } - if (ctx->mtl_device) { [ctx->mtl_device release]; ctx->mtl_device = nil; @@ -1013,7 +1003,7 @@ @implementation GGMLMetalClass GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); ctx->device = device; - ctx->queue = ctx_dev->mtl_queue; + ctx->queue = [device newCommandQueue]; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -1719,7 +1709,6 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { id rset; id device; - id queue; }; // rset init @@ -5882,7 +5871,8 @@ static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id cmd_buf = [ctx->queue commandBuffer]; + id queue = [ctx->device newCommandQueue]; + id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -5899,6 +5889,10 @@ static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer [cmd_buf commit]; [cmd_buf waitUntilCompleted]; + + // note: not sure why this release is necessary as we are inside an autoreleasepool block + // but without it, we get "Context leak detected" warnings + [queue release]; } #else memset((char *)tensor->data + offset, value, size); @@ -5912,7 +5906,8 @@ static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, s struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id cmd_buf = [ctx->queue commandBuffer]; + id queue = [ctx->device newCommandQueue]; + id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -5936,6 +5931,8 @@ static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, s [cmd_buf commit]; [cmd_buf waitUntilCompleted]; + + [queue release]; } #else memcpy((char *)tensor->data + offset, data, size); @@ -5949,7 +5946,8 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id cmd_buf = [ctx->queue commandBuffer]; + id queue = [ctx->device newCommandQueue]; + id cmd_buf = [queue commandBuffer]; id encoder = 
[cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -5973,6 +5971,8 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c [cmd_buf commit]; [cmd_buf waitUntilCompleted]; + + [queue release]; } #else memcpy(data, (const char *)tensor->data + offset, size); @@ -5993,10 +5993,12 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c } static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +#if 1 struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id cmd_buf = [ctx->queue commandBuffer]; + id queue = [ctx->device newCommandQueue]; + id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -6008,9 +6010,12 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_ [cmd_buf commit]; [cmd_buf waitUntilCompleted]; - } - //memset(ctx->all_data, value, ctx->all_size); + [queue release]; + } +#else + memset(ctx->all_data, value, ctx->all_size); +#endif } static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { @@ -6075,30 +6080,26 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba id device = ctx_dev->mtl_device; #if 1 - // TODO: tmp hack - static void * p_base = (void *) 0x000000400ULL; - - ctx->all_data = p_base; - - p_base = (void *) ((uintptr_t) p_base + size_aligned); + // we'll populate this after creating the Metal buffer below + ctx->all_data = (void *) 0x000000400ULL; #else ctx->all_data = ggml_metal_host_malloc(size_aligned); #endif ctx->all_size = size_aligned; ctx->device = device; - ctx->queue = ctx_dev->mtl_queue; ctx->n_buffers = 1; if (ctx->all_data != NULL) { - ctx->buffers[0].data = ctx->all_data; ctx->buffers[0].size = size; ctx->buffers[0].metal = nil; if (size_aligned > 0) { #if 1 ctx->buffers[0].metal = [device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate]; + + ctx->all_data = (void *) (ctx->buffers[0].metal.gpuAddress); #else ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data length:size_aligned @@ -6106,6 +6107,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba deallocator:nil]; #endif } + + ctx->buffers[0].data = ctx->all_data; } if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) { @@ -6167,6 +6170,8 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { } static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) { + // note: not obvious, but this buffer type still needs to implement .alloc_buffer: + // https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2333177099 static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = { /* .iface = */ { /* .get_name = */ ggml_backend_metal_buffer_from_ptr_type_get_name, @@ -6601,7 +6606,6 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back id device = ctx_dev->mtl_device; ctx->device = device; - ctx->queue = ctx_dev->mtl_queue; // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { From 3f62ee8bee97deae630c2e0b58b550bcf7fa69cf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 Sep 2025 17:06:46 +0300 Subject: [PATCH 9/9] metal : back to a single queue per device ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 51 ++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 23 
deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 4a67046c97743..5cd8893eede63 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -48,6 +48,8 @@ int mtl_device_ref_count; id mtl_library; + id mtl_queue; + NSLock * mtl_lock; bool has_simdgroup_reduction; @@ -69,6 +71,7 @@ /*.mtl_device =*/ nil, /*.mtl_device_ref_count =*/ 0, /*.mtl_library =*/ nil, + /*.mtl_queue =*/ nil, /*.mtl_lock =*/ nil, /*.has_simdgroup_reduction =*/ false, /*.has_simdgroup_mm =*/ false, @@ -94,6 +97,8 @@ ctx->mtl_device = MTLCreateSystemDefaultDevice(); if (ctx->mtl_device) { + ctx->mtl_queue = [ctx->mtl_device newCommandQueue]; + ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; @@ -161,6 +166,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte ctx->mtl_library = nil; } + if (ctx->mtl_queue) { + [ctx->mtl_queue release]; + ctx->mtl_queue = nil; + } + if (ctx->mtl_device) { [ctx->mtl_device release]; ctx->mtl_device = nil; @@ -1003,7 +1013,7 @@ @implementation GGMLMetalClass GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); ctx->device = device; - ctx->queue = [device newCommandQueue]; + ctx->queue = ctx_dev->mtl_queue; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -1670,8 +1680,6 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { Block_release(ctx->encode_async); - [ctx->queue release]; - for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { if (ctx->cmd_bufs[i].obj) { [ctx->cmd_bufs[i].obj release]; @@ -1709,6 +1717,7 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { id rset; id device; + id queue; }; // rset init @@ -5776,6 +5785,8 @@ static enum ggml_status ggml_metal_graph_compute( dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async); + [ctx->cmd_buf_last waitUntilScheduled]; + // for debugging: block until graph is computed //[ctx->cmd_buf_last waitUntilCompleted]; @@ -5871,7 +5882,7 @@ static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id queue = [ctx->device newCommandQueue]; + id queue = ctx->queue; id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -5888,11 +5899,7 @@ static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer [encoder endEncoding]; [cmd_buf commit]; - [cmd_buf waitUntilCompleted]; - - // note: not sure why this release is necessary as we are inside an autoreleasepool block - // but without it, we get "Context leak detected" warnings - [queue release]; + [cmd_buf waitUntilScheduled]; } #else memset((char *)tensor->data + offset, value, size); @@ -5906,15 +5913,16 @@ static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, s struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id queue = [ctx->device newCommandQueue]; + id queue = ctx->queue; id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; // TODO: is this an extra copy? can we avoid it? 
- id buf_src = [ctx->device newBufferWithBytes:data - length:size - options:MTLResourceStorageModeShared]; + id buf_src = [ctx->device newBufferWithBytesNoCopy:data + length:size + options:MTLResourceStorageModeShared + deallocator:nil]; size_t buf_dst_offset = 0; id buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset); @@ -5929,10 +5937,9 @@ static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, s [encoder endEncoding]; + // note: no need to wait for completion here [cmd_buf commit]; - [cmd_buf waitUntilCompleted]; - - [queue release]; + [cmd_buf waitUntilScheduled]; } #else memcpy((char *)tensor->data + offset, data, size); @@ -5946,7 +5953,7 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id queue = [ctx->device newCommandQueue]; + id queue = ctx->queue; id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -5971,8 +5978,6 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c [cmd_buf commit]; [cmd_buf waitUntilCompleted]; - - [queue release]; } #else memcpy(data, (const char *)tensor->data + offset, size); @@ -5997,7 +6002,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_ struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id queue = [ctx->device newCommandQueue]; + id queue = ctx->queue; id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -6009,9 +6014,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_ [encoder endEncoding]; [cmd_buf commit]; - [cmd_buf waitUntilCompleted]; - - [queue release]; + [cmd_buf waitUntilScheduled]; } #else memset(ctx->all_data, value, ctx->all_size); @@ -6088,6 +6091,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba ctx->all_size = size_aligned; ctx->device = device; + ctx->queue = ctx_dev->mtl_queue; ctx->n_buffers = 1; @@ -6606,6 +6610,7 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back id device = ctx_dev->mtl_device; ctx->device = device; + ctx->queue = ctx_dev->mtl_queue; // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) {