From 97b96c1ad308eeab4a0bdfd98a952e588799b98b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 6 Sep 2025 11:53:51 +0300 Subject: [PATCH 1/9] metal : make the backend async ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 310 ++++++++++++++++++++++++++----- 1 file changed, 259 insertions(+), 51 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index e76fb712631e1..9626dd3bd5698 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -803,6 +803,12 @@ static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { // n_cb command buffers + 1 used by the main thread struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; + // extra command buffers for things like getting, setting and copying tensors + NSMutableArray * cmd_bufs_ext; + + id cmd_buf_last; + id cmd_buf_ext_last; + // abort ggml_metal_graph_compute if callback returns true ggml_abort_callback abort_callback; void * abort_callback_data; @@ -1073,6 +1079,11 @@ @implementation GGMLMetalClass ctx->cmd_bufs[i].mem_pool->device = device; } + ctx->cmd_bufs_ext = [[NSMutableArray alloc] init]; + + ctx->cmd_buf_last = nil; + ctx->cmd_buf_ext_last = nil; + #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, device.recommendedMaxWorkingSetSize / 1e6); @@ -1666,11 +1677,15 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { [ctx->queue release]; for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { - // ctx->cmd_bufs[i].obj is auto released + if (ctx->cmd_bufs[i].obj) { + [ctx->cmd_bufs[i].obj release]; + } ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); } + [ctx->cmd_bufs_ext release]; + dispatch_release(ctx->d_queue); free(ctx); @@ -5778,81 +5793,110 @@ static enum ggml_status ggml_metal_graph_compute( } } + // wait for any previous processing + if (ctx->cmd_buf_last) { + [ctx->cmd_buf_last waitUntilCompleted]; + ctx->cmd_buf_last = nil; + } + // the main thread commits the first few commands immediately // cmd_buf[n_cb] { - id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf retain]; + + if (ctx->cmd_bufs[n_cb].obj) { + [ctx->cmd_bufs[n_cb].obj release]; + } ctx->cmd_bufs[n_cb].obj = cmd_buf; [cmd_buf enqueue]; + ctx->cmd_buf_last = cmd_buf; + ctx->encode_async(n_cb); } // prepare the rest of the command buffers asynchronously // cmd_buf[0.. 
n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf retain]; + + if (ctx->cmd_bufs[cb_idx].obj) { + [ctx->cmd_bufs[cb_idx].obj release]; + } ctx->cmd_bufs[cb_idx].obj = cmd_buf; // always enqueue the first two command buffers // enqueue all of the command buffers if we don't need to abort if (cb_idx < 2 || ctx->abort_callback == NULL) { [cmd_buf enqueue]; + ctx->cmd_buf_last = cmd_buf; } } dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async); - // wait for completion and check status of each command buffer - // needed to detect if the device ran out-of-memory for example (#1881) - { - id cmd_buf = ctx->cmd_bufs[n_cb].obj; - [cmd_buf waitUntilCompleted]; - - MTLCommandBufferStatus status = [cmd_buf status]; - if (status != MTLCommandBufferStatusCompleted) { - GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); - if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); - } + // for debugging: block until graph is computed + //[ctx->cmd_buf_last waitUntilCompleted]; - return GGML_STATUS_FAILED; + // enter here only when capturing in order to wait for all computation to finish + // otherwise, we leave the graph to compute asynchronously + if (!should_capture && ctx->capture_started) { + // wait for completion and check status of each command buffer + // needed to detect if the device ran out-of-memory for example (#1881) + { + id cmd_buf = ctx->cmd_bufs[n_cb].obj; + [cmd_buf waitUntilCompleted]; + + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + } + + return GGML_STATUS_FAILED; + } } - } - for (int i = 0; i < n_cb; ++i) { - id cmd_buf = ctx->cmd_bufs[i].obj; - [cmd_buf waitUntilCompleted]; + for (int i = 0; i < n_cb; ++i) { + id cmd_buf = ctx->cmd_bufs[i].obj; + [cmd_buf waitUntilCompleted]; + + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + } - MTLCommandBufferStatus status = [cmd_buf status]; - if (status != MTLCommandBufferStatusCompleted) { - GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); - if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + return GGML_STATUS_FAILED; } - return GGML_STATUS_FAILED; - } + id next_buffer = (i + 1 < n_cb ? ctx->cmd_bufs[i + 1].obj : nil); + if (!next_buffer) { + continue; + } - id next_buffer = (i + 1 < n_cb ? 
ctx->cmd_bufs[i + 1].obj : nil); - if (!next_buffer) { - continue; - } + const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); + if (next_queued) { + continue; + } - const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); - if (next_queued) { - continue; - } + if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { + GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i); + return GGML_STATUS_ABORTED; + } - if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { - GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i); - return GGML_STATUS_ABORTED; + [next_buffer commit]; } - [next_buffer commit]; - } + if (ctx->cmd_buf_last) { + [ctx->cmd_buf_last waitUntilCompleted]; + ctx->cmd_buf_last = nil; + } - if (!should_capture && ctx->capture_started) { [ctx->capture_scope endScope]; [[MTLCaptureManager sharedCaptureManager] stopCapture]; } @@ -6034,7 +6078,7 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty } static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - return true; + return false; GGML_UNUSED(buft); } @@ -6184,6 +6228,154 @@ static void ggml_backend_metal_free(ggml_backend_t backend) { free(backend); } +static void ggml_backend_metal_synchronize(ggml_backend_t backend) { + struct ggml_backend_metal_context * ctx = backend->context; + + if (ctx->cmd_buf_last) { + [ctx->cmd_buf_last waitUntilCompleted]; + ctx->cmd_buf_last = nil; + } + + if (ctx->cmd_buf_ext_last) { + [ctx->cmd_buf_ext_last waitUntilCompleted]; + ctx->cmd_buf_ext_last = nil; + } + + for (size_t i = 0; i < ctx->cmd_bufs_ext.count; ++i) { + id cmd_buf = ctx->cmd_bufs_ext[i]; + + // check status and assert that the command buffer completed successfully + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_ERROR("%s: error: command buffer %d failed with status %d\n", __func__, (int) i, (int) status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + } + GGML_ABORT("fatal error"); + } + + //printf("releasing buffer %d\n", (int) i); + [cmd_buf release]; + } + [ctx->cmd_bufs_ext removeAllObjects]; +} + +static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + struct ggml_backend_metal_context * ctx = backend->context; + struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; + + ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; + + struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *)buf->context; + + @autoreleasepool { + id device = ctx_dev->mtl_device; + + id buf_src = [device newBufferWithBytes:data + length:size + options:MTLResourceStorageModeShared]; + + size_t tensor_offset = (uintptr_t)tensor->data + offset; + + // find which buffer contains this tensor + for (int i = 0; i < buf_ctx->n_buffers; i++) { + if (tensor_offset >= (uintptr_t) buf_ctx->buffers[i].data && + tensor_offset < (uintptr_t) buf_ctx->buffers[i].data + buf_ctx->buffers[i].size) { + + const size_t buf_dst_offset = tensor_offset - (uintptr_t) buf_ctx->buffers[i].data; + + id buf_dst = buf_ctx->buffers[i].metal; + + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf enqueue]; + + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder copyFromBuffer:buf_src + sourceOffset:0 + toBuffer:buf_dst + destinationOffset:buf_dst_offset + size:size]; + + [encoder endEncoding]; + [cmd_buf commit]; + //[cmd_buf waitUntilCompleted]; + + [ctx->cmd_bufs_ext addObject:cmd_buf]; + ctx->cmd_buf_ext_last = cmd_buf; + + [cmd_buf retain]; + return; + } + } + + GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name); + } +} + +static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + struct ggml_backend_metal_context * ctx = backend->context; + struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; + + ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + + struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *)buf->context; + + @autoreleasepool { + id device = ctx_dev->mtl_device; + + id buf_dst = [device newBufferWithBytesNoCopy:data + length:size + options:MTLResourceStorageModeShared + deallocator:nil]; + + const size_t tensor_offset = (uintptr_t)tensor->data + offset; + + // find which buffer contains this tensor data + for (int i = 0; i < buf_ctx->n_buffers; i++) { + if (tensor_offset >= (uintptr_t) buf_ctx->buffers[i].data && + tensor_offset < (uintptr_t) buf_ctx->buffers[i].data + buf_ctx->buffers[i].size) { + + const size_t buf_src_offset = tensor_offset - (uintptr_t) buf_ctx->buffers[i].data; + + id buf_src = buf_ctx->buffers[i].metal; + + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf enqueue]; + + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder copyFromBuffer:buf_src + sourceOffset:buf_src_offset + toBuffer:buf_dst + destinationOffset:0 + size:size]; + + [encoder endEncoding]; + [cmd_buf commit]; + //[cmd_buf waitUntilCompleted]; + + [ctx->cmd_bufs_ext addObject:cmd_buf]; + ctx->cmd_buf_ext_last = cmd_buf; + + [cmd_buf retain]; + return; + } + } + + GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name); + } +} + +static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) { + return false; + + GGML_UNUSED(backend_src); + GGML_UNUSED(backend_dst); + GGML_UNUSED(src); + GGML_UNUSED(dst); +} + static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return ggml_metal_graph_compute(backend, cgraph); } @@ -6214,7 +6406,10 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; - id 
cmd_buf = ctx->cmd_bufs[cb_idx].obj; + id cmd_buf = ctx->cmd_bufs[cb_idx].obj; + struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; + + ggml_metal_mem_pool_reset(mem_pool); id encoder = [cmd_buf computeCommandEncoder]; @@ -6228,9 +6423,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const bool should_capture = ctx->capture_next_compute; - struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; - ggml_metal_mem_pool_reset(mem_pool); - for (int idx = node_start; idx < node_end;) { if (should_capture) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; @@ -6264,15 +6456,19 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { static struct ggml_backend_i ggml_backend_metal_i = { /* .get_name = */ ggml_backend_metal_name, /* .free = */ ggml_backend_metal_free, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, + /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async, + /* .cpy_tensor_async = */ ggml_backend_metal_cpy_tensor_async, // only needed for multi-GPU setups + /* .synchronize = */ ggml_backend_metal_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, + + // the events API is needed only for multi-GPU setups, so likely no need to implement it for Metal + // in any case, these docs seem relevant if we ever decide to implement it: + // https://developer.apple.com/documentation/metal/mtlcommandbuffer#Synchronizing-Passes-with-Events /* .event_record = */ NULL, /* .event_wait = */ NULL, /* .optimize_graph = */ NULL, @@ -6514,8 +6710,20 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml GGML_UNUSED(dev); } +static int64_t get_op_batch_size(const struct ggml_tensor * op) { + switch (op->op) { + case GGML_OP_MUL_MAT_ID: + return op->ne[1]; + default: + return ggml_nrows(op); + } +} + static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - return false; + const int min_batch_size = 32; + + return get_op_batch_size(op) >= min_batch_size; + //return false; GGML_UNUSED(dev); GGML_UNUSED(op); From c5637cf39cdf3be7ce0bfb98d85a011724069079 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 8 Sep 2025 19:26:18 +0300 Subject: [PATCH 2/9] cont : add comments, extend op offload, clean up ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 84 ++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 9626dd3bd5698..958b14b180fa6 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -1684,6 +1684,7 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); } + [ctx->cmd_bufs_ext removeAllObjects]; [ctx->cmd_bufs_ext release]; dispatch_release(ctx->d_queue); @@ -5793,30 +5794,40 @@ static enum ggml_status ggml_metal_graph_compute( } } - // wait for any previous processing - if (ctx->cmd_buf_last) { - [ctx->cmd_buf_last waitUntilCompleted]; - ctx->cmd_buf_last = nil; - } - // the main thread commits the first few commands immediately // 
cmd_buf[n_cb] { - id cmd_buf = [ctx->queue commandBuffer]; - [cmd_buf retain]; - + // first wait for any previous command buffer to be completed + // note: this checks only that the first part of the previous graph has been computed + // the rest of the graph might still be computing, but it is OK to start queuing the beginning of the + // new graph if (ctx->cmd_bufs[n_cb].obj) { + [ctx->cmd_bufs[n_cb].obj waitUntilCompleted]; [ctx->cmd_bufs[n_cb].obj release]; } + + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf retain]; + ctx->cmd_bufs[n_cb].obj = cmd_buf; [cmd_buf enqueue]; - ctx->cmd_buf_last = cmd_buf; ctx->encode_async(n_cb); } - // prepare the rest of the command buffers asynchronously + // here we guarantee the full previous graph has finished computing + // but note that we have already enqueued the first part of the new graph so it can start processing, while + // we continue to encode the rest of the graph + if (ctx->cmd_buf_last) { + [ctx->cmd_buf_last waitUntilCompleted]; + ctx->cmd_buf_last = nil; + } + + // remember the command buffer for the next iteration + ctx->cmd_buf_last = ctx->cmd_bufs[n_cb].obj; + + // prepare the rest of the command buffers asynchronously (optional) // cmd_buf[0.. n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { id cmd_buf = [ctx->queue commandBuffer]; @@ -5831,6 +5842,9 @@ static enum ggml_status ggml_metal_graph_compute( // enqueue all of the command buffers if we don't need to abort if (cb_idx < 2 || ctx->abort_callback == NULL) { [cmd_buf enqueue]; + + // update the pointer to the last queued command buffer + // this is needed to implement synchronize() ctx->cmd_buf_last = cmd_buf; } } @@ -6078,6 +6092,12 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty } static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + // TODO: not sure why, but without setting this to `false`, op offloading does not work correctly + // to reproduce, do the following: + // + // build with: cmake -DGGML_BLAS=OFF -DGGML_METAL=ON + // run: ./bin/llama-cli -m ggml-model-mxfp4.gguf -p "$(printf 'hello %.0s' {1..100})" --n-cpu-moe 10 + // return false; GGML_UNUSED(buft); @@ -6231,33 +6251,37 @@ static void ggml_backend_metal_free(ggml_backend_t backend) { static void ggml_backend_metal_synchronize(ggml_backend_t backend) { struct ggml_backend_metal_context * ctx = backend->context; + // wait for the computation of the graph to finish if (ctx->cmd_buf_last) { [ctx->cmd_buf_last waitUntilCompleted]; ctx->cmd_buf_last = nil; } + // wait for any pending async get/set operations if (ctx->cmd_buf_ext_last) { [ctx->cmd_buf_ext_last waitUntilCompleted]; ctx->cmd_buf_ext_last = nil; } - for (size_t i = 0; i < ctx->cmd_bufs_ext.count; ++i) { - id cmd_buf = ctx->cmd_bufs_ext[i]; + // release any completed command buffers + if (ctx->cmd_bufs_ext.count > 0) { + for (size_t i = 0; i < ctx->cmd_bufs_ext.count; ++i) { + id cmd_buf = ctx->cmd_bufs_ext[i]; - // check status and assert that the command buffer completed successfully - MTLCommandBufferStatus status = [cmd_buf status]; - if (status != MTLCommandBufferStatusCompleted) { - GGML_LOG_ERROR("%s: error: command buffer %d failed with status %d\n", __func__, (int) i, (int) status); - if (status == MTLCommandBufferStatusError) { - GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_ERROR("%s: error: command buffer %d failed
with status %d\n", __func__, (int) i, (int) status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + } + GGML_ABORT("fatal error"); } - GGML_ABORT("fatal error"); + + [cmd_buf release]; } - //printf("releasing buffer %d\n", (int) i); - [cmd_buf release]; + [ctx->cmd_bufs_ext removeAllObjects]; } - [ctx->cmd_bufs_ext removeAllObjects]; } static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -6271,13 +6295,14 @@ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, st @autoreleasepool { id device = ctx_dev->mtl_device; + // wrap the source data into a Metal buffer id buf_src = [device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared]; size_t tensor_offset = (uintptr_t)tensor->data + offset; - // find which buffer contains this tensor + // find which Metal buffer contains this tensor - we will copy into that buffer for (int i = 0; i < buf_ctx->n_buffers; i++) { if (tensor_offset >= (uintptr_t) buf_ctx->buffers[i].data && tensor_offset < (uintptr_t) buf_ctx->buffers[i].data + buf_ctx->buffers[i].size) { @@ -6286,6 +6311,8 @@ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, st id buf_dst = buf_ctx->buffers[i].metal; + // queue the copy operation into the queue of the Metal context + // this will be queued at the end, after any currently ongoing GPU operations id cmd_buf = [ctx->queue commandBuffer]; [cmd_buf enqueue]; @@ -6299,8 +6326,11 @@ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, st [encoder endEncoding]; [cmd_buf commit]; + + // do not wait here for completion //[cmd_buf waitUntilCompleted]; + // instead, remember a reference to the command buffer and wait for it later if needed [ctx->cmd_bufs_ext addObject:cmd_buf]; ctx->cmd_buf_ext_last = cmd_buf; @@ -6712,6 +6742,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml static int64_t get_op_batch_size(const struct ggml_tensor * op) { switch (op->op) { + case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: return op->ne[1]; default: @@ -6722,8 +6753,9 @@ static int64_t get_op_batch_size(const struct ggml_tensor * op) { static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { const int min_batch_size = 32; - return get_op_batch_size(op) >= min_batch_size; - //return false; + return (op->op == GGML_OP_MUL_MAT || + op->op == GGML_OP_MUL_MAT_ID) && + get_op_batch_size(op) >= min_batch_size; GGML_UNUSED(dev); GGML_UNUSED(op); From bdff7729b1716284ea3234a87d52930cf2fdce80 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 8 Sep 2025 21:01:25 +0300 Subject: [PATCH 3/9] metal : fix batch size for MUL_MAT_ID --- ggml/src/ggml-metal/ggml-metal.m | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 958b14b180fa6..e6e4557f2cde4 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -6743,8 +6743,9 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml static int64_t get_op_batch_size(const struct ggml_tensor * op) { switch (op->op) { case GGML_OP_MUL_MAT: - case GGML_OP_MUL_MAT_ID: return op->ne[1]; + case GGML_OP_MUL_MAT_ID: + return op->ne[2]; default: return ggml_nrows(op); } From d91ba85d04928ad13954d7d29b07295f1c65b7d4 Mon Sep 17 00:00:00 2001 
From: Georgi Gerganov Date: Tue, 9 Sep 2025 09:28:41 +0300 Subject: [PATCH 4/9] metal : remove deprecated ggml_backend_metal_buffer_from_ptr --- ggml/include/ggml-metal.h | 4 -- ggml/src/ggml-metal/ggml-metal.m | 89 -------------------------------- 2 files changed, 93 deletions(-) diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h index a610694423483..abeea094d0820 100644 --- a/ggml/include/ggml-metal.h +++ b/ggml/include/ggml-metal.h @@ -43,10 +43,6 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void); GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend); -GGML_DEPRECATED( - GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size), - "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713"); - GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index e6e4557f2cde4..75b26820f7c8c 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -6143,95 +6143,6 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) return &ggml_backend_buffer_from_ptr_type_metal; } -// TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr -ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { - struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context)); - - ctx->all_data = data; - ctx->all_size = size; - ctx->owned = false; - ctx->n_buffers = 0; - - const size_t size_page = sysconf(_SC_PAGESIZE); - - // page-align the data ptr - { - const uintptr_t offs = (uintptr_t) data % size_page; - data = (void *) ((char *) data - offs); - size += offs; - } - - size_t size_aligned = size; - if ((size_aligned % size_page) != 0) { - size_aligned += (size_page - (size_aligned % size_page)); - } - - struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main; - - GGML_ASSERT(ctx_dev->mtl_device != nil); - - id device = ctx_dev->mtl_device; - - // the buffer fits into the max buffer size allowed by the device - if (size_aligned <= device.maxBufferLength) { - ctx->buffers[ctx->n_buffers].data = data; - ctx->buffers[ctx->n_buffers].size = size; - ctx->buffers[ctx->n_buffers].metal = nil; - - if (size_aligned > 0) { - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; - - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); - return false; - } - } - - ggml_backend_metal_log_allocated_size(device, size_aligned); - - ++ctx->n_buffers; - } else { - // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into - // one of the views - const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case - const size_t size_step = device.maxBufferLength - size_ovlp; - const size_t size_view = device.maxBufferLength; - - for (size_t i = 0; i < size; i += size_step) { - const size_t size_step_aligned = (i + size_view <= size) ? 
size_view : (size_aligned - i); - - ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); - ctx->buffers[ctx->n_buffers].size = size_step_aligned; - ctx->buffers[ctx->n_buffers].metal = nil; - - if (size_step_aligned > 0) { - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; - - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); - return false; - } - } - - ggml_backend_metal_log_allocated_size(device, size_step_aligned); - - if (i + size_step < size) { - GGML_LOG_INFO("\n"); - } - - ++ctx->n_buffers; - } - } - - if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) { - GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__); - free(ctx); - return NULL; - } - - return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size); -} - // backend static const char * ggml_backend_metal_name(ggml_backend_t backend) { From 85aaf52b7ebf4e1e4cbe6ba33793a222b89356c7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 Sep 2025 11:45:09 +0300 Subject: [PATCH 5/9] metal : create only metal buffers, no wrapping of host memory ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 286 +++++++++++++++++++++---------- 1 file changed, 194 insertions(+), 92 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 75b26820f7c8c..9dab47a987a81 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -44,9 +44,10 @@ // note: assumes single GPU device - the default one // TODO: support multiple GPU devices static struct ggml_backend_metal_device_context { - id mtl_device; - int mtl_device_ref_count; - id mtl_library; + id mtl_device; + int mtl_device_ref_count; + id mtl_queue; + id mtl_library; NSLock * mtl_lock; @@ -68,6 +69,7 @@ } g_ggml_ctx_dev_main = { /*.mtl_device =*/ nil, /*.mtl_device_ref_count =*/ 0, + /*.mtl_queue =*/ nil, /*.mtl_library =*/ nil, /*.mtl_lock =*/ nil, /*.has_simdgroup_reduction =*/ false, @@ -94,6 +96,9 @@ ctx->mtl_device = MTLCreateSystemDefaultDevice(); if (ctx->mtl_device) { + ctx->mtl_queue = [ctx->mtl_device newCommandQueue]; + [ctx->mtl_queue retain]; + ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; @@ -161,6 +166,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte ctx->mtl_library = nil; } + if (ctx->mtl_queue) { + [ctx->mtl_queue release]; + ctx->mtl_queue = nil; + } + if (ctx->mtl_device) { [ctx->mtl_device release]; ctx->mtl_device = nil; @@ -1005,7 +1015,7 @@ @implementation GGMLMetalClass GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); ctx->device = device; - ctx->queue = [device newCommandQueue]; + ctx->queue = ctx_dev->mtl_queue; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -1704,7 +1714,6 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { struct ggml_backend_metal_buffer_context { void * all_data; size_t all_size; - bool owned; // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap int n_buffers; @@ -1712,6 +1721,9 @@ static void 
ggml_metal_free(struct ggml_backend_metal_context * ctx) { // optional MTLResidencySet id rset; + + id device; + id queue; }; // rset init @@ -1777,7 +1789,7 @@ static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer // -static id ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) { +static id ggml_metal_get_buffer(const struct ggml_tensor * t, size_t * offs) { //GGML_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = ggml_nbytes(t); @@ -5932,14 +5944,6 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) ggml_backend_metal_buffer_rset_free(ctx); - if (ctx->owned) { -#if TARGET_OS_OSX - vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size); -#else - free(ctx->all_data); -#endif - } - free(ctx); } @@ -5950,25 +5954,112 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) } static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { +#if 1 + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + @autoreleasepool { + id cmd_buf = [ctx->queue commandBuffer]; + id encoder = [cmd_buf blitCommandEncoder]; + [cmd_buf enqueue]; + + size_t buf_dst_offset = 0; + id buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset); + + buf_dst_offset += offset; + + [encoder fillBuffer:buf_dst + range:NSMakeRange(buf_dst_offset, buf_dst_offset + size) + value:value]; + + [encoder endEncoding]; + + [cmd_buf commit]; + [cmd_buf waitUntilCompleted]; + } +#else memset((char *)tensor->data + offset, value, size); +#endif GGML_UNUSED(buffer); } static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +#if 1 + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + @autoreleasepool { + id cmd_buf = [ctx->queue commandBuffer]; + id encoder = [cmd_buf blitCommandEncoder]; + [cmd_buf enqueue]; + + // TODO: is this an extra copy? can we avoid it? 
+ id buf_src = [ctx->device newBufferWithBytes:data + length:size + options:MTLResourceStorageModeShared]; + + size_t buf_dst_offset = 0; + id buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset); + + buf_dst_offset += offset; + + [encoder copyFromBuffer:buf_src + sourceOffset:0 + toBuffer:buf_dst + destinationOffset:buf_dst_offset + size:size]; + + [encoder endEncoding]; + + [cmd_buf commit]; + [cmd_buf waitUntilCompleted]; + } +#else memcpy((char *)tensor->data + offset, data, size); +#endif GGML_UNUSED(buffer); } static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +#if 1 + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + @autoreleasepool { + id cmd_buf = [ctx->queue commandBuffer]; + id encoder = [cmd_buf blitCommandEncoder]; + [cmd_buf enqueue]; + + size_t buf_src_offset = 0; + id buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset); + + buf_src_offset += offset; + + id buf_dst = [ctx->device newBufferWithBytesNoCopy:data + length:size + options:MTLResourceStorageModeShared + deallocator:nil]; + + [encoder copyFromBuffer:buf_src + sourceOffset:buf_src_offset + toBuffer:buf_dst + destinationOffset:0 + size:size]; + + [encoder endEncoding]; + + [cmd_buf commit]; + [cmd_buf waitUntilCompleted]; + } +#else memcpy(data, (const char *)tensor->data + offset, size); +#endif GGML_UNUSED(buffer); } static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { if (ggml_backend_buffer_is_host(src->buffer)) { + GGML_ASSERT(false && "TODO"); memcpy(dst->data, src->data, ggml_nbytes(src)); return true; } @@ -5980,7 +6071,22 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; - memset(ctx->all_data, value, ctx->all_size); + @autoreleasepool { + id cmd_buf = [ctx->queue commandBuffer]; + id encoder = [cmd_buf blitCommandEncoder]; + [cmd_buf enqueue]; + + [encoder fillBuffer:ctx->buffers[0].metal + range:NSMakeRange(0, ctx->buffers[0].size) + value:value]; + + [encoder endEncoding]; + + [cmd_buf commit]; + [cmd_buf waitUntilCompleted]; + } + + //memset(ctx->all_data, value, ctx->all_size); } static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { @@ -6044,9 +6150,21 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba id device = ctx_dev->mtl_device; +#if 1 + // TODO: tmp hack + static void * p_base = (void *) 0x000000400ULL; + + ctx->all_data = p_base; + + p_base = (void *) ((uintptr_t) p_base + size_aligned); +#else ctx->all_data = ggml_metal_host_malloc(size_aligned); +#endif ctx->all_size = size_aligned; - ctx->owned = true; + + ctx->device = device; + ctx->queue = ctx_dev->mtl_queue; + ctx->n_buffers = 1; if (ctx->all_data != NULL) { @@ -6055,10 +6173,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba ctx->buffers[0].metal = nil; if (size_aligned > 0) { +#if 1 + ctx->buffers[0].metal = [device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate]; +#else ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; +#endif } } @@ -6092,13 
+6214,7 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty } static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - // TODO: not sure why, but without setting this to `false`, op offloading does not work correctly - // to reproduce, do the following: - // - // build with: cmake -DGGML_BLAS=OFF -DGGML_METAL=ON - // run: ./bin/llama-cli -m ggml-model-mxfp4.gguf -p "$(printf 'hello %.0s' {1..100})" --n-cpu-moe 10 - // - return false; + return true; GGML_UNUSED(buft); } @@ -6111,7 +6227,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_metal_buffer_type_is_host, + /* .is_host = */ NULL, }, /* .device = */ &g_ggml_backend_metal_device, /* .context = */ NULL, @@ -6130,7 +6246,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = { /* .iface = */ { /* .get_name = */ ggml_backend_metal_buffer_from_ptr_type_get_name, - /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer, + /* .alloc_buffer = */ NULL, /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -6199,10 +6315,6 @@ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, st struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; - ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; - - struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *)buf->context; - @autoreleasepool { id device = ctx_dev->mtl_device; @@ -6211,46 +6323,39 @@ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, st length:size options:MTLResourceStorageModeShared]; - size_t tensor_offset = (uintptr_t)tensor->data + offset; - - // find which Metal buffer contains this tensor - we will copy into that buffer - for (int i = 0; i < buf_ctx->n_buffers; i++) { - if (tensor_offset >= (uintptr_t) buf_ctx->buffers[i].data && - tensor_offset < (uintptr_t) buf_ctx->buffers[i].data + buf_ctx->buffers[i].size) { + size_t buf_dst_offset = 0; + id buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset); - const size_t buf_dst_offset = tensor_offset - (uintptr_t) buf_ctx->buffers[i].data; + if (buf_dst == nil) { + GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name); + } - id buf_dst = buf_ctx->buffers[i].metal; + buf_dst_offset += offset; - // queue the copy operation into the queue of the Metal context - // this will be queued at the end, after any currently ongoing GPU operations - id cmd_buf = [ctx->queue commandBuffer]; - [cmd_buf enqueue]; + // queue the copy operation into the queue of the Metal context + // this will be queued at the end, after any currently ongoing GPU operations + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf enqueue]; - id encoder = [cmd_buf blitCommandEncoder]; + id encoder = [cmd_buf blitCommandEncoder]; - [encoder copyFromBuffer:buf_src - sourceOffset:0 - toBuffer:buf_dst - destinationOffset:buf_dst_offset + [encoder copyFromBuffer:buf_src + sourceOffset:0 + toBuffer:buf_dst + destinationOffset:buf_dst_offset size:size]; - [encoder endEncoding]; - [cmd_buf commit]; - - // do not wait here for completion - //[cmd_buf waitUntilCompleted]; + [encoder endEncoding]; + [cmd_buf commit]; - // instead, remember a reference to the command buffer and wait for it later if needed - [ctx->cmd_bufs_ext addObject:cmd_buf]; - ctx->cmd_buf_ext_last = cmd_buf; + // do not wait here for completion + //[cmd_buf waitUntilCompleted]; - [cmd_buf retain]; - return; - } - } + // instead, remember a reference to the command buffer and wait for it later if needed + [ctx->cmd_bufs_ext addObject:cmd_buf]; + ctx->cmd_buf_ext_last = cmd_buf; - GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name); + [cmd_buf retain]; } } @@ -6258,10 +6363,6 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const st struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; - ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; - - struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *)buf->context; - @autoreleasepool { id device = ctx_dev->mtl_device; @@ -6270,41 +6371,39 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const st options:MTLResourceStorageModeShared deallocator:nil]; - const size_t tensor_offset = (uintptr_t)tensor->data + offset; + size_t buf_src_offset = 0; + id buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset); - // find which buffer contains this tensor data - for (int i = 0; i < buf_ctx->n_buffers; i++) { - if (tensor_offset >= (uintptr_t) buf_ctx->buffers[i].data && - tensor_offset < (uintptr_t) buf_ctx->buffers[i].data + buf_ctx->buffers[i].size) { + if (buf_src == nil) { + GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name); + } - const size_t buf_src_offset = tensor_offset - (uintptr_t) buf_ctx->buffers[i].data; + buf_src_offset += offset; - id buf_src = buf_ctx->buffers[i].metal; + // queue the copy operation into the queue of the Metal context + // this will be queued at the end, after any currently ongoing GPU operations + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf enqueue]; - id cmd_buf = [ctx->queue commandBuffer]; - [cmd_buf enqueue]; + id encoder = [cmd_buf blitCommandEncoder]; - id encoder = [cmd_buf blitCommandEncoder]; + [encoder copyFromBuffer:buf_src + sourceOffset:buf_src_offset + toBuffer:buf_dst + destinationOffset:0 + size:size]; - [encoder copyFromBuffer:buf_src - sourceOffset:buf_src_offset - toBuffer:buf_dst - destinationOffset:0 - size:size]; + [encoder endEncoding]; + [cmd_buf commit]; - [encoder endEncoding]; - [cmd_buf commit]; - //[cmd_buf waitUntilCompleted]; + // do not wait here for completion + //[cmd_buf waitUntilCompleted]; - [ctx->cmd_bufs_ext addObject:cmd_buf]; - ctx->cmd_buf_ext_last = cmd_buf; + // instead, remember a reference to the command buffer and wait for it later if needed + [ctx->cmd_bufs_ext addObject:cmd_buf]; + ctx->cmd_buf_ext_last = cmd_buf; - [cmd_buf retain]; - return; - } - } - - GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name); + [cmd_buf retain]; } } @@ -6513,8 +6612,8 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct g props->type = ggml_backend_metal_device_get_type(dev); ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = (struct ggml_backend_dev_caps) { - /* .async = */ false, - /* .host_buffer = */ false, + /* .async = */ true, + /* .host_buffer = */ true, /* .buffer_from_host_ptr = */ true, /* .events = */ false, }; @@ -6554,7 +6653,7 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back ctx->all_data = ptr; ctx->all_size = size; - ctx->owned = false; + ctx->n_buffers = 0; const size_t size_page = sysconf(_SC_PAGESIZE); @@ -6577,6 +6676,9 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back id device = ctx_dev->mtl_device; + ctx->device = device; + ctx->queue = ctx_dev->mtl_queue; + // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { ctx->buffers[ctx->n_buffers].data = ptr; From 7fc2b3d5038544019a32ccc2a55d317dd1ff9adf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 Sep 2025 14:28:34 +0300 Subject: [PATCH 6/9] metal : restore .alloc_buffer for buffer_from_ptr_type ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 9dab47a987a81..fbaeedc463ff3 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -6246,7 +6246,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = { /* .iface = */ { /* .get_name = */ ggml_backend_metal_buffer_from_ptr_type_get_name, - /* .alloc_buffer = */ NULL, + /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes From f288225d42fbbbc513ff94a777f79bc60938d51e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 Sep 2025 14:29:54 +0300 Subject: [PATCH 7/9] metal : remove broken implementation of GGML_OP_SET ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 76 ---------------------------- ggml/src/ggml-metal/ggml-metal.metal | 32 ------------ 2 files changed, 108 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index fbaeedc463ff3..c6027489a422d 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -477,8 +477,6 @@ - (void) dealloc { GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, - GGML_METAL_KERNEL_TYPE_SET_I32, - GGML_METAL_KERNEL_TYPE_SET_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F16, GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, @@ -1411,8 +1409,6 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_F32, set_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_I32, set_i32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, cpy_f32_bf16, use_bfloat); @@ -2012,16 +2008,6 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex return false; }; } - case GGML_OP_SET: - { - switch (op->src[0]->type) { - case GGML_TYPE_F32: - case GGML_TYPE_I32: - return true; - default: - return false; - }; - } case GGML_OP_DIAG_MASK_INF: case GGML_OP_GET_ROWS: { @@ -5597,68 +5583,6 @@ static int ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nrptg - 1)/nrptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, nrptg, 1)]; } break; - case GGML_OP_SET: - { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); - - // src0 and dst as viewed during set - const size_t dst_nb0 = ggml_element_size(src0); - - const size_t dst_nb1 = ((int32_t *) dst->op_params)[0]; - const size_t dst_nb2 = ((int32_t *) dst->op_params)[1]; - const size_t dst_nb3 = ((int32_t *) dst->op_params)[2]; - const size_t offset = ((int32_t *) dst->op_params)[3]; - const bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - - if (!inplace) { - memcpy(((char *) dst->data), ((char *) src0->data), 
ggml_nbytes(dst)); - } - - const int im0 = (ne10 == 0 ? 0 : ne10-1); - const int im1 = (ne11 == 0 ? 0 : ne11-1); - const int im2 = (ne12 == 0 ? 0 : ne12-1); - const int im3 = (ne13 == 0 ? 0 : ne13-1); - - GGML_ASSERT(offset + im0*dst_nb0 + im1*dst_nb1 + im2*dst_nb2 + im3*dst_nb3 <= ggml_nbytes(dst)); - - id pipeline = nil; - - switch (src0t) { - case GGML_TYPE_F32: - GGML_ASSERT(nb10 == sizeof(float)); - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_F32].pipeline; break; - case GGML_TYPE_I32: - GGML_ASSERT(nb10 == sizeof(int32_t)); - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_I32].pipeline; break; - default: GGML_ABORT("fatal error"); - } - - ggml_metal_kargs_set args = { - /*.ne10 =*/ ne10, - /*.ne11 =*/ ne11, - /*.ne12 =*/ ne12, - /*.nb10 =*/ nb10, - /*.nb11 =*/ nb11, - /*.nb12 =*/ nb12, - /*.nb13 =*/ nb13, - /*.nb1 =*/ dst_nb1, - /*.nb2 =*/ dst_nb2, - /*.nb3 =*/ dst_nb3, - /*.offs =*/ offset, - /*.inplace =*/ inplace, - }; - - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne10); - - [encoder setComputePipelineState:pipeline]; - [encoder setBytes:&args length:sizeof(args) atIndex:0]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; case GGML_OP_POOL_2D: { GGML_ASSERT(ggml_is_contiguous(src0)); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 77be3c5c9d8be..157d0cc6d0b25 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -5571,38 +5571,6 @@ kernel void kernel_flash_attn_ext_vec_reduce( #undef DV } -template -kernel void kernel_set( - constant ggml_metal_kargs_set & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int i13 = tgpig[2]; - const int i12 = tgpig[1]; - const int i11 = tgpig[0]; - - const int64_t n = i13*args.ne12*args.ne11*args.ne10 + i12*args.ne11*args.ne10 + i11*args.ne10; - - const int64_t i3 = n / (args.ne12*args.ne11*args.ne10); - const int64_t i2 = (n - i3*args.ne12*args.ne11*args.ne10) / (args.ne11*args.ne10); - const int64_t i1 = (n - i3*args.ne12*args.ne11*args.ne10 - i2*args.ne11*args.ne10) / args.ne10; - - device T * dst_data = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + args.offs); - - for (int64_t i10 = tpitg.x; i10 < args.ne10; i10 += ntg.x) { - device const T * src = (device T *) (src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10); - dst_data[i10] = (T) src[0]; - } -} - -typedef decltype(kernel_set) kernel_set_t; - -template [[host_name("kernel_set_f32")]] kernel kernel_set_t kernel_set; -template [[host_name("kernel_set_i32")]] kernel kernel_set_t kernel_set; - template kernel void kernel_cpy( constant ggml_metal_kargs_cpy & args, From 0926cb492dba83ca70150c6189d42b8fc97eae49 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 Sep 2025 14:58:59 +0300 Subject: [PATCH 8/9] metal : clean-up loose ends, ready for tests ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 64 +++++++++++++++++--------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index c6027489a422d..4a67046c97743 100644 --- 
a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -44,10 +44,9 @@ // note: assumes single GPU device - the default one // TODO: support multiple GPU devices static struct ggml_backend_metal_device_context { - id mtl_device; - int mtl_device_ref_count; - id mtl_queue; - id mtl_library; + id mtl_device; + int mtl_device_ref_count; + id mtl_library; NSLock * mtl_lock; @@ -69,7 +68,6 @@ } g_ggml_ctx_dev_main = { /*.mtl_device =*/ nil, /*.mtl_device_ref_count =*/ 0, - /*.mtl_queue =*/ nil, /*.mtl_library =*/ nil, /*.mtl_lock =*/ nil, /*.has_simdgroup_reduction =*/ false, @@ -96,9 +94,6 @@ ctx->mtl_device = MTLCreateSystemDefaultDevice(); if (ctx->mtl_device) { - ctx->mtl_queue = [ctx->mtl_device newCommandQueue]; - [ctx->mtl_queue retain]; - ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; @@ -166,11 +161,6 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte ctx->mtl_library = nil; } - if (ctx->mtl_queue) { - [ctx->mtl_queue release]; - ctx->mtl_queue = nil; - } - if (ctx->mtl_device) { [ctx->mtl_device release]; ctx->mtl_device = nil; @@ -1013,7 +1003,7 @@ @implementation GGMLMetalClass GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); ctx->device = device; - ctx->queue = ctx_dev->mtl_queue; + ctx->queue = [device newCommandQueue]; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -1719,7 +1709,6 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { id rset; id device; - id queue; }; // rset init @@ -5882,7 +5871,8 @@ static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id cmd_buf = [ctx->queue commandBuffer]; + id queue = [ctx->device newCommandQueue]; + id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -5899,6 +5889,10 @@ static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer [cmd_buf commit]; [cmd_buf waitUntilCompleted]; + + // note: not sure why this release is necessary as we are inside an autoreleasepool block + // but without it, we get "Context leak detected" warnings + [queue release]; } #else memset((char *)tensor->data + offset, value, size); @@ -5912,7 +5906,8 @@ static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, s struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id cmd_buf = [ctx->queue commandBuffer]; + id queue = [ctx->device newCommandQueue]; + id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -5936,6 +5931,8 @@ static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, s [cmd_buf commit]; [cmd_buf waitUntilCompleted]; + + [queue release]; } #else memcpy((char *)tensor->data + offset, data, size); @@ -5949,7 +5946,8 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id cmd_buf = [ctx->queue commandBuffer]; + id queue = [ctx->device newCommandQueue]; + id cmd_buf = [queue commandBuffer]; id encoder = 
[cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -5973,6 +5971,8 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c [cmd_buf commit]; [cmd_buf waitUntilCompleted]; + + [queue release]; } #else memcpy(data, (const char *)tensor->data + offset, size); @@ -5993,10 +5993,12 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c } static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +#if 1 struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id cmd_buf = [ctx->queue commandBuffer]; + id queue = [ctx->device newCommandQueue]; + id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -6008,9 +6010,12 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_ [cmd_buf commit]; [cmd_buf waitUntilCompleted]; - } - //memset(ctx->all_data, value, ctx->all_size); + [queue release]; + } +#else + memset(ctx->all_data, value, ctx->all_size); +#endif } static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { @@ -6075,30 +6080,26 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba id device = ctx_dev->mtl_device; #if 1 - // TODO: tmp hack - static void * p_base = (void *) 0x000000400ULL; - - ctx->all_data = p_base; - - p_base = (void *) ((uintptr_t) p_base + size_aligned); + // we'll populate this after creating the Metal buffer below + ctx->all_data = (void *) 0x000000400ULL; #else ctx->all_data = ggml_metal_host_malloc(size_aligned); #endif ctx->all_size = size_aligned; ctx->device = device; - ctx->queue = ctx_dev->mtl_queue; ctx->n_buffers = 1; if (ctx->all_data != NULL) { - ctx->buffers[0].data = ctx->all_data; ctx->buffers[0].size = size; ctx->buffers[0].metal = nil; if (size_aligned > 0) { #if 1 ctx->buffers[0].metal = [device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate]; + + ctx->all_data = (void *) (ctx->buffers[0].metal.gpuAddress); #else ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data length:size_aligned @@ -6106,6 +6107,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba deallocator:nil]; #endif } + + ctx->buffers[0].data = ctx->all_data; } if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) { @@ -6167,6 +6170,8 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { } static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) { + // note: not obvious, but this buffer type still needs to implement .alloc_buffer: + // https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2333177099 static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = { /* .iface = */ { /* .get_name = */ ggml_backend_metal_buffer_from_ptr_type_get_name, @@ -6601,7 +6606,6 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back id device = ctx_dev->mtl_device; ctx->device = device; - ctx->queue = ctx_dev->mtl_queue; // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { From 3f62ee8bee97deae630c2e0b58b550bcf7fa69cf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 Sep 2025 17:06:46 +0300 Subject: [PATCH 9/9] metal : back to a single queue per device ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 51 ++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 23 
deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 4a67046c97743..5cd8893eede63 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -48,6 +48,8 @@ int mtl_device_ref_count; id mtl_library; + id mtl_queue; + NSLock * mtl_lock; bool has_simdgroup_reduction; @@ -69,6 +71,7 @@ /*.mtl_device =*/ nil, /*.mtl_device_ref_count =*/ 0, /*.mtl_library =*/ nil, + /*.mtl_queue =*/ nil, /*.mtl_lock =*/ nil, /*.has_simdgroup_reduction =*/ false, /*.has_simdgroup_mm =*/ false, @@ -94,6 +97,8 @@ ctx->mtl_device = MTLCreateSystemDefaultDevice(); if (ctx->mtl_device) { + ctx->mtl_queue = [ctx->mtl_device newCommandQueue]; + ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; @@ -161,6 +166,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte ctx->mtl_library = nil; } + if (ctx->mtl_queue) { + [ctx->mtl_queue release]; + ctx->mtl_queue = nil; + } + if (ctx->mtl_device) { [ctx->mtl_device release]; ctx->mtl_device = nil; @@ -1003,7 +1013,7 @@ @implementation GGMLMetalClass GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); ctx->device = device; - ctx->queue = [device newCommandQueue]; + ctx->queue = ctx_dev->mtl_queue; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -1670,8 +1680,6 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { Block_release(ctx->encode_async); - [ctx->queue release]; - for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { if (ctx->cmd_bufs[i].obj) { [ctx->cmd_bufs[i].obj release]; @@ -1709,6 +1717,7 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { id rset; id device; + id queue; }; // rset init @@ -5776,6 +5785,8 @@ static enum ggml_status ggml_metal_graph_compute( dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async); + [ctx->cmd_buf_last waitUntilScheduled]; + // for debugging: block until graph is computed //[ctx->cmd_buf_last waitUntilCompleted]; @@ -5871,7 +5882,7 @@ static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id queue = [ctx->device newCommandQueue]; + id queue = ctx->queue; id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -5888,11 +5899,7 @@ static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer [encoder endEncoding]; [cmd_buf commit]; - [cmd_buf waitUntilCompleted]; - - // note: not sure why this release is necessary as we are inside an autoreleasepool block - // but without it, we get "Context leak detected" warnings - [queue release]; + [cmd_buf waitUntilScheduled]; } #else memset((char *)tensor->data + offset, value, size); @@ -5906,15 +5913,16 @@ static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, s struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id queue = [ctx->device newCommandQueue]; + id queue = ctx->queue; id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; // TODO: is this an extra copy? can we avoid it? 
- id buf_src = [ctx->device newBufferWithBytes:data - length:size - options:MTLResourceStorageModeShared]; + id buf_src = [ctx->device newBufferWithBytesNoCopy:data + length:size + options:MTLResourceStorageModeShared + deallocator:nil]; size_t buf_dst_offset = 0; id buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset); @@ -5929,10 +5937,9 @@ static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, s [encoder endEncoding]; + // note: no need to wait for completion here [cmd_buf commit]; - [cmd_buf waitUntilCompleted]; - - [queue release]; + [cmd_buf waitUntilScheduled]; } #else memcpy((char *)tensor->data + offset, data, size); @@ -5946,7 +5953,7 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id queue = [ctx->device newCommandQueue]; + id queue = ctx->queue; id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -5971,8 +5978,6 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c [cmd_buf commit]; [cmd_buf waitUntilCompleted]; - - [queue release]; } #else memcpy(data, (const char *)tensor->data + offset, size); @@ -5997,7 +6002,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_ struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id queue = [ctx->device newCommandQueue]; + id queue = ctx->queue; id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -6009,9 +6014,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_ [encoder endEncoding]; [cmd_buf commit]; - [cmd_buf waitUntilCompleted]; - - [queue release]; + [cmd_buf waitUntilScheduled]; } #else memset(ctx->all_data, value, ctx->all_size); @@ -6088,6 +6091,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba ctx->all_size = size_aligned; ctx->device = device; + ctx->queue = ctx_dev->mtl_queue; ctx->n_buffers = 1; @@ -6606,6 +6610,7 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back id device = ctx_dev->mtl_device; ctx->device = device; + ctx->queue = ctx_dev->mtl_queue; // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) {