wip

ggerganov · ggerganov · commit 6d5cb567bf2a · 2024-09-30T17:30:32.000+03:00
diff --git a/examples/perf-metal/perf-metal.cpp b/examples/perf-metal/perf-metal.cpp
@@ -107,8 +107,10 @@ int main(int argc, char ** argv) {
         if (n_thread == 4) {
             ggml_backend_metal_capture_next_compute(backend);
             ggml_backend_graph_compute(backend, gf);
+            //std::this_thread::sleep_for(std::chrono::milliseconds(1000)); // NOTE: these intervals do not appear in the Xcode trace!
             ggml_backend_metal_capture_next_compute(backend);
             ggml_backend_graph_compute(backend, gf);
+            //std::this_thread::sleep_for(std::chrono::milliseconds(1000)); // NOTE: these intervals do not appear in the Xcode trace!
             ggml_backend_metal_capture_next_compute(backend);
             ggml_backend_graph_compute(backend, gf);
 
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
@@ -236,6 +236,8 @@
     bool should_capture_next_compute;
     bool capture_started;
 
+    id<MTLCaptureScope> cap_scope;
+
     // abort ggml_metal_graph_compute if callback returns true
     ggml_abort_callback abort_callback;
     void *              abort_callback_data;
@@ -459,6 +461,8 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
     ctx->should_capture_next_compute = false;
     ctx->capture_started = false;
 
+    ctx->cap_scope = nil;
+
 #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
     if (@available(macOS 10.12, iOS 16.0, *)) {
         GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
@@ -887,17 +891,21 @@ static enum ggml_status ggml_metal_graph_compute(
     // create multiple command buffers and enqueue them
     // then, we encode the graph into the command buffers in parallel
 
-    const int n_nodes = gf->n_nodes;
+    const int n_nodes_0 = MIN(64, gf->n_nodes);
+    const int n_nodes_1 = gf->n_nodes - n_nodes_0;
     const int n_cb = ctx->n_cb;
-    const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
+    const int n_nodes_per_cb = (n_nodes_1 + n_cb - 1) / n_cb;
 
     const bool should_capture = ctx->should_capture_next_compute;
     if (should_capture) {
         ctx->should_capture_next_compute = false;
 
         if (!ctx->capture_started) {
+            // create capture scope
+            ctx->cap_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx->device];
+
             MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
-            descriptor.captureObject = ctx->queue;
+            descriptor.captureObject = ctx->cap_scope;
             descriptor.destination = MTLCaptureDestinationGPUTraceDocument;
             descriptor.outputURL = [NSURL fileURLWithPath:[NSString stringWithFormat:@"/tmp/perf-metal.gputrace"]];
 
@@ -906,26 +914,17 @@ static enum ggml_status ggml_metal_graph_compute(
                 GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
                 GGML_ABORT("capture failed");
             } else {
+                [ctx->cap_scope beginScope];
                 ctx->capture_started = true;
             }
         }
     }
 
-    id<MTLCommandBuffer> command_buffer_builder[n_cb];
-    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-        id<MTLCommandBuffer> command_buffer  = [ctx->queue commandBufferWithUnretainedReferences];
-        command_buffer_builder[cb_idx] = command_buffer;
-
-        // always enqueue the first two command buffers
-        // enqueue all of the command buffers if we don't need to abort
-        if (cb_idx < 2 || ctx->abort_callback == NULL) {
-            [command_buffer enqueue];
-        }
-    }
-
-    const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
+    id<MTLCommandBuffer> command_buffer_builder[n_cb + 1];
+    const id<MTLCommandBuffer> * command_buffers = command_buffer_builder;
 
-    dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) {
+    //dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) {
+    void (^helper)(size_t iter) = ^(size_t iter) {
         const int cb_idx = iter;
 
         size_t offs_src0 = 0;
@@ -936,8 +935,12 @@ static enum ggml_status ggml_metal_graph_compute(
         id<MTLCommandBuffer> command_buffer  = command_buffers[cb_idx];
         id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
 
-        const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
-        const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
+        int node_start = 0;
+        int node_end   = n_nodes_0;
+        if ((int) iter < n_cb) {
+            node_start = n_nodes_0 + (                                       (cb_idx + 0) * n_nodes_per_cb);
+            node_end   = n_nodes_0 + (MIN((cb_idx == n_cb - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1));
+        }
 
         for (int i = node_start; i < node_end; ++i) {
             if (i == -1) {
@@ -3037,11 +3040,36 @@ static enum ggml_status ggml_metal_graph_compute(
         if (cb_idx < 2 || ctx->abort_callback == NULL) {
             [command_buffer commit];
         }
-    });
+    };
+
+    {
+        id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
+        command_buffer_builder[n_cb] = command_buffer;
+        [command_buffer enqueue];
+        helper(n_cb);
+    }
+
+    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
+        id<MTLCommandBuffer> command_buffer  = [ctx->queue commandBufferWithUnretainedReferences];
+        command_buffer_builder[cb_idx] = command_buffer;
+
+        // always enqueue the first two command buffers
+        // enqueue all of the command buffers if we don't need to abort
+        if (cb_idx < 2 || ctx->abort_callback == NULL) {
+            [command_buffer enqueue];
+        }
+    }
+
+    dispatch_apply(n_cb, ctx->d_queue, helper);
 
     // Wait for completion and check status of each command buffer
     // needed to detect if the device ran out-of-memory for example (#1881)
 
+    {
+        id<MTLCommandBuffer> command_buffer = command_buffers[n_cb];
+        [command_buffer waitUntilCompleted];
+    }
+
     for (int i = 0; i < n_cb; ++i) {
         id<MTLCommandBuffer> command_buffer = command_buffers[i];
         [command_buffer waitUntilCompleted];
@@ -3075,6 +3103,7 @@ static enum ggml_status ggml_metal_graph_compute(
     }
 
     if (!should_capture && ctx->capture_started) {
+        [ctx->cap_scope endScope];
         [[MTLCaptureManager sharedCaptureManager] stopCapture];
     }