wip [no ci]

ggerganov · ggerganov · commit 6d78c5e80724 · 2025-09-15T17:08:04.000+03:00
diff --git a/ggml/src/ggml-metal/ggml-metal-context.h b/ggml/src/ggml-metal/ggml-metal-context.h
@@ -63,6 +63,23 @@ void ggml_metal_set_abort_callback  (ggml_metal_t ctx, ggml_abort_callback abort
 bool ggml_metal_supports_family     (ggml_metal_t ctx, int family);
 void ggml_metal_capture_next_compute(ggml_metal_t ctx);
 
+//
+// encoder
+//
+
+typedef struct ggml_metal_encoder * ggml_metal_encoder_t; // opaque handle; the struct itself is defined in ggml-metal-context.m
+
+ggml_metal_encoder_t ggml_metal_encoder_init(ggml_metal_t ctx, int cb_idx); // allocates an encoder that keeps a reference to ctx
+void ggml_metal_encoder_free(ggml_metal_encoder_t ctx); // NOTE(review): presumably releases what init allocated - no definition visible yet
+
+void ggml_metal_encoder_begin (ggml_metal_encoder_t ctx, int idx); // pushes a debug group labelled with ggml_op_desc() of graph node idx when capture_next_compute is set
+void ggml_metal_encoder_encode(ggml_metal_encoder_t ctx, int idx, int node_end); // NOTE(review): no definition visible yet; presumably encodes graph nodes starting at idx - confirm
+void ggml_metal_encoder_end   (ggml_metal_encoder_t ctx, int idx); // pops the debug group when capture_next_compute is set; idx is not read
+
+bool ggml_metal_encoder_concurrency_reset(ggml_metal_encoder_t ctx); // returns true immediately when the encoder tracks no mem ranges
+bool ggml_metal_encoder_concurrency_check(ggml_metal_encoder_t ctx, const struct ggml_tensor * node); // false when no mem ranges are tracked, else the ggml_mem_ranges_check() result
+bool ggml_metal_encoder_concurrency_add  (ggml_metal_encoder_t ctx, const struct ggml_tensor * node); // true when no mem ranges are tracked, else the ggml_mem_ranges_add() result
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m
@@ -379,7 +379,6 @@ void ggml_metal_pipeline_free(ggml_metal_pipeline_t pipeline) {
     id<MTLLibrary>      library;
     id<MTLCommandQueue> queue; // currently a pointer to the device queue, but might become separate queue [TAG_QUEUE_PER_BACKEND]
 
-    //struct ggml_metal_device_props props_dev;
     ggml_metal_device_t ctx_dev;
 
     dispatch_queue_t d_queue;
@@ -1062,15 +1061,47 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
     }
 }
 
-struct ggml_metal_encode_context {
-    id<MTLComputeCommandEncoder> encoder;
-
+struct ggml_metal_encoder { // state behind the ggml_metal_encoder_t handle
     ggml_metal_t ctx;
 
+    id<MTLComputeCommandEncoder> encoder; // compute encoder that receives the debug groups and kernel dispatches
+
     ggml_mem_ranges_t mem_ranges;
 };
 
-static bool ggml_metal_encode_concurrency_reset(struct ggml_metal_encode_context * ctx) {
+// Allocates a zeroed encoder bound to ctx and opens a compute encoder on a command buffer taken from ctx->queue.
+ggml_metal_encoder_t ggml_metal_encoder_init(ggml_metal_t ctx, int cb_idx) {
+    ggml_metal_encoder_t res = calloc(1, sizeof(struct ggml_metal_encoder)); // zeroed, so mem_ranges starts out NULL (tracking off)
+    res->ctx = ctx;
+    GGML_UNUSED(cb_idx); // NOTE(review): not used to select a command buffer yet - confirm the intended lookup
+
+    id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer]; // NOTE(review): not retained or stored for commit yet - confirm ownership
+    if (ctx->use_concurrency) {
+        res->encoder    = [cmd_buf computeCommandEncoderWithDispatchType:MTLDispatchTypeConcurrent];
+        res->mem_ranges = ggml_mem_ranges_init(ctx->debug_graph); // only concurrent encoding tracks memory ranges
+    } else {
+        res->encoder    = [cmd_buf computeCommandEncoder]; // serial dispatch: the concurrency_* helpers see NULL mem_ranges
+    }
+    return res;
+}
+
+void ggml_metal_encoder_free(ggml_metal_encoder_t ctx); // TODO(wip): prototype only - no definition appears in the visible part of this patch
+
+void ggml_metal_encoder_begin(ggml_metal_encoder_t ctx, int idx) { // opens a debug group for graph node idx, only when capture_next_compute is set
+    if (!ctx->ctx->capture_next_compute) { return; }
+    const char * op_desc = ggml_op_desc(ggml_graph_node(ctx->ctx->gf, idx)); // label text for the group
+    [ctx->encoder pushDebugGroup:[NSString stringWithCString:op_desc encoding:NSUTF8StringEncoding]];
+}
+
+// Pops the encoder's debug group when capture_next_compute is set (the condition begin pushes under); idx is not read.
+void ggml_metal_encoder_end(ggml_metal_encoder_t ctx, int idx) {
+    GGML_UNUSED(idx);
+
+    const bool capturing = ctx->ctx->capture_next_compute;
+    if (capturing) { [ctx->encoder popDebugGroup]; }
+}
+
+bool ggml_metal_encoder_concurrency_reset(struct ggml_metal_encoder * ctx) {
     if (!ctx->mem_ranges) {
         return true;
     }
@@ -1082,23 +1113,23 @@ static bool ggml_metal_encode_concurrency_reset(struct ggml_metal_encode_context
     return true;
 }
 
-static bool ggml_metal_encode_concurrency_check(struct ggml_metal_encode_context * ctx, const struct ggml_tensor * node) {
+bool ggml_metal_encoder_concurrency_check(struct ggml_metal_encoder * ctx, const struct ggml_tensor * node) { // false when ctx tracks no mem ranges; otherwise forwards the ggml_mem_ranges_check() result for node
     if (!ctx->mem_ranges) {
         return false;
     }
 
     return ggml_mem_ranges_check(ctx->mem_ranges, node);
 }
 
-static bool ggml_metal_encode_concurrency_add(struct ggml_metal_encode_context * ctx, const struct ggml_tensor * node) {
+bool ggml_metal_encoder_concurrency_add(struct ggml_metal_encoder * ctx, const struct ggml_tensor * node) { // true when ctx tracks no mem ranges; otherwise forwards the ggml_mem_ranges_add() result for node
     if (!ctx->mem_ranges) {
         return true;
     }
 
     return ggml_mem_ranges_add(ctx->mem_ranges, node);
 }
 
-static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, int idx, int idx_end) {
+static int ggml_metal_encoder_node(struct ggml_metal_encoder * ctx_enc, int idx, int idx_end) {
     id<MTLComputeCommandEncoder> encoder = ctx_enc->encoder;
 
     ggml_metal_t ctx = ctx_enc->ctx;
@@ -1221,10 +1252,10 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in
     // otherwise, we add the new ranges to the encoding context and process the node concurrently
     //
     {
-        const bool is_concurrent = ggml_metal_encode_concurrency_check(ctx_enc, node);
+        const bool is_concurrent = ggml_metal_encoder_concurrency_check(ctx_enc, node);
 
         if (!is_concurrent) {
-            ggml_metal_encode_concurrency_reset(ctx_enc);
+            ggml_metal_encoder_concurrency_reset(ctx_enc);
         }
 
         if (ctx->debug_graph > 0) {
@@ -1407,8 +1438,8 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in
                     id_dst = ggml_metal_get_buffer(nodes[n_fuse - 1], &offs_dst);
 
                     for (int i = 1; i < n_fuse; ++i) {
-                        if (!ggml_metal_encode_concurrency_check(ctx_enc, nodes[i])) {
-                            ggml_metal_encode_concurrency_reset(ctx_enc);
+                        if (!ggml_metal_encoder_concurrency_check(ctx_enc, nodes[i])) {
+                            ggml_metal_encoder_concurrency_reset(ctx_enc);
 
                             break;
                         }
@@ -1557,7 +1588,7 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in
 
                     [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
 
-                    ggml_metal_encode_concurrency_reset(ctx_enc);
+                    ggml_metal_encoder_concurrency_reset(ctx_enc);
                 }
 
                 ggml_metal_kargs_bin args = {
@@ -3025,7 +3056,7 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in
                     }
 
                     // this barrier is always needed because the next kernel has to wait for the id maps to be computed
-                    ggml_metal_encode_concurrency_reset(ctx_enc);
+                    ggml_metal_encoder_concurrency_reset(ctx_enc);
 
                     {
                         id<MTLComputePipelineState> pipeline = nil;
@@ -3497,8 +3528,8 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in
                     id_dst = ggml_metal_get_buffer(nodes[n_fuse - 1], &offs_dst);
 
                     for (int i = 1; i < n_fuse; ++i) {
-                        if (!ggml_metal_encode_concurrency_check(ctx_enc, nodes[i])) {
-                            ggml_metal_encode_concurrency_reset(ctx_enc);
+                        if (!ggml_metal_encoder_concurrency_check(ctx_enc, nodes[i])) {
+                            ggml_metal_encoder_concurrency_reset(ctx_enc);
 
                             break;
                         }
@@ -4404,7 +4435,7 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in
                         [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
 
                         // sync the 2 kernels
-                        ggml_metal_encode_concurrency_reset(ctx_enc);
+                        ggml_metal_encoder_concurrency_reset(ctx_enc);
 
                         // reduce the results from the workgroups
                         {
@@ -4678,8 +4709,8 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in
 
     // update the mem ranges in the encoding context
     for (int i = 0; i < n_fuse; ++i) {
-        if (!ggml_metal_encode_concurrency_add(ctx_enc, nodes[i])) {
-            ggml_metal_encode_concurrency_reset(ctx_enc);
+        if (!ggml_metal_encoder_concurrency_add(ctx_enc, nodes[i])) {
+            ggml_metal_encoder_concurrency_reset(ctx_enc);
         }
     }
 
@@ -4900,9 +4931,9 @@ void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {
 
         const bool should_capture = ctx->capture_next_compute;
 
-        struct ggml_metal_encode_context ctx_enc = {
-            /*.encoder    =*/ encoder,
+        struct ggml_metal_encoder ctx_enc = {
             /*.ctx        =*/ ctx,
+            /*.encoder    =*/ encoder,
             /*.mem_ranges =*/ mem_ranges,
         };
 
@@ -4911,7 +4942,7 @@ void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {
                 [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
             }
 
-            const int res = ggml_metal_encode_node(&ctx_enc, idx, node_end);
+            const int res = ggml_metal_encoder_node(&ctx_enc, idx, node_end);
             if (idx + res > node_end) {
                 GGML_ABORT("fusion error: nodes spanning multiple encoders have been fused. this indicates a bug in the fusion logic %s",
                         "https://github.com/ggml-org/llama.cpp/pull/14849");