metal : simplify

ggerganov · ggerganov · commit 898c6c862906 · 2025-01-26T15:40:41.000+02:00
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
@@ -19,8 +19,6 @@
 // max number of MTLCommandBuffer used to submit a graph for processing
 #define GGML_METAL_MAX_COMMAND_BUFFERS 8
 
-#define GGML_METAL_MAX_RESIDENCY_SETS 128
-
 #define UNUSED(x) (void)(x)
 
 // globals
@@ -39,9 +37,6 @@
     id<MTLDevice> mtl_device;
     int           mtl_device_ref_count;
 
-    id<MTLResidencySet> mtl_residency_set[GGML_METAL_MAX_RESIDENCY_SETS];
-    int                 mtl_residency_set_n;
-
     bool has_simdgroup_reduction;
     bool has_simdgroup_mm;
     bool has_bfloat;
@@ -51,8 +46,6 @@
 } g_ggml_ctx_dev_main = {
     /*.mtl_device              =*/ nil,
     /*.mtl_device_ref_count    =*/ 0,
-    /*.mtl_residency_set       =*/ { nil },
-    /*.mtl_residency_set_n     =*/ 0,
     /*.has_simdgroup_reduction =*/ false,
     /*.has_simdgroup_mm        =*/ false,
     /*.has_bfloat              =*/ false,
@@ -102,41 +95,6 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
     }
 }
 
-// add residency set
-static bool ggml_backend_metal_device_add_residency_set(struct ggml_backend_metal_device_context * ctx, id<MTLResidencySet> residency_set) {
-    assert(ctx != NULL);
-    assert(queue != nil);
-
-    if (ctx->mtl_residency_set_n >= GGML_METAL_MAX_RESIDENCY_SETS) {
-        GGML_LOG_ERROR("%s: warning: maximum number of residency sets reached\n", __func__);
-        return false;
-    }
-
-    ctx->mtl_residency_set[ctx->mtl_residency_set_n++] = residency_set;
-
-    return true;
-}
-
-// remove residency set
-static bool ggml_backend_metal_device_remove_residency_set(struct ggml_backend_metal_device_context * ctx, id<MTLResidencySet> residency_set) {
-    assert(ctx != NULL);
-    assert(residency_set != nil);
-
-    for (int i = 0; i < ctx->mtl_residency_set_n; ++i) {
-        if (ctx->mtl_residency_set[i] == residency_set) {
-            for (int j = i; j < ctx->mtl_residency_set_n - 1; ++j) {
-                ctx->mtl_residency_set[j] = ctx->mtl_residency_set[j + 1];
-            }
-
-            ctx->mtl_residency_set_n--;
-
-            return true;
-        }
-    }
-
-    return false;
-}
-
 // kernels
 
 struct ggml_metal_kernel {
@@ -1083,7 +1041,7 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
     int n_buffers;
     struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
 
-    id<MTLResidencySet> residency_set;
+    id<MTLResidencySet> rset;
 };
 
 // finds the Metal buffer that contains the tensor data on the GPU device
@@ -4088,21 +4046,6 @@ static enum ggml_status ggml_metal_graph_compute(
     struct ggml_backend_metal_context        * ctx     = backend->context;
     struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
 
-    // attached residency sets to the queue on the first run
-    // also tested to attached them on each run, but it does not make a difference
-    static bool is_first = true;
-    if (is_first) {
-        is_first = false;
-        GGML_LOG_INFO("%s: adding %d residency sets\n", __func__, ctx_dev->mtl_residency_set_n);
-        [ctx->queue addResidencySets:ctx_dev->mtl_residency_set count:ctx_dev->mtl_residency_set_n];
-    }
-
-    // this does not make a difference
-    //for (int i = 0; i < ctx_dev->mtl_residency_set_n; ++i) {
-    //    GGML_LOG_INFO("%s: residency set %d allocations size = %zu\n", __func__, i, [ctx_dev->mtl_residency_set[i] allocatedSize]);
-    //    [ctx_dev->mtl_residency_set[i] requestResidency];
-    //}
-
     int64_t t_start_us = ggml_time_us();
 
     // number of nodes encoded by the main thread (empirically determined)
@@ -4155,9 +4098,6 @@ static enum ggml_status ggml_metal_graph_compute(
             id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
             ctx->command_buffers[n_cb] = command_buffer;
 
-            // does not make a difference
-            [command_buffer useResidencySets:ctx_dev->mtl_residency_set count:ctx_dev->mtl_residency_set_n];
-
             [command_buffer enqueue];
             ctx->encode_async(n_cb);
         }
@@ -4168,9 +4108,6 @@ static enum ggml_status ggml_metal_graph_compute(
             id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
             ctx->command_buffers[cb_idx] = command_buffer;
 
-            // does not make a difference
-            [command_buffer useResidencySets:ctx_dev->mtl_residency_set count:ctx_dev->mtl_residency_set_n];
-
             // always enqueue the first two command buffers
             // enqueue all of the command buffers if we don't need to abort
             if (cb_idx < 2 || ctx->abort_callback == NULL) {
@@ -4253,11 +4190,9 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
         [ctx->buffers[i].metal release];
     }
 
-    ggml_backend_metal_device_remove_residency_set(buffer->buft->device->context, ctx->residency_set);
-
-    [ctx->residency_set endResidency];
-    [ctx->residency_set removeAllAllocations];
-    [ctx->residency_set release];
+    [ctx->rset endResidency];
+    [ctx->rset removeAllAllocations];
+    [ctx->rset release];
 
     ggml_backend_metal_device_rel(buffer->buft->device->context);
 
@@ -4398,25 +4333,22 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     {
         MTLResidencySetDescriptor * desc;
         desc = [[MTLResidencySetDescriptor alloc] init];
-        desc.label = @"Primary residency set";
+        desc.label = @"ggml_backend_metal";
         desc.initialCapacity = ctx->n_buffers;
 
-        NSError *error;
-        ctx->residency_set = [device newResidencySetWithDescriptor:desc error:&error];
+        NSError * error = nil; // start from nil: the if (error) test below must never read an uninitialized pointer
+        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error]; // NOTE(review): desc is not released in this scope - confirm it does not leak under manual retain/release
         if (error) {
             GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
             return NULL;
         }
 
         for (int i = 0; i < ctx->n_buffers; i++) {
-            [ctx->residency_set addAllocation:ctx->buffers[i].metal];
+            [ctx->rset addAllocation:ctx->buffers[i].metal]; // stage every Metal buffer owned by this buffer context
         }
 
-        [ctx->residency_set commit];
-        [ctx->residency_set requestResidency];
-
-        // track the residency set in the device context
-        ggml_backend_metal_device_add_residency_set(ctx_dev, ctx->residency_set);
+        [ctx->rset commit]; // apply the staged addAllocation calls to the set
+        [ctx->rset requestResidency]; // ask Metal to make the committed allocations resident
     }
 
     //ggml_backend_metal_log_allocated_size(device, size_aligned);
@@ -4565,25 +4497,22 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
     {
         MTLResidencySetDescriptor * desc;
         desc = [[MTLResidencySetDescriptor alloc] init];
-        desc.label = @"Primary residency set";
+        desc.label = @"ggml_backend_metal";
         desc.initialCapacity = ctx->n_buffers;
 
-        NSError *error;
-        ctx->residency_set = [device newResidencySetWithDescriptor:desc error:&error];
+        NSError * error = nil; // start from nil: the if (error) test below must never read an uninitialized pointer
+        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error]; // NOTE(review): desc is not released in this scope - confirm it does not leak under manual retain/release
         if (error) {
             GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
             return NULL;
         }
 
         for (int i = 0; i < ctx->n_buffers; i++) {
-            [ctx->residency_set addAllocation:ctx->buffers[i].metal];
+            [ctx->rset addAllocation:ctx->buffers[i].metal]; // stage every Metal buffer owned by this buffer context
         }
 
-        [ctx->residency_set commit];
-        [ctx->residency_set requestResidency];
-
-        // track the residency set in the device context
-        ggml_backend_metal_device_add_residency_set(ctx_dev, ctx->residency_set);
+        [ctx->rset commit]; // apply the staged addAllocation calls to the set
+        [ctx->rset requestResidency]; // ask Metal to make the committed allocations resident
     }
 
     return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
@@ -4902,25 +4831,22 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
     {
         MTLResidencySetDescriptor * desc;
         desc = [[MTLResidencySetDescriptor alloc] init];
-        desc.label = @"Primary residency set";
+        desc.label = @"ggml_backend_metal";
         desc.initialCapacity = ctx->n_buffers;
 
-        NSError *error;
-        ctx->residency_set = [device newResidencySetWithDescriptor:desc error:&error];
+        NSError * error = nil; // start from nil: the if (error) test below must never read an uninitialized pointer
+        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error]; // NOTE(review): desc is not released in this scope - confirm it does not leak under manual retain/release
         if (error) {
             GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
             return NULL;
         }
 
         for (int i = 0; i < ctx->n_buffers; i++) {
-            [ctx->residency_set addAllocation:ctx->buffers[i].metal];
+            [ctx->rset addAllocation:ctx->buffers[i].metal]; // stage every Metal buffer owned by this buffer context
         }
 
-        [ctx->residency_set commit];
-        [ctx->residency_set requestResidency];
-
-        // track the residency set in the device context
-        ggml_backend_metal_device_add_residency_set(ctx_dev, ctx->residency_set);
+        [ctx->rset commit]; // apply the staged addAllocation calls to the set
+        [ctx->rset requestResidency]; // ask Metal to make the committed allocations resident
     }
 
     return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);