1919// max number of MTLCommandBuffer used to submit a graph for processing
2020#define GGML_METAL_MAX_COMMAND_BUFFERS 8
2121
22+ #define GGML_METAL_MAX_RESIDENCY_SETS 128
23+
2224#define UNUSED (x ) (void )(x)
2325
2426// globals
3739 id <MTLDevice > mtl_device;
3840 int mtl_device_ref_count;
3941
42+ id <MTLResidencySet> mtl_residency_set[GGML_METAL_MAX_RESIDENCY_SETS];
43+ int mtl_residency_set_n;
44+
4045 bool has_simdgroup_reduction;
4146 bool has_simdgroup_mm;
4247 bool has_bfloat;
4651} g_ggml_ctx_dev_main = {
4752 /* .mtl_device =*/ nil ,
4853 /* .mtl_device_ref_count =*/ 0 ,
54+ /* .mtl_residency_set =*/ { nil },
55+ /* .mtl_residency_set_n =*/ 0 ,
4956 /* .has_simdgroup_reduction =*/ false ,
5057 /* .has_simdgroup_mm =*/ false ,
5158 /* .has_bfloat =*/ false ,
@@ -95,6 +102,41 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
95102 }
96103}
97104
// Registers a residency set in the device context's tracking array.
//
//   ctx           - device context holding the tracking array; must not be NULL
//   residency_set - residency set to track; must not be nil
//
// Returns true when the set was stored in the next free slot. Returns false
// (after logging) when the array already holds GGML_METAL_MAX_RESIDENCY_SETS
// entries; in that case nothing is stored and the caller's set stays untracked.
static bool ggml_backend_metal_device_add_residency_set(struct ggml_backend_metal_device_context * ctx, id<MTLResidencySet> residency_set) {
    assert(ctx != NULL);
    // validate the argument that is actually passed in (there is no `queue` in this scope)
    assert(residency_set != nil);

    // fixed-capacity array: refuse instead of writing past the end
    if (ctx->mtl_residency_set_n >= GGML_METAL_MAX_RESIDENCY_SETS) {
        GGML_LOG_ERROR("%s: warning: maximum number of residency sets reached\n", __func__);
        return false;
    }

    ctx->mtl_residency_set[ctx->mtl_residency_set_n++] = residency_set;

    return true;
}
119+
// Unregisters a residency set from the device context's tracking array.
//
//   ctx           - device context holding the tracking array; must not be NULL
//   residency_set - residency set to stop tracking; must not be nil
//
// The first entry that is pointer-equal to residency_set is removed and the
// entries after it are shifted down by one, so the relative order of the
// remaining sets is preserved. Returns true when an entry was removed and
// false when the set was not found.
static bool ggml_backend_metal_device_remove_residency_set(struct ggml_backend_metal_device_context * ctx, id<MTLResidencySet> residency_set) {
    assert(ctx != NULL);
    assert(residency_set != nil);

    for (int i = 0; i < ctx->mtl_residency_set_n; ++i) {
        if (ctx->mtl_residency_set[i] != residency_set) {
            continue;
        }

        // close the gap left by the removed entry
        for (int j = i; j < ctx->mtl_residency_set_n - 1; ++j) {
            ctx->mtl_residency_set[j] = ctx->mtl_residency_set[j + 1];
        }

        ctx->mtl_residency_set_n--;

        // clear the vacated tail slot so no stale pointer lingers in the array
        ctx->mtl_residency_set[ctx->mtl_residency_set_n] = nil;

        return true;
    }

    return false;
}
139+
98140// kernels
99141
100142struct ggml_metal_kernel {
@@ -483,6 +525,11 @@ @implementation GGMLMetalClass
483525 GGML_LOG_INFO (" %s : picking default device: %s \n " , __func__, [[device name ] UTF8String ]);
484526
485527 ctx->queue = [device newCommandQueue ];
528+ if (ctx->queue == nil ) {
529+ GGML_LOG_ERROR (" %s : error: failed to create command queue\n " , __func__);
530+ return NULL ;
531+ }
532+
486533 ctx->d_queue = dispatch_queue_create (" ggml-metal" , DISPATCH_QUEUE_CONCURRENT);
487534
488535 id <MTLLibrary > metal_library;
@@ -1035,6 +1082,8 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
10351082 // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
10361083 int n_buffers;
10371084 struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
1085+
1086+ id <MTLResidencySet> residency_set;
10381087};
10391088
10401089// finds the Metal buffer that contains the tensor data on the GPU device
@@ -4039,6 +4088,23 @@ static enum ggml_status ggml_metal_graph_compute(
40394088 struct ggml_backend_metal_context * ctx = backend->context ;
40404089 struct ggml_backend_metal_device_context * ctx_dev = backend->device ->context ;
40414090
4091+ // attach residency sets to the queue on the first run
4092+ // also tested attaching them on each run, but it does not make a difference
4093+ static bool is_first = true ;
4094+ if (is_first) {
4095+ is_first = false ;
4096+ GGML_LOG_INFO (" %s : adding %d residency sets\n " , __func__, ctx_dev->mtl_residency_set_n );
4097+ [ctx->queue addResidencySets: ctx_dev->mtl_residency_set count: ctx_dev->mtl_residency_set_n];
4098+ }
4099+
4100+ // this does not make a difference
4101+ // for (int i = 0; i < ctx_dev->mtl_residency_set_n; ++i) {
4102+ // GGML_LOG_INFO("%s: residency set %d allocations size = %zu\n", __func__, i, [ctx_dev->mtl_residency_set[i] allocatedSize]);
4103+ // [ctx_dev->mtl_residency_set[i] requestResidency];
4104+ // }
4105+
4106+ int64_t t_start_us = ggml_time_us ();
4107+
40424108 // number of nodes encoded by the main thread (empirically determined)
40434109 const int n_main = 128 ;
40444110
@@ -4086,19 +4152,25 @@ static enum ggml_status ggml_metal_graph_compute(
40864152 // the main thread commits the first few commands immediately
40874153 // command_buffer[n_cb]
40884154 {
4089- id <MTLCommandBuffer > command_buffer = [ctx->queue commandBufferWithUnretainedReferences ];
4155+ id <MTLCommandBuffer > command_buffer = [ctx->queue commandBuffer ];
40904156 ctx->command_buffers [n_cb] = command_buffer;
40914157
4158+ // does not make a difference
4159+ [command_buffer useResidencySets: ctx_dev->mtl_residency_set count: ctx_dev->mtl_residency_set_n];
4160+
40924161 [command_buffer enqueue ];
40934162 ctx->encode_async (n_cb);
40944163 }
40954164
40964165 // prepare the rest of the command buffers asynchronously
40974166 // command_buffer[0.. n_cb)
40984167 for (int cb_idx = 0 ; cb_idx < n_cb; ++cb_idx) {
4099- id <MTLCommandBuffer > command_buffer = [ctx->queue commandBufferWithUnretainedReferences ];
4168+ id <MTLCommandBuffer > command_buffer = [ctx->queue commandBuffer ];
41004169 ctx->command_buffers [cb_idx] = command_buffer;
41014170
4171+ // does not make a difference
4172+ [command_buffer useResidencySets: ctx_dev->mtl_residency_set count: ctx_dev->mtl_residency_set_n];
4173+
41024174 // always enqueue the first two command buffers
41034175 // enqueue all of the command buffers if we don't need to abort
41044176 if (cb_idx < 2 || ctx->abort_callback == NULL ) {
@@ -4163,6 +4235,10 @@ static enum ggml_status ggml_metal_graph_compute(
41634235 }
41644236 }
41654237
4238+ int64_t t_end_us = ggml_time_us ();
4239+
4240+ GGML_LOG_DEBUG (" %s : compute graph took %8.2f ms\n " , __func__, (t_end_us - t_start_us) / 1000.0 );
4241+
41664242 return GGML_STATUS_SUCCESS;
41674243}
41684244
@@ -4176,6 +4252,13 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
41764252 for (int i = 0 ; i < ctx->n_buffers ; i++) {
41774253 [ctx->buffers[i].metal release ];
41784254 }
4255+
4256+ ggml_backend_metal_device_remove_residency_set (buffer->buft ->device ->context , ctx->residency_set );
4257+
4258+ [ctx->residency_set endResidency ];
4259+ [ctx->residency_set removeAllAllocations ];
4260+ [ctx->residency_set release ];
4261+
41794262 ggml_backend_metal_device_rel (buffer->buft ->device ->context );
41804263
41814264 if (ctx->owned ) {
@@ -4284,7 +4367,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
42844367 size_aligned += (size_page - (size_aligned % size_page));
42854368 }
42864369
4287- id <MTLDevice > device = ggml_backend_metal_device_acq (buft->device ->context );
4370+ struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device ->context ;
4371+ id <MTLDevice > device = ggml_backend_metal_device_acq (ctx_dev);
42884372
42894373 ctx->all_data = ggml_metal_host_malloc (size_aligned);
42904374 ctx->all_size = size_aligned;
@@ -4307,10 +4391,34 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
43074391 if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers [0 ].metal == nil )) {
43084392 GGML_LOG_ERROR (" %s : error: failed to allocate buffer, size = %8.2f MiB\n " , __func__, size_aligned / 1024.0 / 1024.0 );
43094393 free (ctx);
4310- ggml_backend_metal_device_rel (buft-> device -> context );
4394+ ggml_backend_metal_device_rel (ctx_dev );
43114395 return NULL ;
43124396 }
43134397
4398+ {
4399+ MTLResidencySetDescriptor * desc;
4400+ desc = [[MTLResidencySetDescriptor alloc ] init ];
4401+ desc.label = @" Primary residency set" ;
4402+ desc.initialCapacity = ctx->n_buffers ;
4403+
4404+ NSError *error;
4405+ ctx->residency_set = [device newResidencySetWithDescriptor: desc error: &error];
4406+ if (error) {
4407+ GGML_LOG_ERROR (" %s : error: %s \n " , __func__, [[error description ] UTF8String ]);
4408+ return NULL ;
4409+ }
4410+
4411+ for (int i = 0 ; i < ctx->n_buffers ; i++) {
4412+ [ctx->residency_set addAllocation: ctx->buffers[i].metal];
4413+ }
4414+
4415+ [ctx->residency_set commit ];
4416+ [ctx->residency_set requestResidency ];
4417+
4418+ // track the residency set in the device context
4419+ ggml_backend_metal_device_add_residency_set (ctx_dev, ctx->residency_set );
4420+ }
4421+
43144422 // ggml_backend_metal_log_allocated_size(device, size_aligned);
43154423
43164424 return ggml_backend_buffer_init (buft, ggml_backend_metal_buffer_i, ctx, size);
@@ -4400,7 +4508,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
44004508 size_aligned += (size_page - (size_aligned % size_page));
44014509 }
44024510
4403- id <MTLDevice > device = ggml_backend_metal_device_acq (&g_ggml_ctx_dev_main);
4511+ struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
4512+ id <MTLDevice > device = ggml_backend_metal_device_acq (ctx_dev);
44044513
44054514 // the buffer fits into the max buffer size allowed by the device
44064515 if (size_aligned <= device.maxBufferLength ) {
@@ -4453,6 +4562,30 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
44534562 }
44544563 }
44554564
4565+ {
4566+ MTLResidencySetDescriptor * desc;
4567+ desc = [[MTLResidencySetDescriptor alloc ] init ];
4568+ desc.label = @" Primary residency set" ;
4569+ desc.initialCapacity = ctx->n_buffers ;
4570+
4571+ NSError *error;
4572+ ctx->residency_set = [device newResidencySetWithDescriptor: desc error: &error];
4573+ if (error) {
4574+ GGML_LOG_ERROR (" %s : error: %s \n " , __func__, [[error description ] UTF8String ]);
4575+ return NULL ;
4576+ }
4577+
4578+ for (int i = 0 ; i < ctx->n_buffers ; i++) {
4579+ [ctx->residency_set addAllocation: ctx->buffers[i].metal];
4580+ }
4581+
4582+ [ctx->residency_set commit ];
4583+ [ctx->residency_set requestResidency ];
4584+
4585+ // track the residency set in the device context
4586+ ggml_backend_metal_device_add_residency_set (ctx_dev, ctx->residency_set );
4587+ }
4588+
44564589 return ggml_backend_buffer_init (ggml_backend_metal_buffer_from_ptr_type (), ggml_backend_metal_buffer_i, ctx, size);
44574590}
44584591
@@ -4766,6 +4899,30 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
47664899 }
47674900 }
47684901
4902+ {
4903+ MTLResidencySetDescriptor * desc;
4904+ desc = [[MTLResidencySetDescriptor alloc ] init ];
4905+ desc.label = @" Primary residency set" ;
4906+ desc.initialCapacity = ctx->n_buffers ;
4907+
4908+ NSError *error;
4909+ ctx->residency_set = [device newResidencySetWithDescriptor: desc error: &error];
4910+ if (error) {
4911+ GGML_LOG_ERROR (" %s : error: %s \n " , __func__, [[error description ] UTF8String ]);
4912+ return NULL ;
4913+ }
4914+
4915+ for (int i = 0 ; i < ctx->n_buffers ; i++) {
4916+ [ctx->residency_set addAllocation: ctx->buffers[i].metal];
4917+ }
4918+
4919+ [ctx->residency_set commit ];
4920+ [ctx->residency_set requestResidency ];
4921+
4922+ // track the residency set in the device context
4923+ ggml_backend_metal_device_add_residency_set (ctx_dev, ctx->residency_set );
4924+ }
4925+
47694926 return ggml_backend_buffer_init (ggml_backend_metal_buffer_from_ptr_type (), ggml_backend_metal_buffer_i, ctx, size);
47704927}
47714928
0 commit comments