1919// max number of MTLCommandBuffer used to submit a graph for processing
2020#define GGML_METAL_MAX_COMMAND_BUFFERS 8
2121
22- #define UNUSED (x ) (void )(x)
22+ // create residency sets only on macOS >= 15.0
23+ #if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
24+ #define GGML_METAL_HAS_RESIDENCY_SETS 1
25+ #endif
2326
2427// globals
2528
3942
4043 bool has_simdgroup_reduction;
4144 bool has_simdgroup_mm;
45+ bool has_residency_sets;
4246 bool has_bfloat;
4347 bool use_bfloat;
4448
4852 /* .mtl_device_ref_count =*/ 0 ,
4953 /* .has_simdgroup_reduction =*/ false ,
5054 /* .has_simdgroup_mm =*/ false ,
55+ /* .has_residency_sets =*/ false ,
5156 /* .has_bfloat =*/ false ,
5257 /* .use_bfloat =*/ false ,
5358 /* .name =*/ " " ,
6570
6671 ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily: MTLGPUFamilyApple7];
6772
73+ #if defined(GGML_METAL_HAS_RESIDENCY_SETS)
74+ ctx->has_residency_sets = getenv (" GGML_METAL_NO_RESIDENCY" ) == NULL ;
75+ #endif
76+
6877 ctx->has_bfloat = [ctx->mtl_device supportsFamily: MTLGPUFamilyMetal3_GGML];
6978 ctx->has_bfloat |= [ctx->mtl_device supportsFamily: MTLGPUFamilyApple6];
7079
@@ -483,6 +492,11 @@ @implementation GGMLMetalClass
483492 GGML_LOG_INFO (" %s : picking default device: %s \n " , __func__, [[device name ] UTF8String ]);
484493
485494 ctx->queue = [device newCommandQueue ];
495+ if (ctx->queue == nil ) {
496+ GGML_LOG_ERROR (" %s : error: failed to create command queue\n " , __func__);
497+ return NULL ;
498+ }
499+
486500 ctx->d_queue = dispatch_queue_create (" ggml-metal" , DISPATCH_QUEUE_CONCURRENT);
487501
488502 id <MTLLibrary > metal_library;
@@ -649,6 +663,7 @@ @implementation GGMLMetalClass
649663
650664 GGML_LOG_INFO (" %s : simdgroup reduction = %s \n " , __func__, ctx_dev->has_simdgroup_reduction ? " true" : " false" );
651665 GGML_LOG_INFO (" %s : simdgroup matrix mul. = %s \n " , __func__, ctx_dev->has_simdgroup_mm ? " true" : " false" );
666+ GGML_LOG_INFO (" %s : has residency sets = %s \n " , __func__, ctx_dev->has_residency_sets ? " true" : " false" );
652667 GGML_LOG_INFO (" %s : has bfloat = %s \n " , __func__, ctx_dev->has_bfloat ? " true" : " false" );
653668 GGML_LOG_INFO (" %s : use bfloat = %s \n " , __func__, ctx_dev->use_bfloat ? " true" : " false" );
654669 GGML_LOG_INFO (" %s : hasUnifiedMemory = %s \n " , __func__, ctx_dev->mtl_device .hasUnifiedMemory ? " true" : " false" );
@@ -1035,8 +1050,70 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
10351050 // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
10361051 int n_buffers;
10371052 struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
1053+
1054+ // optional MTLResidencySet
1055+ id rset;
10381056};
10391057
// rset init
//
// Creates the optional MTLResidencySet for a buffer context and requests residency for the
// Metal buffers it holds.
//
//   ctx     - buffer context; ctx->rset is reset to nil first and holds the new set on success
//   ctx_dev - device context; nothing is created when has_residency_sets is false
//   device  - Metal device used to create the residency set
//
// Returns false only when creating the residency set failed. Returns true when the set was
// created, and also when residency sets are disabled, not compiled in, or not available at
// runtime (ctx->rset stays nil in those cases).
static bool ggml_backend_metal_buffer_rset_init(
        struct ggml_backend_metal_buffer_context * ctx,
        struct ggml_backend_metal_device_context * ctx_dev,
        id<MTLDevice> device) {
    ctx->rset = nil;

    if (!ctx_dev->has_residency_sets) {
        return true;
    }

#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, *)) {
        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
        desc.label = @"ggml_backend_metal";
        desc.initialCapacity = ctx->n_buffers;

        // objects are released explicitly here (no ARC), so a local pointer is not
        // zero-initialized for us - start from nil to avoid reading garbage below
        NSError * error = nil;
        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];

        // the descriptor is no longer needed, whether or not the call succeeded
        [desc release];

        // Cocoa convention: the returned object signals failure, the error only describes it
        if (ctx->rset == nil) {
            GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown");
            return false;
        }

        for (int i = 0; i < ctx->n_buffers; i++) {
            // NOTE(review): a zero-sized allocation appears to keep a nil Metal buffer - skip those
            if (ctx->buffers[i].metal == nil) {
                continue;
            }

            [ctx->rset addAllocation:ctx->buffers[i].metal];
        }

        [ctx->rset commit];
        [ctx->rset requestResidency];

        return true;
    }
#else
    GGML_UNUSED(ctx_dev);
    GGML_UNUSED(device);
#endif

    return true;
}
1101+
// rset free
//
// Tears down the residency set owned by a buffer context, if one was created.
// Does nothing when ctx->rset is nil or when residency sets are not compiled in.
static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, *)) {
        id residency_set = ctx->rset;

        if (residency_set == nil) {
            return;
        }

        // end residency, drop the allocations, then give up our reference
        [residency_set endResidency];
        [residency_set removeAllAllocations];
        [residency_set release];
    }
#else
    GGML_UNUSED(ctx);
#endif
}
1116+
10401117// finds the Metal buffer that contains the tensor data on the GPU device
10411118// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
10421119// Metal buffer based on the host memory pointer
@@ -4176,6 +4253,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
41764253 for (int i = 0 ; i < ctx->n_buffers ; i++) {
41774254 [ctx->buffers[i].metal release ];
41784255 }
4256+
4257+ ggml_backend_metal_buffer_rset_free (ctx);
41794258 ggml_backend_metal_device_rel (buffer->buft ->device ->context );
41804259
41814260 if (ctx->owned ) {
@@ -4198,19 +4277,19 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
// Fills `size` bytes of the tensor data, starting `offset` bytes in, with `value`.
// The write goes straight through tensor->data, so the buffer handle is not needed.
static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    char * dst = (char *) tensor->data + offset;
    memset(dst, value, size);
}
42034282
// Copies `size` bytes from `data` into the tensor data, starting `offset` bytes in.
// The write goes straight through tensor->data, so the buffer handle is not needed.
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    char * dst = (char *) tensor->data + offset;
    memcpy(dst, data, size);
}
42094288
// Copies `size` bytes of the tensor data, starting `offset` bytes in, out to `data`.
// The read goes straight through tensor->data, so the buffer handle is not needed.
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    const char * src = (const char *) tensor->data + offset;
    memcpy(data, src, size);
}
42154294
42164295static bool ggml_backend_metal_buffer_cpy_tensor (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
@@ -4220,7 +4299,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
42204299 }
42214300 return false ;
42224301
4223- UNUSED (buffer);
4302+ GGML_UNUSED (buffer);
42244303}
42254304
42264305static void ggml_backend_metal_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value) {
@@ -4246,7 +4325,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
// Returns the fixed name of this buffer type; the argument is ignored.
static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return "Metal";
}
42514330
42524331static void ggml_backend_metal_log_allocated_size (id <MTLDevice > device, size_t size_aligned) {
@@ -4270,8 +4349,8 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
42704349 }
42714350#endif
42724351#endif
4273- UNUSED (device);
4274- UNUSED (size_aligned);
4352+ GGML_UNUSED (device);
4353+ GGML_UNUSED (size_aligned);
42754354}
42764355
42774356static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size) {
@@ -4284,7 +4363,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
42844363 size_aligned += (size_page - (size_aligned % size_page));
42854364 }
42864365
4287- id <MTLDevice > device = ggml_backend_metal_device_acq (buft->device ->context );
4366+ struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device ->context ;
4367+ id <MTLDevice > device = ggml_backend_metal_device_acq (ctx_dev);
42884368
42894369 ctx->all_data = ggml_metal_host_malloc (size_aligned);
42904370 ctx->all_size = size_aligned;
@@ -4307,7 +4387,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
43074387 if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers [0 ].metal == nil )) {
43084388 GGML_LOG_ERROR (" %s : error: failed to allocate buffer, size = %8.2f MiB\n " , __func__, size_aligned / 1024.0 / 1024.0 );
43094389 free (ctx);
4310- ggml_backend_metal_device_rel (buft->device ->context );
4390+ ggml_backend_metal_device_rel (ctx_dev);
4391+ return NULL ;
4392+ }
4393+
4394+ if (!ggml_backend_metal_buffer_rset_init (ctx, ctx_dev, device)) {
4395+ GGML_LOG_ERROR (" %s : error: failed to initialize residency set\n " , __func__);
4396+ free (ctx);
4397+ ggml_backend_metal_device_rel (ctx_dev);
43114398 return NULL ;
43124399 }
43134400
@@ -4318,7 +4405,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
43184405
// Returns the alignment reported for this buffer type (a constant 32); the argument is ignored.
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    const size_t alignment = 32;
    return alignment;
}
43234410
43244411static size_t ggml_backend_metal_buffer_type_get_max_size (ggml_backend_buffer_type_t buft) {
@@ -4328,13 +4415,13 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty
43284415
43294416 return max_size;
43304417
4331- UNUSED (buft);
4418+ GGML_UNUSED (buft);
43324419}
43334420
// Always reports this buffer type as a host buffer; the argument is ignored.
static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return true;
}
43394426
43404427ggml_backend_buffer_type_t ggml_backend_metal_buffer_type (void ) {
@@ -4357,7 +4444,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
// Returns the fixed name of the from-pointer buffer type; the argument is ignored.
static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return "Metal_Mapped";
}
43624449
43634450static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type (void ) {
@@ -4400,7 +4487,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
44004487 size_aligned += (size_page - (size_aligned % size_page));
44014488 }
44024489
4403- id <MTLDevice > device = ggml_backend_metal_device_acq (&g_ggml_ctx_dev_main);
4490+ struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
4491+ id <MTLDevice > device = ggml_backend_metal_device_acq (ctx_dev);
44044492
44054493 // the buffer fits into the max buffer size allowed by the device
44064494 if (size_aligned <= device.maxBufferLength ) {
@@ -4453,6 +4541,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
44534541 }
44544542 }
44554543
4544+ if (!ggml_backend_metal_buffer_rset_init (ctx, ctx_dev, device)) {
4545+ GGML_LOG_ERROR (" %s : error: failed to initialize residency set\n " , __func__);
4546+ free (ctx);
4547+ ggml_backend_metal_device_rel (ctx_dev);
4548+ return NULL ;
4549+ }
4550+
44564551 return ggml_backend_buffer_init (ggml_backend_metal_buffer_from_ptr_type (), ggml_backend_metal_buffer_i, ctx, size);
44574552}
44584553
@@ -4461,7 +4556,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
// Returns the fixed name of the backend; the argument is ignored.
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
    GGML_UNUSED(backend);

    return "Metal";
}
44664561
44674562static void ggml_backend_metal_free (ggml_backend_t backend) {
@@ -4766,6 +4861,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
47664861 }
47674862 }
47684863
4864+ if (!ggml_backend_metal_buffer_rset_init (ctx, ctx_dev, device)) {
4865+ GGML_LOG_ERROR (" %s : error: failed to initialize residency set\n " , __func__);
4866+ free (ctx);
4867+ ggml_backend_metal_device_rel (ctx_dev);
4868+ return NULL ;
4869+ }
4870+
47694871 return ggml_backend_buffer_init (ggml_backend_metal_buffer_from_ptr_type (), ggml_backend_metal_buffer_i, ctx, size);
47704872}
47714873
@@ -4779,7 +4881,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml
47794881 return buft->iface .get_name == ggml_backend_metal_buffer_type_get_name ||
47804882 buft->iface .get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
47814883
4782- UNUSED (dev);
4884+ GGML_UNUSED (dev);
47834885}
47844886
47854887static bool ggml_backend_metal_device_offload_op (ggml_backend_dev_t dev, const struct ggml_tensor * op) {
0 commit comments