@@ -376,6 +376,7 @@ - (void) dealloc {
376376
377377 // how many times a given op was fused
378378 uint64_t fuse_cnt[GGML_OP_COUNT];
379+
379380 // capture state
380381 bool capture_next_compute;
381382 bool capture_started;
@@ -490,7 +491,7 @@ - (void) dealloc {
490491
491492 ctx->cmd_buf_last = nil ;
492493
493- // load kernels
494+ // load default kernels
494495 {
495496 NSError * error = nil ;
496497
@@ -501,12 +502,12 @@ - (void) dealloc {
501502#define GGML_METAL_ADD_KERNEL (e, name, supported ) \
502503 if (supported) { \
503504 struct ggml_metal_kernel * kernel = &ctx->kernels [e]; \
504- id <MTLFunction > metal_function = [ctx->library newFunctionWithName: @" kernel_" #name]; \
505- kernel->pipeline = [ctx->device newComputePipelineStateWithFunction: metal_function error: &error]; \
505+ id <MTLFunction > function = [ctx->library newFunctionWithName: @" kernel_" #name]; \
506+ kernel->pipeline = [ctx->device newComputePipelineStateWithFunction: function error: &error]; \
506507 GGML_LOG_DEBUG (" %s : loaded %-40s %16p | th_max = %4d | th_width = %4d \n " , __func__, " kernel_" #name, (void *) kernel->pipeline , \
507508 (int ) kernel->pipeline .maxTotalThreadsPerThreadgroup , \
508509 (int ) kernel->pipeline .threadExecutionWidth ); \
509- [metal_function release ]; \
510+ [function release ]; \
510511 if (error) { \
511512 GGML_LOG_ERROR (" %s : error: load pipeline error: %s \n " , __func__, [[error description ] UTF8String ]); \
512513 return NULL ; \
@@ -1140,6 +1141,25 @@ static size_t ggml_metal_flash_attn_ext_extra_tmp(const struct ggml_tensor * op)
11401141static void ggml_metal_free (struct ggml_backend_metal_context * ctx) {
11411142 GGML_LOG_INFO (" %s : deallocating\n " , __func__);
11421143
1144+ for (int i = 0 ; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
1145+ if (ctx->cmd_bufs [i].obj ) {
1146+ [ctx->cmd_bufs[i].obj release ];
1147+ }
1148+
1149+ if (ctx->cmd_bufs [i].mem_ranges ) {
1150+ ggml_mem_ranges_free (ctx->cmd_bufs [i].mem_ranges );
1151+ }
1152+ }
1153+
1154+ for (int i = 0 ; i < (int ) ctx->cmd_bufs_ext .count ; ++i) {
1155+ if (ctx->cmd_bufs_ext [i]) {
1156+ [ctx->cmd_bufs_ext[i] release ];
1157+ }
1158+ }
1159+
1160+ [ctx->cmd_bufs_ext removeAllObjects ];
1161+ [ctx->cmd_bufs_ext release ];
1162+
11431163 for (int i = 0 ; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
11441164 [ctx->kernels[i].pipeline release ];
11451165 }
@@ -1165,25 +1185,6 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
11651185
11661186 // [ctx->queue release]; // [TAG_QUEUE_PER_BACKEND]
11671187
1168- for (int i = 0 ; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
1169- if (ctx->cmd_bufs [i].obj ) {
1170- [ctx->cmd_bufs[i].obj release ];
1171- }
1172-
1173- if (ctx->cmd_bufs [i].mem_ranges ) {
1174- ggml_mem_ranges_free (ctx->cmd_bufs [i].mem_ranges );
1175- }
1176- }
1177-
1178- for (int i = 0 ; i < (int ) ctx->cmd_bufs_ext .count ; ++i) {
1179- if (ctx->cmd_bufs_ext [i]) {
1180- [ctx->cmd_bufs_ext[i] release ];
1181- }
1182- }
1183-
1184- [ctx->cmd_bufs_ext removeAllObjects ];
1185- [ctx->cmd_bufs_ext release ];
1186-
11871188 dispatch_release (ctx->d_queue );
11881189
11891190 free (ctx);
@@ -5528,6 +5529,8 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(void) {
55285529
55295530// backend
55305531
5532+ static void ggml_backend_metal_synchronize (ggml_backend_t backend);
5533+
55315534static const char * ggml_backend_metal_name (ggml_backend_t backend) {
55325535 return " Metal" ;
55335536
@@ -5537,6 +5540,9 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(void) {
55375540static void ggml_backend_metal_free (ggml_backend_t backend) {
55385541 struct ggml_backend_metal_context * ctx = backend->context ;
55395542
5543+ // wait for any ongoing async operations to finish
5544+ ggml_backend_metal_synchronize (backend);
5545+
55405546 ggml_metal_free (ctx);
55415547
55425548 free (backend);
0 commit comments