@@ -371,6 +371,8 @@ - (void) dealloc {
     id<MTLLibrary>      library;
     id<MTLCommandQueue> queue; // currently a pointer to the device queue, but might become separate queue [TAG_QUEUE_PER_BACKEND]

+    struct ggml_backend_metal_device_props props_dev;
+
     dispatch_queue_t d_queue;

     // the set of pre-compiled kernels for this context
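
Note on the hunk above: the device properties are now cached by value in the backend context, so per-node code can read ctx->props_dev instead of calling ggml_backend_metal_device_get_props() each time. For orientation only, a rough sketch of the fields this diff relies on, inferred from the usages further down; the real definition lives in the device code and may contain more members:

    #include <stdbool.h>
    #include <stddef.h>

    // illustrative sketch only -- not the actual definition
    struct ggml_backend_metal_device_props {
        char   name[128];                  // assumed capacity, for illustration
        bool   has_simdgroup_mm;
        bool   has_simdgroup_reduction;
        bool   use_bfloat;
        bool   supports_gpu_family_apple7;
        bool   use_shared_buffers;
        bool   use_residency_sets;
        size_t max_theadgroup_memory_size; // field name as spelled in the source
        size_t max_buffer_size;
    };
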
@@ -481,6 +483,8 @@ @implementation GGMLMetalClass
         return NULL;
     }

+    ctx->props_dev = ggml_backend_metal_device_get_props(dev->context);
+
     ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

     ctx->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil;
@@ -496,8 +500,6 @@ @implementation GGMLMetalClass
         ctx->debug_fusion = val ? atoi(val) : 0;
     }

-    struct ggml_backend_metal_device_props props = ggml_backend_metal_device_get_props(dev->context);
-
     ctx->use_graph_optimize = true;

     if (getenv("GGML_METAL_GRAPH_OPTIMIZE_DISABLE") != NULL) {
@@ -553,9 +555,9 @@ @implementation GGMLMetalClass
             GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \
         }

-    const bool has_simdgroup_mm        = props.has_simdgroup_mm;
-    const bool has_simdgroup_reduction = props.has_simdgroup_reduction;
-    const bool use_bfloat              = props.use_bfloat;
+    const bool has_simdgroup_mm        = ctx->props_dev.has_simdgroup_mm;
+    const bool has_simdgroup_reduction = ctx->props_dev.has_simdgroup_reduction;
+    const bool use_bfloat              = ctx->props_dev.use_bfloat;

     // simd_sum and simd_max requires MTLGPUFamilyApple7

@@ -1213,6 +1215,12 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
         }
     }

+    for (int i = 0; i < (int) ctx->cmd_bufs_ext.count; ++i) {
+        if (ctx->cmd_bufs_ext[i]) {
+            [ctx->cmd_bufs_ext[i] release];
+        }
+    }
+
     [ctx->cmd_bufs_ext removeAllObjects];
     [ctx->cmd_bufs_ext release];

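
For context, a minimal sketch of the cleanup pattern the new loop follows, assuming the file is built without ARC and that each command buffer stored in cmd_bufs_ext was given an extra retain when it was added (that assumption is what makes the per-element release necessary before the array itself is emptied and released):

    #import <Metal/Metal.h>

    // sketch only; assumes manual reference counting (-fno-objc-arc)
    int main(void) {
        @autoreleasepool {
            id<MTLDevice> device = MTLCreateSystemDefaultDevice();
            id<MTLCommandQueue> queue = [device newCommandQueue];

            NSMutableArray * cmd_bufs_ext = [[NSMutableArray alloc] init];

            // store a command buffer with an explicit extra retain (assumed usage)
            id<MTLCommandBuffer> cb = [queue commandBuffer];
            [cmd_bufs_ext addObject:cb];
            [cb retain];

            // teardown mirrors the new loop above: balance the retain, then drop the array
            for (int i = 0; i < (int) cmd_bufs_ext.count; ++i) {
                if (cmd_bufs_ext[i]) {
                    [cmd_bufs_ext[i] release];
                }
            }
            [cmd_bufs_ext removeAllObjects];
            [cmd_bufs_ext release];

            [queue release];
            [device release];
        }
        return 0;
    }
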
@@ -1337,12 +1345,10 @@ static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer
     return nil;
 }

-static bool ggml_metal_supports_op(ggml_backend_metal_device_t ctx_dev, const struct ggml_tensor * op) {
-    const struct ggml_backend_metal_device_props props = ggml_backend_metal_device_get_props(ctx_dev);
-
-    const bool has_simdgroup_mm        = props.has_simdgroup_mm;
-    const bool has_simdgroup_reduction = props.has_simdgroup_reduction;
-    const bool use_bfloat              = props.use_bfloat;
+static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_props * props_dev, const struct ggml_tensor * op) {
+    const bool has_simdgroup_mm        = props_dev->has_simdgroup_mm;
+    const bool has_simdgroup_reduction = props_dev->has_simdgroup_reduction;
+    const bool use_bfloat              = props_dev->use_bfloat;

     if (!use_bfloat) {
         if (op->type == GGML_TYPE_BF16) {
@@ -1607,8 +1613,7 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in

     id<MTLComputeCommandEncoder> encoder = ctx_enc->encoder;

-    struct ggml_backend_metal_context * ctx     = backend->context;
-    struct ggml_backend_metal_device  * ctx_dev = backend->device->context;
+    struct ggml_backend_metal_context * ctx = backend->context;

     struct ggml_cgraph * gf = ctx->gf;

@@ -1642,7 +1647,7 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in
             } break;
     }

-    if (!ggml_metal_supports_op(ctx_dev, dst)) {
+    if (!ggml_metal_supports_op(&ctx->props_dev, dst)) {
         GGML_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
         GGML_ABORT("unsupported op");
     }
@@ -1753,8 +1758,6 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in
         }
     }

-    const struct ggml_backend_metal_device_props props = ggml_backend_metal_device_get_props(ctx_dev);
-
     switch (dst->op) {
         case GGML_OP_CONCAT:
             {
@@ -3132,7 +3135,7 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in
                 } else
                 // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                 // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                if (props.supports_gpu_family_apple7 &&
+                if (ctx->props_dev.supports_gpu_family_apple7 &&
                     !ggml_is_transposed(src0) &&
                     !ggml_is_transposed(src1) &&
                     src1t == GGML_TYPE_F32 &&
@@ -3470,7 +3473,7 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in

                 // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                 // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                if (props.supports_gpu_family_apple7 &&
+                if (ctx->props_dev.supports_gpu_family_apple7 &&
                     ne00 % 32 == 0 && ne00 >= 64 &&
                     (ne21 >= ne21_mm_id_min)) {
                     GGML_ASSERT(ne00 % 4 == 0);
@@ -3519,7 +3522,7 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in

                 const size_t smem = ne02*ne20*sizeof(uint16_t);

-                GGML_ASSERT(smem <= props.max_theadgroup_memory_size);
+                GGML_ASSERT(smem <= ctx->props_dev.max_theadgroup_memory_size);

                 [encoder setComputePipelineState:pipeline];
                 [encoder setBytes:&args length:sizeof(args) atIndex:0];
@@ -4704,7 +4707,7 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in
                 // nsgmax = 2;
                 // while (true) {
                 //     const size_t smem = FATTN_SMEM(nsgmax);
-                //     if (smem > props.max_theadgroup_memory_size) {
+                //     if (smem > ctx->props_dev.max_theadgroup_memory_size) {
                 //         break;
                 //     }
                 //     nsgmax *= 2;
@@ -4772,8 +4775,8 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in

                 [encoder setBuffer:id_dst offset:offs_dst atIndex:6];

-                // printf("smem: %zu, max: %zu, nsg = %d, ne02 = %d, ne12 = %d\n", smem, props.max_theadgroup_memory_size, (int) nsg, ne02, ne12);
-                GGML_ASSERT(smem <= props.max_theadgroup_memory_size);
+                // printf("smem: %zu, max: %zu, nsg = %d, ne02 = %d, ne12 = %d\n", smem, ctx->props_dev.max_theadgroup_memory_size, (int) nsg, ne02, ne12);
+                GGML_ASSERT(smem <= ctx->props_dev.max_theadgroup_memory_size);
                 [encoder setThreadgroupMemoryLength:smem atIndex:0];
                 [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
 #undef FATTN_SMEM
@@ -4800,7 +4803,7 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in
                 while (true) {
                     const size_t smem = FATTN_SMEM(nsgmax);
                     // avoid using more than half of the threadgroup memory - can cause slow downs especially for large head sizes
-                    if (smem > props.max_theadgroup_memory_size/2) {
+                    if (smem > ctx->props_dev.max_theadgroup_memory_size/2) {
                         break;
                     }
                     nsgmax *= 2;
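
The loop in this hunk picks the point where the estimated flash-attention scratch crosses half of the device's threadgroup memory limit, which is now read from ctx->props_dev. A self-contained sketch of the same doubling search, with a made-up smem_for_nsg() standing in for FATTN_SMEM and an assumed 32 KiB limit:

    #include <stdbool.h>
    #include <stdio.h>

    // hypothetical stand-in for FATTN_SMEM(nsg); the real macro depends on head size, etc.
    static size_t smem_for_nsg(int nsg) {
        return (size_t) nsg * 4608; // assumed per-simdgroup scratch, in bytes
    }

    int main(void) {
        const size_t max_theadgroup_memory_size = 32768; // assumed device limit (32 KiB)

        int nsgmax = 2;
        while (true) {
            const size_t smem = smem_for_nsg(nsgmax);
            // avoid using more than half of the threadgroup memory
            if (smem > max_theadgroup_memory_size/2) {
                break;
            }
            nsgmax *= 2;
        }

        // with these numbers the loop exits at nsgmax == 4, the first candidate whose
        // estimate (18432 B) exceeds the 16384 B half-limit; 2 simdgroups (9216 B) still fit
        printf("nsgmax = %d\n", nsgmax);
        return 0;
    }
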
@@ -4889,8 +4892,8 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in

                 const size_t smem = FATTN_SMEM(nsg);

-                // printf("smem: %zu, max: %zu, nsg = %d, nsgmax = %d\n", smem, props.max_theadgroup_memory_size, (int) nsg, (int) nsgmax);
-                GGML_ASSERT(smem <= props.max_theadgroup_memory_size);
+                // printf("smem: %zu, max: %zu, nsg = %d, nsgmax = %d\n", smem, ctx->props_dev.max_theadgroup_memory_size, (int) nsg, (int) nsgmax);
+                GGML_ASSERT(smem <= ctx->props_dev.max_theadgroup_memory_size);

                 if (nwg == 1) {
                     // using 1 workgroup -> write the result directly into dst
@@ -5666,12 +5669,12 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         size_aligned += (size_page - (size_aligned % size_page));
     }

-    struct ggml_backend_metal_device * ctx_dev = (struct ggml_backend_metal_device *)buft->device->context;
+    ggml_backend_metal_device_t ctx_dev = buft->device->context;

-    const struct ggml_backend_metal_device_props props = ggml_backend_metal_device_get_props(ctx_dev);
+    const struct ggml_backend_metal_device_props props_dev = ggml_backend_metal_device_get_props(ctx_dev);

     // allocate shared buffer if the device supports it and it is required by the buffer type
-    if (props.use_shared_buffers && shared) {
+    if (props_dev.use_shared_buffers && shared) {
         ctx->all_data = ggml_metal_host_malloc(size_aligned);
         ctx->is_shared = true;
     } else {
@@ -5682,7 +5685,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     ctx->all_size = size_aligned;

     ctx->device = ggml_backend_metal_device_get_device(ctx_dev);
-    ctx->queue = ggml_backend_metal_device_get_queue(ctx_dev);
+    ctx->queue  = ggml_backend_metal_device_get_queue (ctx_dev);

     ctx->n_buffers = 1;

@@ -5691,7 +5694,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     ctx->buffers[0].metal = nil;

     if (size_aligned > 0) {
-        if (props.use_shared_buffers) {
+        if (props_dev.use_shared_buffers) {
             ctx->buffers[0].metal = [ctx->device newBufferWithBytesNoCopy:ctx->all_data
                                                length:size_aligned
                                                options:MTLResourceStorageModeShared
@@ -5712,7 +5715,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
             return NULL;
         }

-    ctx->use_residency_sets = props.use_residency_sets;
+    ctx->use_residency_sets = props_dev.use_residency_sets;

     if (!ggml_backend_metal_buffer_rset_init(ctx)) {
         GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
@@ -5771,7 +5774,7 @@ static size_t ggml_backend_metal_buffer_type_shared_get_alignment(ggml_backend_b
 }

 static size_t ggml_backend_metal_buffer_type_shared_get_max_size(ggml_backend_buffer_type_t buft) {
-    return ggml_backend_metal_device_get_props(((struct ggml_backend_metal_device *)buft->device->context)).max_buffer_size;
+    return ggml_backend_metal_device_get_props(buft->device->context).max_buffer_size;
 }

 static size_t ggml_backend_metal_buffer_type_shared_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
@@ -5820,7 +5823,7 @@ static size_t ggml_backend_metal_buffer_type_private_get_alignment(ggml_backend_
 }

 static size_t ggml_backend_metal_buffer_type_private_get_max_size(ggml_backend_buffer_type_t buft) {
-    return ggml_backend_metal_device_get_props(((struct ggml_backend_metal_device *)buft->device->context)).max_buffer_size;
+    return ggml_backend_metal_device_get_props(buft->device->context).max_buffer_size;
 }

 static size_t ggml_backend_metal_buffer_type_private_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
@@ -5870,7 +5873,7 @@ static size_t ggml_backend_metal_buffer_type_mapped_get_alignment(ggml_backend_b
 }

 static size_t ggml_backend_metal_buffer_type_mapped_get_max_size(ggml_backend_buffer_type_t buft) {
-    return ggml_backend_metal_device_get_props(((struct ggml_backend_metal_device *)buft->device->context)).max_buffer_size;
+    return ggml_backend_metal_device_get_props(buft->device->context).max_buffer_size;
 }

 static size_t ggml_backend_metal_buffer_type_mapped_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
@@ -6235,15 +6238,11 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
 }

 static const char * ggml_backend_metal_device_get_description(ggml_backend_dev_t dev) {
-    struct ggml_backend_metal_device * ctx_dev = (struct ggml_backend_metal_device *)dev->context;
-
-    return ggml_backend_metal_device_get_props(ctx_dev).name;
+    return ggml_backend_metal_device_get_props(dev->context).name;
 }

 static void ggml_backend_metal_device_get_memory_ext(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    struct ggml_backend_metal_device * ctx_dev = (struct ggml_backend_metal_device *)dev->context;
-
-    ggml_backend_metal_device_get_memory(ctx_dev, free, total);
+    ggml_backend_metal_device_get_memory(dev->context, free, total);
 }

 static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backend_dev_t dev) {
@@ -6291,11 +6290,9 @@ static ggml_backend_t ggml_backend_metal_device_init_ext(ggml_backend_dev_t dev,
 }

 static ggml_backend_buffer_type_t ggml_backend_metal_device_get_buffer_type(ggml_backend_dev_t dev) {
-    struct ggml_backend_metal_device * ctx_dev = dev->context;
-
-    const struct ggml_backend_metal_device_props props = ggml_backend_metal_device_get_props(ctx_dev);
+    const struct ggml_backend_metal_device_props props_dev = ggml_backend_metal_device_get_props(dev->context);

-    return props.use_shared_buffers ? ggml_backend_metal_buffer_type_shared() : ggml_backend_metal_buffer_type_private();
+    return props_dev.use_shared_buffers ? ggml_backend_metal_buffer_type_shared() : ggml_backend_metal_buffer_type_private();
 }

 static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
@@ -6322,15 +6319,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backen
         size_aligned += (size_page - (size_aligned % size_page));
     }

-    struct ggml_backend_metal_device * ctx_dev = (struct ggml_backend_metal_device *)dev->context;
-
-    ctx->device = ggml_backend_metal_device_get_device(ctx_dev);
-    ctx->queue  = ggml_backend_metal_device_get_queue (ctx_dev);
+    ctx->device = ggml_backend_metal_device_get_device(dev->context);
+    ctx->queue  = ggml_backend_metal_device_get_queue (dev->context);

-    const struct ggml_backend_metal_device_props props = ggml_backend_metal_device_get_props(ctx_dev);
+    const struct ggml_backend_metal_device_props props_dev = ggml_backend_metal_device_get_props(dev->context);

     // the buffer fits into the max buffer size allowed by the device
-    if (size_aligned <= props.max_buffer_size) {
+    if (size_aligned <= props_dev.max_buffer_size) {
         ctx->buffers[ctx->n_buffers].data  = ptr;
         ctx->buffers[ctx->n_buffers].size  = size;
         ctx->buffers[ctx->n_buffers].metal = nil;
@@ -6351,8 +6346,8 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backen
         // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
         // one of the views
         const size_t size_ovlp = ((max_tensor_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
-        const size_t size_step = props.max_buffer_size - size_ovlp;
-        const size_t size_view = props.max_buffer_size;
+        const size_t size_step = props_dev.max_buffer_size - size_ovlp;
+        const size_t size_view = props_dev.max_buffer_size;

         for (size_t i = 0; i < size; i += size_step) {
             const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
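
For the mapped-buffer path above, the overlapping-view arithmetic can be checked in isolation. A small sketch with made-up sizes (an 8 MiB max buffer size is unrealistically small for Metal, but it keeps the numbers readable); it reproduces the size_ovlp/size_step/size_view computation from the hunk:

    #include <stdio.h>

    int main(void) {
        // assumed example values, not real device limits
        const size_t size_page       = 16384;
        const size_t max_buffer_size = 8u  * 1024 * 1024;  // device limit for the example
        const size_t max_tensor_size = 600 * 1024;         // largest tensor in the mapped region
        const size_t size            = 20u * 1024 * 1024;  // total mapped size (page-aligned here)

        // overlap the views by a little more than the largest tensor, so every tensor
        // is fully contained in at least one view
        const size_t size_ovlp = ((max_tensor_size + size_page - 1) / size_page + 1) * size_page;
        const size_t size_step = max_buffer_size - size_ovlp;
        const size_t size_view = max_buffer_size;

        for (size_t i = 0; i < size; i += size_step) {
            const size_t size_cur = (i + size_view <= size) ? size_view : (size - i);
            printf("view: offset = %10zu, size = %10zu\n", i, size_cur);
        }
        return 0;
    }
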
@@ -6380,7 +6375,7 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backen
         }
     }

-    ctx->use_residency_sets = props.use_residency_sets;
+    ctx->use_residency_sets = props_dev.use_residency_sets;

     if (!ggml_backend_metal_buffer_rset_init(ctx)) {
         GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
@@ -6392,9 +6387,9 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backen
 }

 static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
-    struct ggml_backend_metal_device * ctx_dev = dev->context;
+    const struct ggml_backend_metal_device_props props_dev = ggml_backend_metal_device_get_props(dev->context);

-    return ggml_metal_supports_op(ctx_dev, op);
+    return ggml_metal_supports_op(&props_dev, op);
 }

 static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {