@@ -558,18 +558,27 @@ static bool ggml_metal_heap_resize(struct ggml_metal_heap * heap, size_t size) {
558558
559559 [desc release ];
560560
561+ // GGML_LOG_INFO("%s: resized heap to %zu\n", __func__, [heap->obj size]);
562+
561563 ggml_metal_heap_reset (heap);
562564
563565 return true ;
564566}
565567
566- static id <MTLBuffer > ggml_metal_heap_alloc (struct ggml_metal_heap * heap, size_t size) {
567- const size_t alignment = 1024 *1024 ;
568+ static id <MTLBuffer > ggml_metal_heap_alloc (struct ggml_metal_heap * heap, size_t size, bool no_alloc) {
569+ // note: this is probably more than needed, but just in case
570+ const size_t alignment = 1024 ;
568571
569572 const size_t size_aligned = GGML_PAD (size, alignment);
570573
574+ // GGML_LOG_INFO("%s: size = %zu, size_aligned = %zu, need = %zu, fail = %d\n", __func__, size, size_aligned, heap->need, heap->fail);
575+
571576 heap->need += size_aligned;
572577
578+ if (no_alloc) {
579+ return nil ;
580+ }
581+
573582 if (!heap->fail && size_aligned > [heap->obj maxAvailableSizeWithAlignment: alignment]) {
574583 heap->fail = 1 ;
575584 }
@@ -883,7 +892,7 @@ @implementation GGMLMetalClass
883892 for (int i = 0 ; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
884893 ctx->cmd_bufs [i].obj = nil ;
885894
886- // create 1MB heaps per command buffer
895+ // create initial small heaps per command buffer
887896 // these can be resized during compute when necessary
888897 ctx->cmd_bufs [i].heap = ggml_metal_heap_init (device, 32 );
889898 }
@@ -1624,17 +1633,19 @@ static bool ggml_metal_encode_node(
16241633 GGML_ABORT (" unsupported op" );
16251634 }
16261635
1636+ const bool no_alloc = no_compute;
1637+
1638+ // heap buffers for temporary data
16271639 id <MTLBuffer > h_src0 = nil ;
1640+
16281641 switch (dst->op ) {
16291642 case GGML_OP_SOFT_MAX:
16301643 {
1631- h_src0 = ggml_metal_heap_alloc (heap, ggml_nbytes (src0));
1632- if (!h_src0) {
1633- // GGML_LOG_ERROR("%s: failed to allocate buffer, idx = %4d, size = %8zu, need = %8zu, max available = %9zu, heap size = %9zu, heap used = %zu\n",
1634- // __func__, idx, ggml_nbytes(src0), heap->need, [heap->obj maxAvailableSizeWithAlignment:0], [heap->obj size], [heap->obj usedSize]);
1644+ h_src0 = ggml_metal_heap_alloc (heap, ggml_nbytes (src0), no_alloc );
1645+ if (!no_alloc && ! h_src0) {
1646+ GGML_LOG_ERROR (" %s : failed to allocate buffer, idx = %4d , size = %8zu , need = %8zu , max available = %9zu , heap size = %9zu , heap used = %zu , fail = %d \n " ,
1647+ __func__, idx, ggml_nbytes (src0), heap->need , [heap->obj maxAvailableSizeWithAlignment: 0 ], [heap->obj size ], [heap->obj usedSize ], heap-> fail );
16351648 return false ;
1636- } else {
1637- // GGML_LOG_ERROR("%s: allocated %zu\n", __func__, ggml_nbytes(src0));
16381649 }
16391650 } break ;
16401651 default :
@@ -4707,16 +4718,13 @@ static enum ggml_status ggml_metal_graph_compute(
47074718 // number of threads in addition to the main thread
47084719 const int n_cb = ctx->n_cb ;
47094720
4710- int n_try = 2 ;
4711-
47124721 // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
47134722 // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
47144723 // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
47154724 // each thread creates its own command buffer and enqueues the ops in parallel
47164725 //
47174726 // tests on M1 Pro and M2 Ultra using LLaMA models, show that optimal values for n_cb are 1 or 2
47184727
4719- while (n_try-- > 0 ) {
47204728 @autoreleasepool {
47214729 ctx->gf = gf;
47224730
@@ -4834,55 +4842,6 @@ static enum ggml_status ggml_metal_graph_compute(
48344842 }
48354843 }
48364844
4837- bool retry = false ;
4838-
4839- // check heap statuses
4840- for (int i = 0 ; i <= n_cb; ++i) {
4841- struct ggml_metal_heap * heap = ctx->cmd_bufs [i].heap ;
4842-
4843- const size_t need = heap->need ;
4844-
4845- // printf("\nXXXXXXXXXXXXXXXXX cb %d, need = %zu, fail = %d, size = %zu\n", i, need, heap->fail, [heap->obj currentAllocatedSize]);
4846-
4847- if (heap->fail == 0 ) {
4848- ggml_metal_heap_reset (ctx->cmd_bufs [i].heap );
4849- [heap->obj setPurgeableState: MTLPurgeableStateEmpty ];
4850-
4851- continue ;
4852- }
4853-
4854- if (heap->fail == 2 ) {
4855- GGML_LOG_ERROR (" %s : command buffer %d , MTLHeap ran out of buffers, max = %d \n " , __func__, i, heap->n );
4856- return GGML_STATUS_ALLOC_FAILED;
4857- }
4858-
4859- if (heap->fail == 3 ) {
4860- GGML_LOG_ERROR (" %s : command buffer %d , MTLHeap failed to allocate buffer, max = %d \n " , __func__, i, heap->n );
4861- return GGML_STATUS_ALLOC_FAILED;
4862- }
4863-
4864- // GGML_LOG_INFO("%s: command buffer %d, MTLHeap need = %zu\n", __func__, i, need);
4865-
4866- if (!ggml_metal_heap_resize (heap, need)) {
4867- GGML_LOG_ERROR (" %s : failed to increase heap size to %zu \n " , __func__, need);
4868- return GGML_STATUS_ALLOC_FAILED;
4869- }
4870-
4871- retry = true ;
4872- }
4873-
4874- if (!retry) {
4875- break ;
4876- }
4877-
4878- // printf("XXXXXXXXXXXXXXXXXXXXXXX retry\n");
4879-
4880- if (n_try == 0 ) {
4881- GGML_LOG_ERROR (" %s : failed to allocate heap memory\n " , __func__);
4882- return GGML_STATUS_ALLOC_FAILED;
4883- }
4884- }
4885-
48864845 return GGML_STATUS_SUCCESS;
48874846}
48884847
@@ -5257,21 +5216,38 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
52575216
52585217 const bool should_capture = ctx->capture_next_compute ;
52595218
5260- bool no_compute = false ;
5219+ ggml_metal_heap_reset (heap) ;
52615220
52625221 for (int idx = node_start; idx < node_end; ++idx) {
5263- if (should_capture) {
5264- [encoder pushDebugGroup: [NSString stringWithCString: ggml_op_desc (ggml_graph_node (ctx->gf, idx)) encoding: NSUTF8StringEncoding]];
5265- }
5222+ ggml_metal_encode_node (backend, idx, encoder, heap, true );
5223+ }
5224+
5225+ bool can_compute = true ;
52665226
5267- const bool res = ggml_metal_encode_node (backend, idx, encoder, heap, no_compute);
5227+ if (heap->need > [heap->obj size ]) {
5228+ const size_t need = heap->need ;
52685229
5269- if (should_capture) {
5270- [encoder popDebugGroup ];
5230+ if (!ggml_metal_heap_resize (heap, need)) {
5231+ GGML_LOG_ERROR (" %s : failed to resize MTLHeap, need = %zu \n " , __func__, need);
5232+ can_compute = false ;
52715233 }
5234+ }
5235+
5236+ if (can_compute) {
5237+ for (int idx = node_start; idx < node_end; ++idx) {
5238+ if (should_capture) {
5239+ [encoder pushDebugGroup: [NSString stringWithCString: ggml_op_desc (ggml_graph_node (ctx->gf, idx)) encoding: NSUTF8StringEncoding]];
5240+ }
5241+
5242+ const bool res = ggml_metal_encode_node (backend, idx, encoder, heap, false );
52725243
5273- if (!res) {
5274- no_compute = true ;
5244+ if (should_capture) {
5245+ [encoder popDebugGroup ];
5246+ }
5247+
5248+ if (!res) {
5249+ break ;
5250+ }
52755251 }
52765252 }
52775253
0 commit comments