@@ -3969,7 +3969,7 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-
+    model->set_tensor_overrides(params);
     // model->devices holds device indices that are used to offload
     // use model->devices to determine offload device
     // if no device is specified, all devices are included
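The new `set_tensor_overrides`/`has_tensor_overrides` pair is not shown in this diff, so here is a minimal sketch of what it presumably does, based on the analogous code in upstream llama.cpp; the member name `has_tensor_overrides_` and the `tensor_buft_overrides` field of `llama_model_params` are assumptions:

```cpp
// Hypothetical sketch: remember whether the caller supplied any tensor
// buffer-type overrides so later code can cheaply query the fact.
// Assumes params.tensor_buft_overrides is a pattern-terminated array,
// as in upstream llama_model_params.
void llama_model::set_tensor_overrides(const llama_model_params & params) {
    has_tensor_overrides_ = params.tensor_buft_overrides != nullptr &&
                            params.tensor_buft_overrides[0].pattern != nullptr;
}

bool llama_model::has_tensor_overrides() const {
    return has_tensor_overrides_;
}
```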
@@ -4479,7 +4479,7 @@ struct llama_context * llama_new_context_with_model(
             llama_get_device_count(*model) > 1 &&
             model->n_gpu_layers > (int)model->hparams.n_layer &&
             model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
-            params.offload_kqv;
+            params.offload_kqv && !model->has_tensor_overrides();
 #ifndef GGML_USE_CUDA
         // pipeline parallelism requires support for async compute and events
         // currently this is only implemented in the CUDA backend
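The added `!model->has_tensor_overrides()` term is what connects this hunk to the previous one: pipeline parallelism assumes every layer is offloaded and split across the GPUs, while a tensor override can pin arbitrary weights to a different buffer type, so the two features are presumably incompatible. A hypothetical caller-side example of an override that would now force `pipeline_parallel` to false, assuming upstream llama.cpp's override API (`llama_model_tensor_buft_override` may not exist under this name in this fork):

```cpp
// Keep MoE expert weights in host memory; everything else offloads as usual.
static const llama_model_tensor_buft_override overrides[] = {
    { "ffn_.*_exps", ggml_backend_cpu_buffer_type() },
    { nullptr,       nullptr                        }, // terminator
};

llama_model_params mparams = llama_model_default_params();
mparams.tensor_buft_overrides = overrides;
```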
@@ -4498,10 +4498,19 @@ struct llama_context * llama_new_context_with_model(
         ggml_cgraph * gf = llm_build_context::llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
 
         // initialize scheduler with the worst-case graph
-        if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
-            LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
-            llama_free(ctx);
-            return nullptr;
+        bool gf_success = ggml_backend_sched_reserve(ctx->sched, gf);
+        if (!gf_success)
+        {
+            if (pipeline_parallel) {
+                LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
+                ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, false);
+                gf_success = ggml_backend_sched_reserve(ctx->sched, gf);
+            }
+            if (!gf_success) {
+                LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
+                llama_free(ctx);
+                return nullptr;
+            }
         }
 
         for (size_t i = 0; i < ctx->backends.size(); i++) {
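The retry works because `ggml_backend_sched_new(..., /*parallel=*/true)` reserves compute buffers for several in-flight copies of the graph, so recreating the scheduler with `parallel == false` shrinks the worst-case allocation that `ggml_backend_sched_reserve` has to satisfy. A self-contained sketch of the same fallback pattern, assuming the five-argument `ggml_backend_sched_new` of this vintage (the helper name is made up, and unlike the hunk above it also frees the failed scheduler before retrying):

```cpp
#include "ggml-backend.h"
#include <vector>

// Reserve a scheduler for the worst-case graph gf, retrying once without
// pipeline parallelism if the parallel reservation is too large to allocate.
static ggml_backend_sched_t sched_reserve_with_fallback(
        std::vector<ggml_backend_t> & backends,
        std::vector<ggml_backend_buffer_type_t> & bufts,
        size_t max_nodes, bool parallel, ggml_cgraph * gf) {
    ggml_backend_sched_t sched = ggml_backend_sched_new(
        backends.data(), bufts.data(), (int) backends.size(), max_nodes, parallel);
    if (ggml_backend_sched_reserve(sched, gf)) {
        return sched;
    }
    if (parallel) {
        // drop the oversized parallel scheduler and retry single-copy
        ggml_backend_sched_free(sched);
        sched = ggml_backend_sched_new(
            backends.data(), bufts.data(), (int) backends.size(), max_nodes, false);
        if (ggml_backend_sched_reserve(sched, gf)) {
            return sched;
        }
    }
    ggml_backend_sched_free(sched);
    return nullptr; // caller handles the error (here: llama_free + nullptr)
}
```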