Skip to content

Commit 70ac64e

Browse files
author
firecoperana
committed
disable pipeline parallelism when tensor override present
1 parent c33f39d commit 70ac64e

File tree

4 files changed

+15
-3
lines changed

4 files changed

+15
-3
lines changed

common/common.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2653,7 +2653,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
26532653
auto mparams = llama_model_params_from_gpt_params(params);
26542654

26552655
llama_model * model = nullptr;
2656-
2656+
26572657
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
26582658
model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
26592659
} else if (!params.model_url.empty()) {

src/llama-model.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1237,6 +1237,10 @@ std::string LLM_TN::operator()(llm_tensor tensor, const std::string & suffix, in
12371237
return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
12381238
}
12391239

1240+
// Record whether the caller supplied any tensor buffer-type overrides.
// The list counts as present only when the pointer itself is non-null AND
// the first entry has a pattern (presumably the array is terminated by a
// null-pattern sentinel — an empty list is treated as "no overrides").
void llama_model::set_tensor_overrides(const llama_model_params& params) {
    const auto * overrides = params.tensor_buft_overrides;
    tensor_overrides = overrides && overrides[0].pattern;
}
1243+
12401244
std::string llama_model_ftype_name(llama_ftype ftype) {
12411245
if (ftype & LLAMA_FTYPE_GUESSED) {
12421246
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";

src/llama-model.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,10 +305,18 @@ struct llama_model {
305305
// keep track of loaded lora adapters
306306
std::set<llama_lora_adapter *> lora_adapters;
307307

308+
bool tensor_overrides;
309+
308310
~llama_model();
309311

310312
// Not actually needed, but left in place for now
311313
size_t max_nodes() const { return 65536; }
314+
315+
bool has_tensor_overrides() const {
316+
return tensor_overrides;
317+
};
318+
319+
void set_tensor_overrides(const llama_model_params& params);
312320
};
313321

314322
struct llama_lora_weight {

src/llama.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3968,7 +3968,7 @@ struct llama_model * llama_load_model_from_file(
39683968
return true;
39693969
};
39703970
}
3971-
3971+
model->set_tensor_overrides(params);
39723972
// model->devices hold device indices that are used to offload
39733973
// use model->devices to determine offload device
39743974
// if no device is specified, all device are included
@@ -4478,7 +4478,7 @@ struct llama_context * llama_new_context_with_model(
44784478
llama_get_device_count(*model) > 1 &&
44794479
model->n_gpu_layers > (int)model->hparams.n_layer &&
44804480
model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
4481-
params.offload_kqv;
4481+
params.offload_kqv && !model->has_tensor_overrides();
44824482
#ifndef GGML_USE_CUDA
44834483
// pipeline parallelism requires support for async compute and events
44844484
// currently this is only implemented in the CUDA backend

0 commit comments

Comments
 (0)