Skip to content

Commit 2e4e8b1

Browse files
committed
disable pipeline parallelism when there are tensor overrides
1 parent 6693b0a commit 2e4e8b1

File tree

3 files changed

+11
-1
lines changed

3 files changed

+11
-1
lines changed

src/llama-context.cpp

Lines changed: 2 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -255,7 +255,8 @@ llama_context::llama_context(
255255
model.n_devices() > 1 &&
256256
model.params.n_gpu_layers > (int) model.hparams.n_layer &&
257257
model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
258-
cparams.offload_kqv;
258+
cparams.offload_kqv &&
259+
!model.has_tensor_overrides();
259260

260261
// pipeline parallelism requires support for async compute and events in all devices
261262
if (pipeline_parallel) {

src/llama-model.cpp

Lines changed: 7 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -379,9 +379,12 @@ struct llama_model::impl {
379379
layer_dev dev_input = {};
380380
layer_dev dev_output = {};
381381
std::vector<layer_dev> dev_layer;
382+
383+
bool has_tensor_overrides;
382384
};
383385

384386
llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
387+
pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
385388
}
386389

387390
llama_model::~llama_model() {}
@@ -4169,6 +4172,10 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
41694172
});
41704173
}
41714174

4175+
bool llama_model::has_tensor_overrides() const {
4176+
return pimpl->has_tensor_overrides;
4177+
}
4178+
41724179
const ggml_tensor * llama_model::get_tensor(const char * name) const {
41734180
auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
41744181
[name](const std::pair<std::string, ggml_tensor *> & it) {

src/llama-model.h

Lines changed: 2 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -382,6 +382,8 @@ struct llama_model {
382382

383383
ggml_backend_buffer_type_t select_buft(int il) const;
384384

385+
bool has_tensor_overrides() const;
386+
385387
const struct ggml_tensor * get_tensor(const char * name) const;
386388

387389
// TODO: move this to new llm_arch_model_i interface

0 commit comments

Comments (0)