
Commit d6818d0

llama : allow other bufts when overriding to CPU, add --no-repack option (#14990)
1 parent e08a988 commit d6818d0

5 files changed: +39 -21 lines changed
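
Summary: the commit adds a --no-repack (-nr) command-line option, wired through a new common_params::no_extra_bufts flag and a new llama_model_params::use_extra_bufts field (default true), which controls whether the CPU "extra" buffer types used for weight repacking are registered. Independently of that flag, tensor buffer-type overrides that target the plain CPU buffer type are now resolved through select_weight_buft, so a tensor pinned to the CPU can still land in a repacked CPU buffer type.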

common/arg.cpp

Lines changed: 7 additions & 0 deletions

@@ -2095,6 +2095,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.no_kv_offload = true;
         }
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    add_opt(common_arg(
+        {"-nr", "--no-repack"},
+        "disable weight repacking",
+        [](common_params & params) {
+            params.no_extra_bufts = true;
+        }
+    ).set_env("LLAMA_ARG_NO_REPACK"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         string_format(
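
Usage note: any tool built on the common argument parser picks this option up, e.g. llama-cli -m model.gguf --no-repack (model path hypothetical), or LLAMA_ARG_NO_REPACK=1 in the environment.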

common/common.cpp

Lines changed: 1 addition & 0 deletions

@@ -1122,6 +1122,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mmap        = params.use_mmap;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
+    mparams.use_extra_bufts = !params.no_extra_bufts;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;

common/common.h

Lines changed: 1 addition & 0 deletions

@@ -359,6 +359,7 @@ struct common_params {
     bool warmup         = true;  // warmup run
     bool check_tensors  = false; // validate tensor data
     bool no_op_offload  = false; // globally disable offload host tensor operations to device
+    bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
 
     bool single_turn = false; // single turn chat conversation
 
include/llama.h

Lines changed: 5 additions & 4 deletions

@@ -284,10 +284,11 @@ extern "C" {
         const struct llama_model_kv_override * kv_overrides;
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;    // only load the vocabulary, no weights
-        bool use_mmap;      // use mmap if possible
-        bool use_mlock;     // force system to keep model in RAM
-        bool check_tensors; // validate model tensor data
+        bool vocab_only;      // only load the vocabulary, no weights
+        bool use_mmap;        // use mmap if possible
+        bool use_mlock;       // force system to keep model in RAM
+        bool check_tensors;   // validate model tensor data
+        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
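
For programs using the C API directly, the new field is opt-out: repacking stays enabled unless it is explicitly disabled. A minimal sketch (hypothetical model path; assumes the llama_model_load_from_file / llama_model_free entry points):

#include "llama.h"

int main(void) {
    // start from the defaults, where use_extra_bufts is now true
    llama_model_params mparams = llama_model_default_params();
    mparams.use_extra_bufts = false; // same effect as --no-repack in the tools

    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    llama_model_free(model);
    return 0;
}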

src/llama-model.cpp

Lines changed: 25 additions & 17 deletions

@@ -290,7 +290,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
 }
 
 // CPU: ACCEL -> GPU host -> CPU extra -> CPU
-static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
     buft_list_t buft_list;
 
     // add ACCEL buffer types

@@ -319,21 +319,22 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
     }
 
-    // add extra buffer types, only if no GPU device is present
-    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
-    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
+    // add extra buffer types
+    if (use_extra_bufts) {
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (cpu_dev == nullptr) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
 
-    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
-    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
-        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
-    if (ggml_backend_dev_get_extra_bufts_fn) {
-        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
-        while (extra_bufts && *extra_bufts) {
-            buft_list.emplace_back(cpu_dev, *extra_bufts);
-            ++extra_bufts;
+        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_list.emplace_back(cpu_dev, *extra_bufts);
+                ++extra_bufts;
+            }
         }
     }
 

@@ -1839,7 +1840,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
 
     // build a list of buffer types for the CPU and GPU devices
-    pimpl->cpu_buft_list = make_cpu_buft_list(devices);
+    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
     for (auto * dev : devices) {
         buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
         // add CPU buffer types as a fallback

@@ -2044,7 +2045,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
         std::regex pattern(overrides->pattern);
         if (std::regex_search(tensor_name, pattern)) {
-            buft = overrides->buft;
+            if (overrides->buft == ggml_backend_cpu_buffer_type()) {
+                // when overriding to a CPU buffer, consider the extra buffer types
+                buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
+            } else {
+                buft = overrides->buft;
+            }
+
             LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                 tensor_name.c_str(),
                 ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),

@@ -17839,6 +17846,7 @@ llama_model_params llama_model_default_params() {
         /*.use_mmap        =*/ true,
        /*.use_mlock       =*/ false,
         /*.check_tensors   =*/ false,
+        /*.use_extra_bufts =*/ true,
     };
 
 #ifdef GGML_USE_METAL
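
The override change in load_tensors is what the commit title refers to: a tensor overridden to the CPU buffer type (for example via --override-tensor) is no longer pinned to the plain CPU buft but re-selected from the full CPU buffer-type list, so repacked weights remain possible. A minimal sketch of the same kind of override through the C API (the regex pattern and model path are illustrative, not from this commit):

#include "llama.h"
#include "ggml-cpu.h" // ggml_backend_cpu_buffer_type()

int main(void) {
    // keep tensors matching the pattern on the CPU; with this commit the
    // loader may still pick a repacked ("extra") CPU buffer type for them
    const llama_model_tensor_buft_override overrides[] = {
        { ".*ffn_.*_exps.*", ggml_backend_cpu_buffer_type() },
        { NULL, NULL }, // a NULL pattern terminates the list
    };

    llama_model_params mparams = llama_model_default_params();
    mparams.tensor_buft_overrides = overrides;

    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    llama_model_free(model);
    return 0;
}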
