7 changes: 7 additions & 0 deletions common/arg.cpp
@@ -2095,6 +2095,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.no_kv_offload = true;
         }
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    add_opt(common_arg(
+        {"-nr", "--no-repack"},
+        "disable weight repacking",
+        [](common_params & params) {
+            params.no_extra_bufts = true;
+        }
+    ).set_env("LLAMA_ARG_NO_REPACK"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         string_format(
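Usage note: with this change, weight repacking can be disabled in any tool that uses the common argument parser, either by passing -nr / --no-repack on the command line or via the LLAMA_ARG_NO_REPACK environment variable.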
1 change: 1 addition & 0 deletions common/common.cpp
@@ -1122,6 +1122,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
+    mparams.use_extra_bufts = !params.no_extra_bufts;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
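Note that the common layer stores the setting in inverted form: the CLI-facing no_extra_bufts flag maps onto the library parameter use_extra_bufts, so repacking remains enabled unless --no-repack is given.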
1 change: 1 addition & 0 deletions common/common.h
@@ -359,6 +359,7 @@ struct common_params {
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
     bool no_op_offload = false; // globally disable offload host tensor operations to device
+    bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
 
     bool single_turn = false; // single turn chat conversation
 
9 changes: 5 additions & 4 deletions include/llama.h
@@ -284,10 +284,11 @@ extern "C" {
         const struct llama_model_kv_override * kv_overrides;
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;    // only load the vocabulary, no weights
-        bool use_mmap;      // use mmap if possible
-        bool use_mlock;     // force system to keep model in RAM
-        bool check_tensors; // validate model tensor data
+        bool vocab_only;      // only load the vocabulary, no weights
+        bool use_mmap;        // use mmap if possible
+        bool use_mlock;       // force system to keep model in RAM
+        bool check_tensors;   // validate model tensor data
+        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
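For programs that call the C API directly, a minimal sketch of how the new field might be used follows; the model path is a placeholder and context creation/inference are omitted. Only use_extra_bufts is new in this PR, the rest is existing public API.

#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init();

    // start from the defaults, where use_extra_bufts is true (repacking enabled)
    llama_model_params mparams = llama_model_default_params();
    mparams.use_extra_bufts = false; // same effect as passing --no-repack

    // "model.gguf" is a placeholder path for illustration
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        llama_backend_free();
        return 1;
    }

    // ... create a context and run inference as usual ...

    llama_model_free(model);
    llama_backend_free();
    return 0;
}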
42 changes: 25 additions & 17 deletions src/llama-model.cpp
@@ -290,7 +290,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
 }
 
 // CPU: ACCEL -> GPU host -> CPU extra -> CPU
-static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
     buft_list_t buft_list;
 
     // add ACCEL buffer types
@@ -319,21 +319,22 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
         }
     }
 
-    // add extra buffer types, only if no GPU device is present
-    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
-    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
+    // add extra buffer types
+    if (use_extra_bufts) {
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (cpu_dev == nullptr) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
 
-    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
-    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
-        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
-    if (ggml_backend_dev_get_extra_bufts_fn) {
-        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
-        while (extra_bufts && *extra_bufts) {
-            buft_list.emplace_back(cpu_dev, *extra_bufts);
-            ++extra_bufts;
+        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_list.emplace_back(cpu_dev, *extra_bufts);
+                ++extra_bufts;
+            }
         }
     }
 
@@ -1839,7 +1840,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
 
     // build a list of buffer types for the CPU and GPU devices
-    pimpl->cpu_buft_list = make_cpu_buft_list(devices);
+    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
     for (auto * dev : devices) {
         buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
         // add CPU buffer types as a fallback
@@ -2044,7 +2045,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
                 std::regex pattern(overrides->pattern);
                 if (std::regex_search(tensor_name, pattern)) {
-                    buft = overrides->buft;
+                    if (overrides->buft == ggml_backend_cpu_buffer_type()) {
+                        // when overriding to a CPU buffer, consider the extra buffer types
+                        buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
+                    } else {
+                        buft = overrides->buft;
+                    }
+
                     LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                             tensor_name.c_str(),
                             ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
@@ -17839,6 +17846,7 @@ llama_model_params llama_model_default_params() {
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
         /*.check_tensors =*/ false,
+        /*.use_extra_bufts =*/ true,
     };
 
 #ifdef GGML_USE_METAL
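Taken together, these changes make the CPU extra buffer types (used for weight repacking) an opt-out feature: make_cpu_buft_list now gates them solely on the new use_extra_bufts flag, which defaults to true in llama_model_default_params(), replacing the previous comment that restricted them to setups without a GPU device. Passing --no-repack (or setting use_extra_bufts = false) falls back to the plain CPU buffer type. The second load_tensors change means that a tensor explicitly overridden to the CPU buffer type, for example through a tensor buffer type override such as --override-tensor, still goes through select_weight_buft and can therefore land in a repacked CPU buffer type when one is available.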