@@ -290,7 +290,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
 }
 
 // CPU: ACCEL -> GPU host -> CPU extra -> CPU
-static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
     buft_list_t buft_list;
 
     // add ACCEL buffer types
@@ -319,21 +319,22 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
         }
     }
 
-    // add extra buffer types, only if no GPU device is present
-    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
-    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
+    // add extra buffer types
+    if (use_extra_bufts) {
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (cpu_dev == nullptr) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
 
-    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
-    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
-        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
-    if (ggml_backend_dev_get_extra_bufts_fn) {
-        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
-        while (extra_bufts && *extra_bufts) {
-            buft_list.emplace_back(cpu_dev, *extra_bufts);
-            ++extra_bufts;
+        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_list.emplace_back(cpu_dev, *extra_bufts);
+                ++extra_bufts;
+            }
         }
     }
 
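For context on what the guarded block queries: the CPU backend can expose additional ("extra") buffer types, such as repacked weight layouts, through an optional proc address. Below is a minimal standalone sketch of the same lookup, assuming the standard ggml-backend.h API; the function-pointer typedef is restated locally rather than taken from a specific header, and `ggml_backend_buft_name` is used here only to print the result.

```cpp
// Sketch: list the CPU backend's extra buffer types via the same optional
// proc address that make_cpu_buft_list() uses above.
// If the backends are built as dynamic libraries, call ggml_backend_load_all()
// first so the CPU backend is registered.
#include <cstdio>

#include "ggml-backend.h"

// same shape as the typedef used by the loader, restated here for illustration
typedef ggml_backend_buffer_type_t * (*get_extra_bufts_fn_t)(ggml_backend_dev_t device);

int main() {
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (cpu_dev == nullptr) {
        fprintf(stderr, "no CPU backend found\n");
        return 1;
    }

    ggml_backend_reg_t cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

    // the query is optional: a CPU backend without extra buffer types simply
    // does not export this symbol and the lookup returns nullptr
    auto get_extra_bufts = (get_extra_bufts_fn_t)
        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");

    if (get_extra_bufts != nullptr) {
        // the function returns a nullptr-terminated array of buffer types
        for (ggml_backend_buffer_type_t * bufts = get_extra_bufts(cpu_dev); bufts && *bufts; ++bufts) {
            printf("extra CPU buffer type: %s\n", ggml_backend_buft_name(*bufts));
        }
    }
    return 0;
}
```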
@@ -1839,7 +1840,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
 
     // build a list of buffer types for the CPU and GPU devices
-    pimpl->cpu_buft_list = make_cpu_buft_list(devices);
+    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
     for (auto * dev : devices) {
         buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
         // add CPU buffer types as a fallback
@@ -2044,7 +2045,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
                     std::regex pattern(overrides->pattern);
                     if (std::regex_search(tensor_name, pattern)) {
-                        buft = overrides->buft;
+                        if (overrides->buft == ggml_backend_cpu_buffer_type()) {
+                            // when overriding to a CPU buffer, consider the extra buffer types
+                            buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
+                        } else {
+                            buft = overrides->buft;
+                        }
+
                         LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                                 tensor_name.c_str(),
                                 ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
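The override hunk changes how `--override-tensor`-style rules behave: a rule that targets the plain CPU buffer type is now re-resolved through `select_weight_buft()`, so the extra CPU buffer types added above can still be chosen for those weights. Here is a hedged sketch of installing such an override programmatically; the `llama_model_tensor_buft_override` layout, the `tensor_buft_overrides` field, and `llama_model_load_from_file` are assumed to match the public llama.h API, and the regex is only an example pattern.

```cpp
// Sketch (assumptions noted above): route tensors matching a pattern to the
// plain CPU buffer type; with this patch the loader may still substitute an
// extra (e.g. repacked) CPU buffer type for them via select_weight_buft().
#include "ggml-backend.h"
#include "llama.h"

llama_model * load_with_cpu_override(const char * path) {
    // a nullptr pattern terminates the list, mirroring the loop in load_tensors()
    static const llama_model_tensor_buft_override overrides[] = {
        { "ffn_.*_exps", ggml_backend_cpu_buffer_type() }, // example pattern: MoE expert weights
        { nullptr,       nullptr                        },
    };

    llama_model_params mparams = llama_model_default_params();
    mparams.tensor_buft_overrides = overrides;

    return llama_model_load_from_file(path, mparams);
}
```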
@@ -17839,6 +17846,7 @@ llama_model_params llama_model_default_params() {
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
         /*.check_tensors               =*/ false,
+        /*.use_extra_bufts             =*/ true,
     };
 
 #ifdef GGML_USE_METAL
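Since the new field defaults to true, existing callers pick up the extra CPU buffer types automatically; opting out is a single parameter. A minimal sketch follows, assuming `use_extra_bufts` is exposed on `llama_model_params` (mirroring the default added above) and `llama_model_load_from_file` as the loading entry point.

```cpp
// Sketch: load a model with the extra (e.g. repacked) CPU buffer types disabled,
// so CPU-resident weights keep their plain layout.
#include "llama.h"

llama_model * load_without_extra_bufts(const char * path) {
    llama_model_params mparams = llama_model_default_params();
    mparams.use_extra_bufts = false;
    return llama_model_load_from_file(path, mparams);
}
```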