@@ -261,7 +261,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
     return nullptr;
 }
 
-// CPU: ACCEL -> CPU extra -> GPU host -> CPU
+// CPU: ACCEL -> GPU host -> CPU extra -> CPU
 static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
     buft_list_t buft_list;
 
@@ -277,32 +277,6 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
         }
     }
 
-    bool has_gpu_device = false;
-    for (auto * dev : devices) {
-        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-            has_gpu_device = true;
-            break;
-        }
-    }
-
-    // add extra buffer types, only if no GPU device is present
-    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
-    if (true) { //kcpp needs this to be true, otherwise 4_0_4_4 quants will break. avx2 repacking dont affect us cause we disabled it
-        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
-        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
-            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
-        if (ggml_backend_dev_get_extra_bufts_fn) {
-            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
-            while (extra_bufts && *extra_bufts) {
-                buft_list.emplace_back(cpu_dev, *extra_bufts);
-                ++extra_bufts;
-            }
-        }
-    } else {
-        LLAMA_LOG_WARN("%s: disabling extra buffer types (i.e. repacking) since a GPU device is available\n", __func__);
-    }
-
     // add a host buffer type
     // storing the tensors in a host buffer is useful when the processing of large batches
     // is offloaded to a GPU device, since it reduces the time spent on data transfers
@@ -317,6 +291,20 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
         }
     }
 
+    // add extra buffer types (e.g. repacked weights), after the GPU host buffer type
+    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
+    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+    if (ggml_backend_dev_get_extra_bufts_fn) {
+        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+        while (extra_bufts && *extra_bufts) {
+            buft_list.emplace_back(cpu_dev, *extra_bufts);
+            ++extra_bufts;
+        }
+    }
+
     // add the CPU buffer type
     for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
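
Note on the change: the extra (repacked) CPU buffer types are now registered unconditionally, but after the GPU host buffer type, so the effective priority becomes ACCEL -> GPU host -> CPU extra -> CPU instead of the old behaviour of skipping them entirely when a GPU was present. For readers unfamiliar with the proc-address indirection used in the patch, the following standalone sketch shows the same lookup in isolation. It assumes the standard ggml-backend API (ggml-backend.h, plus ggml-cpu.h for the ggml_backend_dev_get_extra_bufts_t typedef); it is illustrative only and not part of the patch.

// sketch: enumerate the CPU backend's extra buffer types via the registry's
// proc-address table, the same pattern the patch uses; not part of the patch
#include "ggml-backend.h"
#include "ggml-cpu.h"   // assumed to declare ggml_backend_dev_get_extra_bufts_t
#include <cstdio>

int main() {
    ggml_backend_load_all(); // register available backends

    // resolve the CPU device and the registry that owns it
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    ggml_backend_reg_t cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

    // the enumerator is not a first-class API: it is fetched by name
    // through ggml_backend_reg_get_proc_address, exactly as in the patch
    auto get_extra_bufts = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");

    if (get_extra_bufts) {
        // the function returns a NULL-terminated array of buffer types
        for (ggml_backend_buffer_type_t * it = get_extra_bufts(cpu_dev); it && *it; ++it) {
            printf("extra buft: %s\n", ggml_backend_buft_name(*it));
        }
    }
    return 0;
}

If the lookup returns NULL (a CPU backend built without extra buffer types), the loop is simply skipped and only the default CPU buffer type remains, which matches the fallback behaviour of make_cpu_buft_list.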