@@ -271,19 +271,32 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
271271        }
272272    }
273273
274-     // add extra buffer types
275-     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
276-     auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
277-     auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
278-         ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
279-     if (ggml_backend_dev_get_extra_bufts_fn) {
280-         ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
281-         while (extra_bufts && *extra_bufts) {
282-             buft_list.emplace_back(cpu_dev, *extra_bufts);
283-             ++extra_bufts;
274+     bool has_gpu_device = false;
275+     for (auto * dev : devices) {
276+         if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
277+             has_gpu_device = true;
278+             break;
284279        }
285280    }
286281
282+     // add extra buffer types, only if no GPU device is present
283+     // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
284+     if (!has_gpu_device) {
285+         auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
286+         auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
287+         auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
288+             ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
289+         if (ggml_backend_dev_get_extra_bufts_fn) {
290+             ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
291+             while (extra_bufts && *extra_bufts) {
292+                 buft_list.emplace_back(cpu_dev, *extra_bufts);
293+                 ++extra_bufts;
294+             }
295+         }
296+     } else {
297+         LLAMA_LOG_WARN("%s: disabling extra buffer types (i.e. repacking) since a GPU device is available\n", __func__);
298+     }
299+ 
287300    // add a host buffer type
288301    // storing the tensors in a host buffer is useful when the processing of large batches
289302    // is offloaded to a GPU device, since it reduces the time spent on data transfers
0 commit comments