
Commit ce05aa7

Merge commit '0bb2919335d00ff0bc79d5015da95c422de51f03' into concedo_experimental
# Conflicts:
#	ggml/src/CMakeLists.txt
#	src/llama-model.cpp
2 parents 61a7334 + 0bb2919 commit ce05aa7

File tree

1 file changed: +15 −27


src/llama-model.cpp

Lines changed: 15 additions & 27 deletions
@@ -261,7 +261,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
     return nullptr;
 }
 
-// CPU: ACCEL -> CPU extra -> GPU host -> CPU
+// CPU: ACCEL -> GPU host -> CPU extra -> CPU
 static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
     buft_list_t buft_list;
 
@@ -277,32 +277,6 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
         }
     }
 
-    bool has_gpu_device = false;
-    for (auto * dev : devices) {
-        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-            has_gpu_device = true;
-            break;
-        }
-    }
-
-    // add extra buffer types, only if no GPU device is present
-    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
-    if (true) { //kcpp needs this to be true, otherwise 4_0_4_4 quants will break. avx2 repacking dont affect us cause we disabled it
-        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
-        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
-            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
-        if (ggml_backend_dev_get_extra_bufts_fn) {
-            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
-            while (extra_bufts && *extra_bufts) {
-                buft_list.emplace_back(cpu_dev, *extra_bufts);
-                ++extra_bufts;
-            }
-        }
-    } else {
-        LLAMA_LOG_WARN("%s: disabling extra buffer types (i.e. repacking) since a GPU device is available\n", __func__);
-    }
-
     // add a host buffer type
     // storing the tensors in a host buffer is useful when the processing of large batches
     // is offloaded to a GPU device, since it reduces the time spent on data transfers
@@ -317,6 +291,20 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
         }
     }
 
+    // add extra buffer types, only if no GPU device is present
+    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
+    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+    if (ggml_backend_dev_get_extra_bufts_fn) {
+        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+        while (extra_bufts && *extra_bufts) {
+            buft_list.emplace_back(cpu_dev, *extra_bufts);
+            ++extra_bufts;
+        }
+    }
+
     // add the CPU buffer type
     for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
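
For context, the relocated block enumerates the CPU backend's extra buffer types (e.g. repacked quantization layouts such as the 4_0_4_4 variants) through the backend registry's proc-address mechanism, and the merge now appends them after the GPU host buffer type, matching the updated ordering comment (ACCEL -> GPU host -> CPU extra -> CPU). Below is a minimal standalone sketch of that enumeration, not part of the commit: it assumes a linked ggml build, assumes the ggml_backend_dev_get_extra_bufts_t typedef comes from ggml-cpu.h, and the helper name list_cpu_extra_bufts is made up for illustration; ggml_backend_buft_name is used only to print each buffer type.

// Minimal sketch (not part of this commit): enumerate the CPU backend's
// extra buffer types the same way make_cpu_buft_list() does.
#include <cstdio>

#include "ggml-backend.h"
#include "ggml-cpu.h"   // assumed home of ggml_backend_dev_get_extra_bufts_t

static void list_cpu_extra_bufts(void) {
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (cpu_dev == nullptr) {
        return;
    }

    // the extra-buffer-type query is exposed as a proc address on the CPU
    // backend's registry rather than as a plain exported symbol
    ggml_backend_reg_t cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
    auto get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
    if (get_extra_bufts_fn == nullptr) {
        return; // this build exposes no extra buffer types (e.g. no repacking)
    }

    // the hook returns a NULL-terminated array of buffer types
    ggml_backend_buffer_type_t * extra_bufts = get_extra_bufts_fn(cpu_dev);
    while (extra_bufts && *extra_bufts) {
        printf("extra CPU buffer type: %s\n", ggml_backend_buft_name(*extra_bufts));
        ++extra_bufts;
    }
}

int main(void) {
    list_cpu_extra_bufts();
    return 0;
}

Because the fork keeps the enumeration unconditional (the removed has_gpu_device gate and the kcpp if (true) override are both gone), the extra buffer types are always appended; only their position in the candidate list changes.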
