Commit fc10841

revert ggml-backend changes, fix cpu context
1 parent: ceea1b9

4 files changed: +48, -52 lines

ggml/include/ggml-backend.h

Lines changed: 2 additions & 1 deletion

@@ -314,7 +314,8 @@ extern "C" {
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
    GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
 
-    GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_buffer_type_t buft);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
 
     GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
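
The scheduler's buffer-size query is now keyed by backend rather than by buffer type, with a companion accessor exposing the buffer type the scheduler uses for a given backend. A minimal sketch of how a caller might pair the two accessors (illustrative, not part of the commit; assumes <stdio.h> and an already-initialized ggml_backend_sched_t named sched):

    for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) {
        ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
        // buffer type the scheduler allocates from for this backend
        ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched, backend);
        // size of the compute buffer reserved for this backend
        size_t size = ggml_backend_sched_get_buffer_size(sched, backend);
        printf("%s: %zu bytes\n", ggml_backend_buft_name(buft), size);
    }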

ggml/src/ggml-backend.cpp

Lines changed: 12 additions & 16 deletions

@@ -740,15 +740,6 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend)
     return -1;
 }
 
-static int ggml_backend_sched_buft_id(ggml_backend_sched_t sched, ggml_backend_buffer_type_t buft) {
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (sched->bufts[i] == buft) {
-            return i;
-        }
-    }
-    return -1;
-}
-
 static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
     ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
     if (buffer == NULL) {

@@ -1802,15 +1793,20 @@ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i)
     return sched->backends[i];
 }
 
-size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_buffer_type_t buft) {
+ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) {
     GGML_ASSERT(sched);
-    int buft_index = ggml_backend_sched_buft_id(sched, buft);
-    if (buft_index == -1) {
-        return 0;
-    }
-    GGML_ASSERT(buft_index < sched->n_backends);
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+    return sched->bufts[backend_index];
+}
+
+size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    GGML_ASSERT(sched);
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
 
-    return ggml_gallocr_get_buffer_size(sched->galloc, buft_index);
+    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }
 
 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend)
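
The removed helper resolved a buffer type back to a backend index by first match, which is ambiguous when several backends share a buffer type; the new code in llama-context.cpp notes that multiple backends may use the same host buffer type. A self-contained toy model of that first-match ambiguity, with plain arrays standing in for sched->bufts (illustrative C, no ggml dependency):

    #include <stdio.h>

    int main(void) {
        const char * host = "host";           // one shared "buffer type"
        const char * bufts[2] = {host, host}; // two "backends" that use it
        // a buft -> index lookup stops at the first match, so information
        // stored under index 1 is unreachable through this query
        for (int i = 0; i < 2; i++) {
            if (bufts[i] == host) {
                printf("buft resolves to backend %d only\n", i);
                break;
            }
        }
        return 0;
    }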

src/llama-context.cpp

Lines changed: 32 additions & 34 deletions

@@ -1,5 +1,6 @@
 #include "llama-context.h"
 
+#include "ggml-backend.h"
 #include "llama-impl.h"
 #include "llama-batch.h"
 #include "llama-io.h"

@@ -374,8 +375,9 @@ llama_context::llama_context(
     }
 
     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
-        ggml_backend_buffer_type_t buft = backend_buft[i];
-        size_t size = ggml_backend_sched_get_buffer_size(sched.get(), buft);
+        ggml_backend_t backend = backend_ptrs[i];
+        ggml_backend_buffer_type_t buft = backend_buft[i];
+        size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend);
         if (size > 1) {
             LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                 ggml_backend_buft_name(buft),

@@ -2029,26 +2031,23 @@ void llama_context::perf_reset() {
 std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
 
-    auto get_memory_breakdown = [&](ggml_backend_buffer_type_t buft) {
-        llama_memory_breakdown_data data;
-        data.model = model.memory_use(buft);
-        data.context = memory->memory_use(buft);
-        data.compute = ggml_backend_sched_get_buffer_size(sched.get(), buft);
-        return data;
-    };
-
     for (const auto & backend_ptr : backends) {
-        ggml_backend_t backend = backend_ptr.get();
-        ggml_backend_dev_t dev = ggml_backend_get_device(backend);
-
-        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
-        ret[buft] = get_memory_breakdown(buft);
-
-        ggml_backend_buffer_type_t buft_host = ggml_backend_dev_host_buffer_type(dev);
-        if (!buft_host) {
-            continue;
+        ggml_backend_t backend = backend_ptr.get();
+        { // memory allocated statically on device of the backend itself
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
+            ret[buft] = {model.memory_use(buft), memory->memory_use(buft), 0};
+        }
+        { // memory allocated on host for backend
+            ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(ggml_backend_get_device(backend));
+            if (ret.count(buft) != 0) {
+                continue; // multiple backends may use the same host buffer type
+            }
+            ret[buft] = {model.memory_use(buft), memory->memory_use(buft), 0};
         }
-        ret[buft_host] = get_memory_breakdown(buft_host);
+    }
+    for (const auto & backend_ptr : backends) {
+        ggml_backend_t backend = backend_ptr.get();
+        ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
     }
     return ret;
 }
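
In the rewritten breakdown, compute buffer sizes are accumulated in a second pass with +=, keyed by the scheduler's buffer type for each backend, so backends that resolve to the same buffer type fold into one entry; std::map::operator[] value-initializes a missing entry, which makes the += safe. A standalone illustration of that accumulation pattern with toy types (not the llama.cpp structs):

    #include <cstddef>
    #include <cstdio>
    #include <map>
    #include <string>

    struct breakdown_toy { std::size_t model = 0, context = 0, compute = 0; };

    int main() {
        std::map<std::string, breakdown_toy> ret;
        // two "backends" resolving to the same buffer type both contribute:
        ret["host"].compute += 100; // entry value-initialized on first access
        ret["host"].compute += 200;
        std::printf("host compute: %zu\n", ret["host"].compute); // prints 300
        return 0;
    }
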
@@ -2808,10 +2807,9 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) {
     const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
 
     // "free" host memory is poorly defined, instead track only memory that we know is being used:
-    size_t model_host = 0;
-    size_t context_host = 0;
-    size_t compute_host = 0;
-    std::set<ggml_backend_buffer_type_t> seen_host_buffer_types; // track seen host buffer types to avoid double counting
+    llama_memory_breakdown_data mb_host_acc = memory_breakdown[ggml_backend_cpu_buffer_type()];
+    // track seen host buffer types to avoid double counting:
+    std::set<ggml_backend_buffer_type_t> seen_host_buffer_types = {ggml_backend_cpu_buffer_type()};
 
     for (const ggml_backend_dev_t & dev : devices) {
         ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev);

@@ -2847,9 +2845,9 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) {
         if (seen_host_buffer_types.count(buft) == 1) {
             continue;
         }
-        model_host += mb.model;
-        context_host += mb.context;
-        compute_host += mb.compute;
+        mb_host_acc.model += mb.model;
+        mb_host_acc.context += mb.context;
+        mb_host_acc.compute += mb.compute;
         seen_host_buffer_types.insert(buft);
     }
 

@@ -2861,21 +2859,21 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) {
             continue;
         }
         const llama_memory_breakdown_data & mb_host = memory_breakdown[buft_host];
-        model_host += mb_host.model;
-        context_host += mb_host.context;
-        compute_host += mb_host.compute;
+        mb_host_acc.model += mb_host.model;
+        mb_host_acc.context += mb_host.context;
+        mb_host_acc.compute += mb_host.compute;
         seen_host_buffer_types.insert(buft_host);
     }
-    const size_t self_host = model_host + context_host + compute_host;
+    const size_t self_host = mb_host_acc.model + mb_host_acc.context + mb_host_acc.compute;
     table_data.push_back({
         template_host,
         " - Host",
         "", // total
         "", // free
         std::to_string(self_host / MiB),
-        std::to_string(model_host / MiB),
-        std::to_string(context_host / MiB),
-        std::to_string(compute_host / MiB),
+        std::to_string(mb_host_acc.model / MiB),
+        std::to_string(mb_host_acc.context / MiB),
+        std::to_string(mb_host_acc.compute / MiB),
         ""}); // unaccounted
 
     for (size_t j = 1; j < table_data[0].size(); j++) {

tools/mtmd/clip.cpp

Lines changed: 2 additions & 1 deletion

@@ -2812,8 +2812,9 @@ struct clip_model_loader {
     ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
 
     for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
+        ggml_backend_t backend = ctx_clip.backend_ptrs[i];
         ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
-        size_t size = ggml_backend_sched_get_buffer_size(ctx_clip.sched.get(), buft);
+        size_t size = ggml_backend_sched_get_buffer_size(ctx_clip.sched.get(), backend);
         if (size > 1) {
             LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                 ggml_backend_buft_name(buft),
