
Commit e072b20

slaren and Georgi Gerganov authored
ggml : add GGML_SCHED_NO_REALLOC option to disable reallocations in ggml_backend_sched (#17276)
* ggml : add GGML_SCHED_NO_REALLOC option to disable reallocations in ggml_backend_sched

  Enabled in ggml-ci for testing.

* llama : update worst-case graph for unified cache
* ci : disable op offload in some tests
* fix spelling

---------

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent c6f7a42 commit e072b20
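
In short, the option turns the scheduler's silent reserve-and-retry fallback into a hard failure, so CI can catch graphs whose worst-case reservation was under-sized. A minimal distillation of the new gate, using the same names as the ggml/src/ggml-backend.cpp hunk below:

    // inside ggml_backend_sched_alloc_splits (condensed from the diff below)
    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
    #ifdef GGML_SCHED_NO_REALLOC
        // built with -DGGML_SCHED_NO_REALLOC=ON: an unexpected re-allocation is a bug
        GGML_ABORT("%s: failed to allocate graph, but graph re-allocation is disabled by GGML_SCHED_NO_REALLOC\n", __func__);
    #endif
        // otherwise fall through to the existing reserve-and-retry path
    }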

File tree

8 files changed: +37 -20 lines changed

ci/run.sh
examples/embedding/embedding.cpp
ggml/CMakeLists.txt
ggml/src/CMakeLists.txt
ggml/src/ggml-alloc.c
ggml/src/ggml-backend.cpp
src/llama-context.cpp
tests/CMakeLists.txt

ci/run.sh

Lines changed: 8 additions & 8 deletions

@@ -45,7 +45,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`
 
-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"
 
 if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
@@ -428,10 +428,10 @@ function gg_run_qwen3_0_6b {
 
     (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
 
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
 
 function check_ppl {
     qnt="$1"
@@ -523,8 +523,8 @@ function gg_run_embd_bge_small {
 
     ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
 
-    (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
 
     set +e
 }
@@ -564,7 +564,7 @@ function gg_run_rerank_tiny {
     model_f16="${path_models}/ggml-model-f16.gguf"
 
     # for this model, the SEP token is "</s>"
-    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
 
     # sample output
     # rerank score 0: 0.029

examples/embedding/embedding.cpp

Lines changed: 4 additions & 3 deletions

@@ -104,12 +104,16 @@ int main(int argc, char ** argv) {
 
     params.embedding = true;
 
+    // get max number of sequences per batch
+    const int n_seq_max = llama_max_parallel_sequences();
+
     // if the number of prompts that would be encoded is known in advance, it's more efficient to specify the
     // --parallel argument accordingly. for convenience, if not specified, we fallback to unified KV cache
     // in order to support any number of prompts
     if (params.n_parallel == 1) {
         LOG_INF("%s: n_parallel == 1 -> unified KV cache is enabled\n", __func__);
         params.kv_unified = true;
+        params.n_parallel = n_seq_max;
     }
 
     // utilize the full context
@@ -123,9 +127,6 @@ int main(int argc, char ** argv) {
         params.n_ubatch = params.n_batch;
     }
 
-    // get max number of sequences per batch
-    const int n_seq_max = llama_max_parallel_sequences();
-
     llama_backend_init();
     llama_numa_init(params.numa);
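
Two things happen here: the n_seq_max query is hoisted above the unified-cache fallback that now uses it, and n_parallel is raised so the runtime batches line up with the worst case that src/llama-context.cpp now reserves. A condensed sketch of the resulting logic (same identifiers as the hunk above; llama_max_parallel_sequences() is the real llama.h API):

    // condensed from the diff above
    const int n_seq_max = llama_max_parallel_sequences();

    if (params.n_parallel == 1) {
        // no --parallel given: a unified KV cache supports any number of prompts,
        // and a batch may now carry up to n_seq_max sequences
        params.kv_unified = true;
        params.n_parallel = n_seq_max;
    }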

ggml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -183,6 +183,7 @@ endif()
 # ggml core
 set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
 option(GGML_CPU "ggml: enable CPU backend" ON)
+option(GGML_SCHED_NO_REALLOC "ggml: disallow reallocations in ggml-alloc (for debugging)" OFF)
 
 # 3rd party libs / backends
 option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)

ggml/src/CMakeLists.txt

Lines changed: 4 additions & 0 deletions

@@ -221,6 +221,10 @@ if (GGML_BACKEND_DL)
     target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
 endif()
 
+if (GGML_SCHED_NO_REALLOC)
+    target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
+endif()
+
 add_library(ggml
             ggml-backend-reg.cpp)
 add_library(ggml::ggml ALIAS ggml)

ggml/src/ggml-alloc.c

Lines changed: 8 additions & 3 deletions

@@ -921,10 +921,15 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         }
         if (realloc) {
 #ifndef NDEBUG
-            size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
-            GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+            {
+                size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
+                if (cur_size > 0) {
+                    GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
+                        __func__, ggml_backend_buft_name(galloc->bufts[i]),
+                        cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+                }
+            }
 #endif
-
             ggml_vbuffer_free(galloc->buffers[i]);
             galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
             if (galloc->buffers[i] == NULL) {

ggml/src/ggml-backend.cpp

Lines changed: 9 additions & 3 deletions

@@ -1395,14 +1395,20 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
 
     // allocate graph
     if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
+#ifdef GGML_SCHED_NO_REALLOC
+        GGML_ABORT("%s: failed to allocate graph, but graph re-allocation is disabled by GGML_SCHED_NO_REALLOC\n", __func__);
+#endif
+
+#ifndef NDEBUG
+        GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
+#endif
+
         // the re-allocation may cause the split inputs to be moved to a different address
         // synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
         for (int i = 0; i < sched->n_backends; i++) {
             ggml_backend_synchronize(sched->backends[i]);
         }
-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
-#endif
+
         ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
         if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
             GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
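
With the abort in place, callers are expected to size the scheduler's buffers once, up front. A hedged sketch of that usage with the public API from ggml-backend.h (ggml_backend_sched_reserve and ggml_backend_sched_alloc_graph are real functions; build_worst_case_graph() is a hypothetical stand-in for application logic such as llama.cpp's graph_reserve):

    #include "ggml-backend.h"

    // sketch: reserve once against the largest graph the application will ever
    // submit, then allocate per-iteration graphs inside that reservation
    bool init_sched_buffers(ggml_backend_sched_t sched, struct ggml_cgraph * worst_case) {
        // sizes backend buffers for the measure graph without executing it
        return ggml_backend_sched_reserve(sched, worst_case);
    }

    // per iteration: the graph must fit in the reserved buffers; if it does not,
    // a GGML_SCHED_NO_REALLOC build aborts instead of silently re-allocating:
    //     ggml_backend_sched_alloc_graph(sched, graph);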

src/llama-context.cpp

Lines changed: 2 additions & 2 deletions

@@ -300,7 +300,7 @@ llama_context::llama_context(
 
     cross.v_embd.clear();
 
-    const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
+    const uint32_t n_seqs = cparams.n_seq_max;
     const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
     // avoid reserving graphs with zero outputs - assume one output per sequence
@@ -543,7 +543,7 @@ bool llama_context::memory_update(bool optimize) {
         throw std::runtime_error("failed to initialize memory context");
     }
 
-    const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
+    const uint32_t n_seqs = cparams.n_seq_max;
     const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
     auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());

tests/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -196,7 +196,7 @@ if (NOT WIN32)
     llama_build_and_test(test-arg-parser.cpp)
 endif()
 
-if (NOT LLAMA_SANITIZE_ADDRESS)
+if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC)
     # TODO: repair known memory leaks
     llama_build_and_test(test-opt.cpp)
 endif()
