From 07d4b292124fd13aeee70cfea3766cdd65451cf1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 30 Jul 2025 09:12:27 +0300 Subject: [PATCH 1/4] test-thread-safety : each context uses a single sequence --- tests/test-thread-safety.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test-thread-safety.cpp b/tests/test-thread-safety.cpp index d525b7430f9d9..3b8ec40d065cd 100644 --- a/tests/test-thread-safety.cpp +++ b/tests/test-thread-safety.cpp @@ -34,6 +34,12 @@ int main(int argc, char ** argv) { auto cparams = common_context_params_to_llama(params); + // each context has a single sequence + cparams.n_seq_max = 1; + + // prevent from launching too many threads + cparams.n_threads = std::min(std::max(2u, std::thread::hardware_concurrency()/params.n_parallel), cparams.n_threads); + int dev_count = ggml_backend_dev_count(); int gpu_dev_count = 0; for (int i = 0; i < dev_count; ++i) { From d90b20df3a43133663c5002ac55b1e1417444fa8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 30 Jul 2025 09:23:51 +0300 Subject: [PATCH 2/4] embedding : handle --parallel argument ggml-ci --- examples/embedding/embedding.cpp | 4 ++++ src/llama-batch.cpp | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 40ff6483807ee..2a246c5f49e55 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -81,6 +81,10 @@ int main(int argc, char ** argv) { params.embedding = true; + if (params.n_parallel == 1) { + params.kv_unified = true; + } + // utilize the full context if (params.n_batch < params.n_ctx) { LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx); diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index a546063c0a7c8..8698d89acecb2 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -59,7 +59,7 @@ bool llama_batch_allocr::init( for (int32_t i = 0; i < batch.n_tokens; ++i) { for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) { if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max)) { - LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max); + LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d >= %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max); return false; } } From d6233d62440679cc664e7faf3213a971fbd218da Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 30 Jul 2025 09:34:59 +0300 Subject: [PATCH 3/4] save-load : handle -np 1 ggml-ci --- examples/embedding/embedding.cpp | 4 ++++ examples/save-load-state/save-load-state.cpp | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 2a246c5f49e55..9ae7e4dbb0592 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -81,7 +81,11 @@ int main(int argc, char ** argv) { params.embedding = true; + // if the number of prompts that would be encoded is known in advance, it's more efficient to specify the + // --parallel argument accordingly. for convenience, if not specified, we fallback to unified KV cache + // in order to support any number of prompts if (params.n_parallel == 1) { + LOG_INF("%s: n_parallel == 1 -> unified KV cache is enabled\n", __func__); params.kv_unified = true; } diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index db79588f1a5a4..1065ec6bb005a 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -15,6 +15,12 @@ int main(int argc, char ** argv) { return 1; } + if (params.n_parallel == 1) { + // the example uses 2 sequences, so when n_parallel == 1, we need to enable unified kv cache + printf("%s: n_parallel == 1, enabling unified kv cache\n", __func__); + params.kv_unified = true; + } + common_init(); if (params.n_predict < 0) { From 4e4c6a730c4dd4160624520c0263196877511aaa Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 30 Jul 2025 14:46:13 +0300 Subject: [PATCH 4/4] thread-safety : avoid overriding threads, reduce test case arg ggml-ci --- tests/CMakeLists.txt | 2 +- tests/test-thread-safety.cpp | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index fc1557a2d4065..9658abf969dd2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -185,7 +185,7 @@ llama_build_and_test(test-json-partial.cpp) llama_build_and_test(test-log.cpp) llama_build_and_test(test-regex-partial.cpp) -llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4) +llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2) # this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135) if (NOT WIN32) diff --git a/tests/test-thread-safety.cpp b/tests/test-thread-safety.cpp index 3b8ec40d065cd..853495b00d9d2 100644 --- a/tests/test-thread-safety.cpp +++ b/tests/test-thread-safety.cpp @@ -37,9 +37,6 @@ int main(int argc, char ** argv) { // each context has a single sequence cparams.n_seq_max = 1; - // prevent from launching too many threads - cparams.n_threads = std::min(std::max(2u, std::thread::hardware_concurrency()/params.n_parallel), cparams.n_threads); - int dev_count = ggml_backend_dev_count(); int gpu_dev_count = 0; for (int i = 0; i < dev_count; ++i) {