Merged
Changes from 3 commits
8 changes: 8 additions & 0 deletions examples/embedding/embedding.cpp
@@ -81,6 +81,14 @@ int main(int argc, char ** argv) {

params.embedding = true;

// if the number of prompts to encode is known in advance, it is more efficient to set the
// --parallel argument accordingly. for convenience, when it is not specified, we fall back to a
// unified KV cache so that any number of prompts is supported
if (params.n_parallel == 1) {
LOG_INF("%s: n_parallel == 1 -> unified KV cache is enabled\n", __func__);
params.kv_unified = true;
}

// utilize the full context
if (params.n_batch < params.n_ctx) {
LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
6 changes: 6 additions & 0 deletions examples/save-load-state/save-load-state.cpp
@@ -15,6 +15,12 @@ int main(int argc, char ** argv) {
return 1;
}

if (params.n_parallel == 1) {
// the example uses 2 sequences, so when n_parallel == 1 the unified KV cache must be enabled
printf("%s: n_parallel == 1, enabling unified kv cache\n", __func__);
params.kv_unified = true;
}

common_init();

if (params.n_predict < 0) {
2 changes: 1 addition & 1 deletion src/llama-batch.cpp
@@ -59,7 +59,7 @@ bool llama_batch_allocr::init(
for (int32_t i = 0; i < batch.n_tokens; ++i) {
for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max)) {
LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d >= %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
return false;
}
}
6 changes: 6 additions & 0 deletions tests/test-thread-safety.cpp
@@ -34,6 +34,12 @@ int main(int argc, char ** argv) {

auto cparams = common_context_params_to_llama(params);

// each context has a single sequence
cparams.n_seq_max = 1;

// avoid launching too many threads
cparams.n_threads = std::min<int>(std::max(2u, std::thread::hardware_concurrency()/params.n_parallel), cparams.n_threads);

@ggerganov (Member, Author), Jul 30, 2025:
@slaren Small change to the test to make it compatible with the split KV cache. Reduced the number of CPU threads because on the MacBook the process takes a long time (several minutes) to terminate (I think it's some resource congestion when many threads are started by the process, not sure).

Member:

This is a known issue with the thread pool implementation, using more threads than available will result in the threads spending more time spinning than doing work.

Member:

I am not convinced that it is good to ignore the user's parameters to work around what is essentially a bug. Can this be solved by running the test with -t 1?

Member Author:

Yes, -t 1 works. I was thinking of using -t 2 so we also get context-level concurrency. With -t 2 the test also runs cleanly on my devices.

int dev_count = ggml_backend_dev_count();
int gpu_dev_count = 0;
for (int i = 0; i < dev_count; ++i) {