
Commit aadc68b

server : support both embeddings and completions with single model
ggml-ci
1 parent 0d03605 commit aadc68b

3 files changed: +48 -14 lines changed

src/llama-context.cpp

Lines changed: 7 additions & 0 deletions
@@ -41,6 +41,7 @@ llama_context::llama_context(
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.defrag_thold = params.defrag_thold;
     cparams.embeddings = params.embeddings;
+    cparams.embeddings_org = params.embeddings;
     cparams.offload_kqv = params.offload_kqv;
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
@@ -629,6 +630,12 @@ void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void
 }

 void llama_context::set_embeddings(bool value) {
+    if (value && !cparams.embeddings_org) {
+        LLAMA_LOG_ERROR("%s: cannot enable embeddings for this context (%s)\n",
+                __func__, "https://github.com/ggml-org/llama.cpp/pull/14208");
+        return;
+    }
+
     LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);

     cparams.embeddings = value;
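Note on the new guard: llama_set_embeddings() now refuses to enable embeddings on a context that was created with cparams.embeddings == false; it logs an error pointing at the PR and returns without changing the flag, while a context created with embeddings enabled can still be toggled both ways. Below is a minimal sketch of that contract, assuming a placeholder model path and the llama.h loading API as I understand it; it is illustrative only and not part of this commit.

    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
        if (!model) {
            return 1;
        }

        llama_context_params cparams = llama_context_default_params();
        cparams.embeddings = false; // recorded as cparams.embeddings_org = false at creation
        llama_context * ctx = llama_init_from_model(model, cparams);

        llama_set_embeddings(ctx, true);  // rejected by the new guard: logs an error, flag stays false
        llama_set_embeddings(ctx, false); // still allowed

        // a context created with cparams.embeddings = true can be switched in both directions

        llama_free(ctx);
        llama_model_free(model);
        return 0;
    }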

src/llama-cparams.h

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ struct llama_cparams {
     float defrag_thold;

     bool embeddings;
+    bool embeddings_org;
     bool causal_attn;
     bool offload_kqv;
     bool flash_attn;
tools/server/server.cpp

Lines changed: 40 additions & 14 deletions
@@ -88,6 +88,26 @@ enum error_type {
     ERROR_TYPE_NOT_SUPPORTED, // custom error
 };

+static bool server_task_type_need_embd(server_task_type task_type) {
+    switch (task_type) {
+        case SERVER_TASK_TYPE_EMBEDDING:
+        case SERVER_TASK_TYPE_RERANK:
+            return true;
+        default:
+            return false;
+    }
+}
+
+static bool server_task_type_need_logits(server_task_type task_type) {
+    switch (task_type) {
+        case SERVER_TASK_TYPE_COMPLETION:
+        case SERVER_TASK_TYPE_INFILL:
+            return true;
+        default:
+            return false;
+    }
+}
+
 struct slot_params {
     bool stream = true;
     bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
@@ -1330,6 +1350,14 @@ struct server_slot {
         n_draft_accepted = 0;
     }

+    bool need_embd() const {
+        return server_task_type_need_embd(task_type);
+    }
+
+    bool need_logits() const {
+        return server_task_type_need_logits(task_type);
+    }
+
     bool can_batch_with(server_slot & other_slot) const {
         return task_type == other_slot.task_type && are_lora_equal(lora, other_slot.lora);
     }
@@ -3095,6 +3123,13 @@ struct server_context {
                 continue;
             }

+            // TODO: support memory-less logits computation
+            if (slot.need_logits() && !llama_get_memory(ctx)) {
+                slot.release();
+                send_error(slot, "the current context does not support logits computation. skipping", ERROR_TYPE_SERVER);
+                continue;
+            }
+
             if (!can_split()) {
                 if (slot.n_prompt_tokens > n_ubatch) {
                     slot.release();
@@ -3292,7 +3327,7 @@
                     }

                     // embedding requires all tokens in the batch to be output
-                    const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING || slot.task_type == SERVER_TASK_TYPE_RERANK;
+                    const bool need_embd = server_task_type_need_embd(slot.task_type);

                     common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd);
                     slot.cache_tokens.push_back(cur_tok);
@@ -3348,13 +3383,13 @@
         if (slot_batched) {
             // apply lora, only need to do it once per batch
             common_set_adapter_lora(ctx, slot_batched->lora);
-        }

-        const bool do_encode = params_base.embedding;
+            llama_set_embeddings(ctx, slot_batched->need_embd());
+        }

         // pad the batch so that batch.n_tokens >= n_slots
         // TODO: temporary workaround for https://github.com/ggml-org/llama.cpp/issues/13689
-        if (do_encode) {
+        if (llama_get_embeddings(ctx)) {
             const int n_slots = slots.size();

             if (batch.n_tokens < n_slots) {
@@ -4175,11 +4210,6 @@ int main(int argc, char ** argv) {
             oaicompat_type oaicompat) -> void {
         GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);

-        if (ctx_server.params_base.embedding) {
-            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
-            return;
-        }
-
         auto completion_id = gen_chatcmplid();
         std::unordered_set<int> task_ids;
         try {
@@ -4434,12 +4464,8 @@
                 OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
     };

-    const auto handle_chat_completions = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_chat_completions = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
         LOG_DBG("request: %s\n", req.body.c_str());
-        if (ctx_server.params_base.embedding) {
-            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
-            return;
-        }

         auto body = json::parse(req.body);
         std::vector<raw_buffer> files;
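Taken together with the per-batch llama_set_embeddings() call above, these handler changes mean a server started with `--embeddings` no longer rejects completion requests: the same context is switched between embedding and logits output depending on the task type of the slots batched together. A rough, self-contained sketch of that dual-mode pattern at the library level follows; the model path and prompt are placeholders, API names are taken from llama.h as I understand it, and the program is illustrative only, not part of this commit.

    #include "llama.h"

    #include <cstdio>
    #include <vector>

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
        if (!model) {
            return 1;
        }

        llama_context_params cparams = llama_context_default_params();
        cparams.embeddings = true; // created with embeddings enabled, so both modes stay available
        llama_context * ctx = llama_init_from_model(model, cparams);

        const llama_vocab * vocab = llama_model_get_vocab(model);

        std::vector<llama_token> tokens(64);
        const int n_tok = llama_tokenize(vocab, "hello world", 11, tokens.data(), (int) tokens.size(),
                                         /*add_special=*/true, /*parse_special=*/false);
        tokens.resize(n_tok > 0 ? n_tok : 0);

        llama_batch batch = llama_batch_get_one(tokens.data(), (int) tokens.size());

        // embedding-style pass: ask the context for embeddings output
        llama_set_embeddings(ctx, true);
        llama_decode(ctx, batch);
        const float * embd = llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_NONE
            ? llama_get_embeddings_ith(ctx, -1)  // per-token embedding of the last output token
            : llama_get_embeddings_seq(ctx, 0);  // pooled embedding of sequence 0
        printf("embedding available: %s\n", embd ? "yes" : "no");

        // completion-style pass on the same context: back to logits output
        llama_set_embeddings(ctx, false);
        llama_decode(ctx, batch);
        const float * logits = llama_get_logits_ith(ctx, -1); // logits of the last output token
        printf("logits available: %s\n", logits ? "yes" : "no");

        llama_free(ctx);
        llama_model_free(model);
        llama_backend_free();
        return 0;
    }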
