
Commit 8075582

cont : avoid embeddings_org
ggml-ci
1 parent: aadc68b

File tree

4 files changed: +1 -16 lines changed

    include/llama.h
    src/llama-context.cpp
    src/llama-cparams.h
    tools/server/server.cpp

include/llama.h

Lines changed: 0 additions & 1 deletion

@@ -965,7 +965,6 @@ extern "C" {
     LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
 
     // Set whether the context outputs embeddings or not
-    // Note: set to true only if the context was created with llama_context_params.embeddings = true
     LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
 
     // Set whether to use causal attention or not
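With the note removed, llama_set_embeddings is no longer documented as limited to contexts created with embeddings enabled. A minimal sketch of a caller toggling the flag at runtime (the model path and surrounding setup are illustrative, not part of this commit):

#include "llama.h"

int main(void) {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);

    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings = false; // created as a generation context

    llama_context * ctx = llama_init_from_model(model, cparams);

    // After this commit the setter is unconditional: there is no requirement
    // that the context was created with llama_context_params.embeddings = true.
    llama_set_embeddings(ctx, true);
    // ... decode a batch, then read vectors via llama_get_embeddings(ctx) ...
    llama_set_embeddings(ctx, false); // back to token generation

    llama_free(ctx);
    llama_model_free(model);
    return 0;
}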

src/llama-context.cpp

Lines changed: 0 additions & 13 deletions

@@ -41,7 +41,6 @@ llama_context::llama_context(
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.defrag_thold   = params.defrag_thold;
     cparams.embeddings     = params.embeddings;
-    cparams.embeddings_org = params.embeddings;
     cparams.offload_kqv    = params.offload_kqv;
     cparams.flash_attn     = params.flash_attn;
     cparams.no_perf        = params.no_perf;
@@ -82,12 +81,6 @@ llama_context::llama_context(
         }
     }
 
-    if (!cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
-        LLAMA_LOG_WARN("%s: pooling_type is set to %d but embeddings is set to false - disabling pooling\n", __func__, cparams.pooling_type);
-
-        cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
-    }
-
     if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) {
         cparams.causal_attn = hparams.causal_attn;
     } else {
@@ -630,12 +623,6 @@ void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) {
 }
 
 void llama_context::set_embeddings(bool value) {
-    if (value && !cparams.embeddings_org) {
-        LLAMA_LOG_ERROR("%s: cannot enable embeddings for this context (%s)\n",
-                __func__, "https://github.com/ggml-org/llama.cpp/pull/14208");
-        return;
-    }
-
     LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
 
     cparams.embeddings = value;
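Stitching the surviving context lines of the last hunk together, the setter now reduces to an unconditional assignment:

// set_embeddings after this commit, reassembled from the hunk's context
// lines: the embeddings_org guard is gone, the value is logged and stored.
void llama_context::set_embeddings(bool value) {
    LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);

    cparams.embeddings = value;
}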

src/llama-cparams.h

Lines changed: 0 additions & 1 deletion

@@ -27,7 +27,6 @@ struct llama_cparams {
     float defrag_thold;
 
     bool embeddings;
-    bool embeddings_org;
     bool causal_attn;
     bool offload_kqv;
     bool flash_attn;

tools/server/server.cpp

Lines changed: 1 addition & 1 deletion

@@ -3389,7 +3389,7 @@ struct server_context {
 
         // pad the batch so that batch.n_tokens >= n_slots
        // TODO: temporary workaround for https://github.com/ggml-org/llama.cpp/issues/13689
-        if (llama_get_embeddings(ctx)) {
+        if (slot_batched->need_embd()) {
            const int n_slots = slots.size();
 
            if (batch.n_tokens < n_slots) {
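The padding workaround now keys off whether the slots actually batched need embedding output (slot_batched->need_embd()) rather than the context-global llama_get_embeddings(ctx), which after this commit no longer has to mirror how the context was created. A hypothetical sketch of what such a per-slot check could look like (server_slot internals are not shown in this hunk, so the names below are assumptions):

// Hypothetical sketch of a per-slot embedding check; the actual
// server_slot::need_embd() in tools/server/server.cpp may differ.
struct server_slot {
    server_task_type task_type;

    bool need_embd() const {
        // embedding and rerank tasks require embedding output from the model
        return task_type == SERVER_TASK_TYPE_EMBEDDING ||
               task_type == SERVER_TASK_TYPE_RERANK;
    }
};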
