Commit 2f462d3

server : fix incorrect usage of llama_get_embeddings()

ggml-ci

1 parent 6adc3c3 commit 2f462d3

2 files changed: +2 −1 lines changed


include/llama.h

Lines changed: 1 addition & 0 deletions
@@ -965,6 +965,7 @@ extern "C" {
     LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
 
     // Set whether the context outputs embeddings or not
+    // TODO: rename to avoid confusion with llama_get_embeddings()
     LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
 
     // Set whether to use causal attention or not
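For context, the confusion the new TODO refers to comes from an asymmetric pair of declarations in the public header: llama_set_embeddings() sets the output mode flag, while the similarly named llama_get_embeddings() is not that flag's getter. The signatures below are as they appear in recent versions of llama.h:

LLAMA_API void    llama_set_embeddings(struct llama_context * ctx, bool embeddings); // sets the output *mode*
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);                  // returns the output *data* of the last decode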

tools/server/server.cpp

Lines changed: 1 addition & 1 deletion
@@ -1933,7 +1933,7 @@ struct server_context {
     // also we cannot split if the pooling would require any past tokens
     bool can_split() const {
         return
-            !llama_get_embeddings(ctx) ||
+            !params_base.embedding ||
             (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
     }
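Why the old check was wrong: llama_get_embeddings() returns a float pointer to the embeddings produced by the last decode, so !llama_get_embeddings(ctx) tested whether an output buffer happened to exist, not whether the server was configured for embeddings; the fix consults the server's own configuration flag, params_base.embedding, instead. A minimal sketch of the corrected logic, assuming the public llama.h API (the embeddings_requested flag is hypothetical, standing in for params_base.embedding):

#include "llama.h"

// Hypothetical application-side flag, mirroring the server's params_base.embedding.
static bool embeddings_requested = false;

void enable_embeddings(struct llama_context * ctx, bool on) {
    embeddings_requested = on;
    llama_set_embeddings(ctx, on); // toggles the output mode on the context
}

bool can_split(struct llama_context * ctx) {
    // Buggy variant: llama_get_embeddings() returns float * (the data of the
    // last decode), so "!llama_get_embeddings(ctx)" tested buffer presence,
    // not the embeddings mode.
    // Fixed variant: consult the configuration flag the application controls.
    return !embeddings_requested ||
           (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
}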

0 commit comments