File tree — 4 files changed: +42 −2 lines changed
lines changed Original file line number Diff line number Diff line change @@ -49,6 +49,26 @@ def test_embedding_multiple():
4949 assert len (d ['embedding' ]) > 1
5050
5151
def test_embedding_multiple_with_fa():
    """Embed several inputs against a server started with flash attention.

    The prompt lengths are chosen so that at least one of them should hit
    the FA branch (i.e. context size % 256 == 0).
    """
    server = ServerPreset.bert_bge_small_with_fa()
    server.pooling = 'last'
    server.start()

    prompts = ["a " * 253, "b " * 254, "c " * 255, "d " * 256]
    res = server.make_request("POST", "/v1/embeddings", data={
        "input": prompts,
    })

    assert res.status_code == 200
    results = res.body['data']
    assert len(results) == 4
    # every input must come back with a non-trivial embedding vector
    for entry in results:
        assert 'embedding' in entry
        assert len(entry['embedding']) > 1
71+
5272@pytest .mark .parametrize (
5373 "input,is_multi_prompt" ,
5474 [
Original file line number Diff line number Diff line change @@ -323,6 +323,21 @@ def bert_bge_small() -> ServerProcess:
323323 server .server_embeddings = True
324324 return server
325325
326+ @staticmethod
327+ def bert_bge_small_with_fa () -> ServerProcess :
328+ server = ServerProcess ()
329+ server .model_hf_repo = "ggml-org/models"
330+ server .model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
331+ server .model_alias = "bert-bge-small"
332+ server .n_ctx = 1024
333+ server .n_batch = 300
334+ server .n_ubatch = 300
335+ server .n_slots = 2
336+ server .fa = True
337+ server .seed = 42
338+ server .server_embeddings = True
339+ return server
340+
326341 @staticmethod
327342 def tinyllama_infill () -> ServerProcess :
328343 server = ServerProcess ()
Original file line number Diff line number Diff line change @@ -6721,8 +6721,8 @@ static void ggml_compute_forward_flash_attn_ext_f16(
67216721 ggml_vec_dot_t const kq_vec_dot = ggml_get_type_traits_cpu (k->type )->vec_dot ;
67226722 ggml_to_float_t const v_to_float = ggml_get_type_traits (v->type )->to_float ;
67236723
6724- GGML_ASSERT ( q_to_vec_dot && " fattn: unsupported K-type" );
6725- GGML_ASSERT (v->type == GGML_TYPE_F32 || v_to_float && " fattn: unsupported V-type" );
6724+ GGML_ASSERT (( q_to_vec_dot) && " fattn: unsupported K-type" );
6725+ GGML_ASSERT (( v->type == GGML_TYPE_F32 || v_to_float ) && " fattn: unsupported V-type" );
67266726
67276727 // loop over n_batch and n_head
67286728 for (int ir = ir0; ir < ir1; ++ir) {
Original file line number Diff line number Diff line change @@ -1345,6 +1345,11 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
13451345 case GGML_OP_ARANGE:
13461346 return true ;
13471347 case GGML_OP_FLASH_ATTN_EXT:
1348+ if (op->src [0 ]->ne [0 ] == 32 ) {
1349+ // head size == 32 (e.g. bert-bge-small)
1350+ // TODO: not sure if it is worth adding kernels for this size
1351+ return false ;
1352+ }
13481353 if (op->src [1 ]->type != op->src [2 ]->type ) {
13491354 return false ;
13501355 }
You can’t perform that action at this time.
0 commit comments