refactor

joein · joein · commit 0d5821df0f3a · 2025-04-09T15:47:38.000+03:00
diff --git a/fastembed/text/multitask_embedding.py b/fastembed/text/multitask_embedding.py
@@ -60,9 +60,14 @@ def _list_supported_models(cls) -> list[DenseModelDescription]:
         return supported_multitask_models
 
     def _preprocess_onnx_input(
-        self, onnx_input: dict[str, NumpyArray], **kwargs: Any
+        self,
+        onnx_input: dict[str, NumpyArray],
+        task_id: Optional[Union[int, Task]] = None,
+        **kwargs: Any,
     ) -> dict[str, NumpyArray]:
-        onnx_input["task_id"] = np.array(kwargs["task_id"], dtype=np.int64)
+        if task_id is None:
+            raise ValueError(f"task_id must be provided for JinaEmbeddingV3, got <{task_id}>")
+        onnx_input["task_id"] = np.array(task_id, dtype=np.int64)
         return onnx_input
 
     def embed(
@@ -73,18 +78,16 @@ def embed(
         task_id: Optional[int] = None,
         **kwargs: Any,
     ) -> Iterable[NumpyArray]:
-        kwargs["task_id"] = (
+        task_id = (
             task_id if task_id is not None else self.default_task_id
         )  # required for multiprocessing
-        yield from super().embed(documents, batch_size, parallel, **kwargs)
+        yield from super().embed(documents, batch_size, parallel, task_id=task_id, **kwargs)
 
     def query_embed(self, query: Union[str, Iterable[str]], **kwargs: Any) -> Iterable[NumpyArray]:
-        kwargs["task_id"] = self.QUERY_TASK
-        yield from super().embed(query, **kwargs)
+        yield from super().embed(query, task_id=self.QUERY_TASK, **kwargs)
 
     def passage_embed(self, texts: Iterable[str], **kwargs: Any) -> Iterable[NumpyArray]:
-        kwargs["task_id"] = self.PASSAGE_TASK
-        yield from super().embed(texts, **kwargs)
+        yield from super().embed(texts, task_id=self.PASSAGE_TASK, **kwargs)
 
 
 class JinaEmbeddingV3Worker(OnnxTextEmbeddingWorker):
@@ -94,15 +97,15 @@ def init_embedding(
         cache_dir: str,
         **kwargs: Any,
     ) -> JinaEmbeddingV3:
-        model = JinaEmbeddingV3(
+        return JinaEmbeddingV3(
             model_name=model_name,
             cache_dir=cache_dir,
             threads=1,
             **kwargs,
         )
-        return model
 
     def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, OnnxOutputContext]]:
+        self.model: JinaEmbeddingV3  # mypy complaints `self.model` does not have `default_task_id`
         for idx, batch in items:
             onnx_output = self.model.onnx_embed(batch, task_id=self.model.default_task_id)
             yield idx, onnx_output
diff --git a/tests/test_text_multitask_embeddings.py b/tests/test_text_multitask_embeddings.py
@@ -109,9 +109,25 @@ def test_single_embedding():
 
             canonical_vector = task["vectors"]
             assert np.allclose(
-                embeddings[: len(docs), : canonical_vector.shape[1]], canonical_vector, atol=1e-4
+                embeddings[:, : canonical_vector.shape[1]], canonical_vector, atol=1e-4
             ), model_desc.model
 
+        classification_embeddings = list(model.embed(documents=docs, task_id=Task.CLASSIFICATION))
+        classification_embeddings = np.stack(classification_embeddings, axis=0)
+
+        assert classification_embeddings.shape == (len(docs), dim)
+
+        model = TextEmbedding(model_name=model_name, task_id=Task.CLASSIFICATION)
+        default_embeddings = list(model.embed(documents=docs))
+        default_embeddings = np.stack(default_embeddings, axis=0)
+
+        assert default_embeddings.shape == (len(docs), dim)
+
+        assert np.allclose(
+            classification_embeddings,
+            default_embeddings,
+            atol=1e-4,
+        ), model_desc.model
         if is_ci:
             delete_model_cache(model.model._model_dir)
 
@@ -140,7 +156,7 @@ def test_single_embedding_query():
 
         canonical_vector = CANONICAL_VECTOR_VALUES[model_name][task_id]["vectors"]
         assert np.allclose(
-            embeddings[: len(docs), : canonical_vector.shape[1]], canonical_vector, atol=1e-4
+            embeddings[:, : canonical_vector.shape[1]], canonical_vector, atol=1e-4
         ), model_desc.model
 
         if is_ci:
@@ -172,7 +188,7 @@ def test_single_embedding_passage():
 
         canonical_vector = CANONICAL_VECTOR_VALUES[model_name][task_id]["vectors"]
         assert np.allclose(
-            embeddings[: len(docs), : canonical_vector.shape[1]], canonical_vector, atol=1e-4
+            embeddings[:, : canonical_vector.shape[1]], canonical_vector, atol=1e-4
         ), model_desc.model
 
         if is_ci: