diff --git a/integrations/isaacus/CHANGELOG.md b/integrations/isaacus/CHANGELOG.md new file mode 100644 index 000000000..c711e1038 --- /dev/null +++ b/integrations/isaacus/CHANGELOG.md @@ -0,0 +1,4 @@ +# Changelog + +## 0.1.0 +- Add IsaacusTextEmbedder and IsaacusDocumentEmbedder components. diff --git a/integrations/isaacus/README.md b/integrations/isaacus/README.md new file mode 100644 index 000000000..a02527061 --- /dev/null +++ b/integrations/isaacus/README.md @@ -0,0 +1,10 @@ +# isaacus-haystack + +- [Integration page](https://haystack.deepset.ai/integrations/isaacus) +- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/isaacus/CHANGELOG.md) + +--- + +## Contributing + +Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md). \ No newline at end of file diff --git a/integrations/isaacus/pyproject.toml b/integrations/isaacus/pyproject.toml new file mode 100644 index 000000000..1a4346727 --- /dev/null +++ b/integrations/isaacus/pyproject.toml @@ -0,0 +1,43 @@ +[project] +name = "isaacus-haystack" +version = "0.1.0" +description = "Kanon 2 (Isaacus) embedders for Haystack" +readme = "README.md" +requires-python = ">=3.9" +license = {text = "Apache-2.0"} +authors = [{name = "Isaacus"}] +dependencies = [ + "haystack-ai>=2.14.0", + "requests>=2.31.0", +] + +[project.urls] +Homepage = "https://haystack.deepset.ai/integrations" +Documentation = "https://docs.isaacus.com/capabilities/embedding" + +[build-system] +requires = ["hatchling>=1.21.0"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.pytest.ini_options] +addopts = "-q" + +[tool.ruff] +line-length = 120 +select = ["E", "F", "I", "UP", "B", "PL"] + +[tool.mypy] +ignore_missing_imports = true + +[tool.hatch.envs.test] +dependencies = [ + "pytest", + "haystack-ai>=2.14.0", + "requests>=2.31.0", +] + 
+[tool.hatch.envs.test.scripts] +all = "pytest -q" diff --git a/integrations/isaacus/src/haystack_integrations/components/embedders/isaacus/__init__.py b/integrations/isaacus/src/haystack_integrations/components/embedders/isaacus/__init__.py new file mode 100644 index 000000000..9a6c8a0fb --- /dev/null +++ b/integrations/isaacus/src/haystack_integrations/components/embedders/isaacus/__init__.py @@ -0,0 +1,4 @@ +from .text_embedder import IsaacusTextEmbedder +from .document_embedder import IsaacusDocumentEmbedder + +__all__ = ["IsaacusTextEmbedder", "IsaacusDocumentEmbedder"] \ No newline at end of file diff --git a/integrations/isaacus/src/haystack_integrations/components/embedders/isaacus/document_embedder.py b/integrations/isaacus/src/haystack_integrations/components/embedders/isaacus/document_embedder.py new file mode 100644 index 000000000..b29f12661 --- /dev/null +++ b/integrations/isaacus/src/haystack_integrations/components/embedders/isaacus/document_embedder.py @@ -0,0 +1,57 @@ +from __future__ import annotations +from typing import List, Optional +from haystack import component +from haystack.dataclasses import Document +from haystack.utils import Secret +from .utils import IsaacusClient + + +@component +class IsaacusDocumentEmbedder: + """ + Embeds a list of Haystack `Document`s using Isaacus (configurable model). + Writes vectors to `document.embedding` and returns the list under `documents`. + + Parameters mirror IsaacusTextEmbedder, with an additional `batch_size`. 
@component
class IsaacusDocumentEmbedder:
    """
    Embed a list of Haystack `Document`s using an Isaacus embedding model.

    Every document whose `content` is non-blank gets its vector written to
    `document.embedding` (the documents are updated in place). The list that
    was passed in is returned under the `documents` key; documents with empty
    or whitespace-only content are returned untouched.

    Parameters
    ----------
    api_key : Secret
        Isaacus API key (default reads the ISAACUS_API_KEY env var).
    base_url : str
        Isaacus API base URL.
    model : str
        Embedding model name (e.g., "kanon-2-embedder").
    task : str
        Embedding task name ("retrieval/document" by default).
    dimensions : Optional[int]
        Optional output dimensionality; omitted from the request when None.
    overflow_strategy : Optional[str]
        Truncation strategy for long inputs (e.g., "drop_end").
    batch_size : int
        Number of documents sent per request; clamped to the range 1-128.
    timeout : int
        HTTP timeout in seconds.
    """

    def __init__(
        self,
        *,
        api_key: Secret = Secret.from_env_var("ISAACUS_API_KEY"),
        base_url: str = "https://api.isaacus.com/v1",
        model: str = "kanon-2-embedder",
        task: str = "retrieval/document",
        dimensions: Optional[int] = None,
        overflow_strategy: Optional[str] = "drop_end",
        batch_size: int = 128,
        timeout: int = 30,
    ):
        # Keep every init argument on the instance so the component can be
        # serialized and rebuilt with the same configuration (see `to_dict`).
        self.api_key = api_key
        self.base_url = base_url
        self.model = model
        self.task = task
        self.dimensions = dimensions
        self.overflow_strategy = overflow_strategy
        self.batch_size = max(1, min(128, batch_size))
        self.timeout = timeout
        self._client = IsaacusClient(api_key.resolve_value(), base_url, timeout)

    def to_dict(self) -> dict:
        """
        Serialize the component to a dictionary.

        The API key is stored through `Secret.to_dict()`, so the key itself is
        never written out for environment-variable based secrets.
        """
        # Function-scope import keeps this class independent of the module's
        # import block; hoist it to the top of the module if preferred.
        from haystack import default_to_dict

        return default_to_dict(
            self,
            api_key=self.api_key.to_dict(),
            base_url=self.base_url,
            model=self.model,
            task=self.task,
            dimensions=self.dimensions,
            overflow_strategy=self.overflow_strategy,
            batch_size=self.batch_size,
            timeout=self.timeout,
        )

    @classmethod
    def from_dict(cls, data: dict) -> "IsaacusDocumentEmbedder":
        """Rebuild the component from a dictionary produced by `to_dict`."""
        from haystack import default_from_dict
        from haystack.utils import deserialize_secrets_inplace

        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)

    @component.output_types(documents=List[Document])
    def run(self, documents: List[Document]):
        """
        Embed `documents` and return them under the `documents` key.

        Raises
        ------
        TypeError
            If `documents` is not a list of `Document` objects.
        ValueError
            If the API returns a different number of vectors than documents sent.
        """
        if not documents:
            return {"documents": []}
        if not isinstance(documents, list) or not isinstance(documents[0], Document):
            raise TypeError(
                "IsaacusDocumentEmbedder expects a list of Documents as input. "
                "In case you want to embed a string, please use the IsaacusTextEmbedder."
            )

        # Only embed non-empty docs
        to_embed = [doc for doc in documents if (doc.content or "").strip()]

        # Collect every vector before touching any document, so a failure in a
        # later batch does not leave the input half-embedded.
        embeddings: List[List[float]] = []
        for start in range(0, len(to_embed), self.batch_size):
            batch = to_embed[start : start + self.batch_size]
            vectors = self._client.embeddings_create(
                model=self.model,
                texts=[doc.content for doc in batch],
                task=self.task,
                dimensions=self.dimensions,
                overflow_strategy=self.overflow_strategy,
            )
            # zip() would silently drop the unmatched documents on a short response.
            if len(vectors) != len(batch):
                raise ValueError(f"Isaacus API returned {len(vectors)} embeddings for {len(batch)} documents.")
            embeddings.extend(vectors)

        for doc, vector in zip(to_embed, embeddings):
            doc.embedding = vector

        return {"documents": documents}
@component
class IsaacusTextEmbedder:
    """
    Embeds a text string into a vector using Isaacus (configurable model).
    Returns a single vector under the key `embedding`.

    Parameters
    ----------
    api_key : Secret
        Isaacus API key (default reads ISAACUS_API_KEY env var).
    base_url : str
        Isaacus API base URL.
    model : str
        Embedding model name (e.g., "kanon-2-embedder").
    task : str
        Embedding task name ("retrieval/query" by default for queries).
    dimensions : Optional[int]
        Optional output dimensionality (e.g., 1792, 1024, 768...).
    overflow_strategy : Optional[str]
        Truncation strategy for long inputs (e.g., "drop_end").
    timeout : int
        HTTP timeout in seconds.
    """

    def __init__(
        self,
        *,
        api_key: Secret = Secret.from_env_var("ISAACUS_API_KEY"),
        base_url: str = "https://api.isaacus.com/v1",
        model: str = "kanon-2-embedder",
        task: str = "retrieval/query",
        dimensions: Optional[int] = None,
        overflow_strategy: Optional[str] = "drop_end",
        timeout: int = 30,
    ):
        # Keep every init argument on the instance so the component can be
        # serialized and rebuilt with the same configuration (see `to_dict`).
        self.api_key = api_key
        self.base_url = base_url
        self.model = model
        self.task = task
        self.dimensions = dimensions
        self.overflow_strategy = overflow_strategy
        self.timeout = timeout
        self._client = IsaacusClient(api_key.resolve_value(), base_url, timeout)

    def to_dict(self) -> dict:
        """
        Serialize the component to a dictionary.

        The API key is stored through `Secret.to_dict()`, so the key itself is
        never written out for environment-variable based secrets.
        """
        # Function-scope import keeps this class independent of the module's
        # import block; hoist it to the top of the module if preferred.
        from haystack import default_to_dict

        return default_to_dict(
            self,
            api_key=self.api_key.to_dict(),
            base_url=self.base_url,
            model=self.model,
            task=self.task,
            dimensions=self.dimensions,
            overflow_strategy=self.overflow_strategy,
            timeout=self.timeout,
        )

    @classmethod
    def from_dict(cls, data: dict) -> "IsaacusTextEmbedder":
        """Rebuild the component from a dictionary produced by `to_dict`."""
        from haystack import default_from_dict
        from haystack.utils import deserialize_secrets_inplace

        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)

    @component.output_types(embedding=List[float])
    def run(self, text: str):
        """
        Embed `text` and return the vector under the `embedding` key.

        Empty or whitespace-only input returns an empty embedding without
        calling the API.

        Raises
        ------
        TypeError
            If `text` is a non-empty value that is not a string.
        ValueError
            If the API response contains no embedding for the text.
        """
        # Falsy input keeps its original behavior (empty embedding, no request).
        if not text:
            return {"embedding": []}
        if not isinstance(text, str):
            raise TypeError(
                "IsaacusTextEmbedder expects a string as input. "
                "In case you want to embed a list of Documents, please use the IsaacusDocumentEmbedder."
            )
        if not text.strip():
            return {"embedding": []}

        vectors = self._client.embeddings_create(
            model=self.model,
            texts=[text],
            task=self.task,
            dimensions=self.dimensions,
            overflow_strategy=self.overflow_strategy,
        )
        # Fail with a clear message instead of a bare IndexError on vectors[0].
        if not vectors:
            raise ValueError("Isaacus API returned no embedding for the given text.")
        return {"embedding": vectors[0]}
class IsaacusClient:
    """
    Minimal HTTP client for the Isaacus `/embeddings` endpoint.

    Parameters
    ----------
    api_key : str
        API key sent as a bearer token in the `Authorization` header.
    base_url : str
        API base URL; a trailing slash is stripped.
    timeout : int
        Timeout, in seconds, passed to each POST request.
    session : Optional[Any]
        Optional object exposing a `post(url, json=..., headers=..., timeout=...)`
        method (for example a `requests.Session`, to reuse connections).
        When None, `requests.post` is used.
    """

    def __init__(
        self,
        api_key: str,
        base_url: str = "https://api.isaacus.com/v1",
        timeout: int = 30,
        session: Optional[Any] = None,
    ):
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self._session = session

    def embeddings_create(
        self,
        *,
        model: str,
        texts: List[str],
        task: Optional[str] = None,
        dimensions: Optional[int] = None,
        overflow_strategy: Optional[str] = None,
        extra_headers: Optional[Dict[str, str]] = None,
    ) -> List[List[float]]:
        """
        Request embeddings for `texts` and return one vector per text, in input order.

        Optional arguments (`task`, `dimensions`, `overflow_strategy`) are only
        added to the request payload when set. `extra_headers` are merged over
        the default headers. An empty `texts` list returns `[]` without making
        a request.

        Raises
        ------
        ValueError
            If the response does not contain exactly one embedding per text.
        Exception
            Whatever `raise_for_status()` raises for a non-success HTTP status
            (`requests.HTTPError` when using `requests`).
        """
        if not texts:
            return []

        payload: Dict[str, Any] = {"model": model, "texts": texts}
        if task:
            payload["task"] = task
        if dimensions is not None:
            payload["dimensions"] = int(dimensions)
        if overflow_strategy:
            payload["overflow_strategy"] = overflow_strategy

        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
        if extra_headers:
            headers.update(extra_headers)

        # Resolve `requests.post` at call time so patching it in tests keeps working.
        post = requests.post if self._session is None else self._session.post
        resp = post(f"{self.base_url}/embeddings", json=payload, headers=headers, timeout=self.timeout)
        resp.raise_for_status()
        return self._parse_embeddings(resp.json(), expected=len(texts))

    @staticmethod
    def _parse_embeddings(data: Any, expected: int) -> List[List[float]]:
        """Extract the vectors from a response body, checking count and restoring input order."""
        items = data.get("embeddings") if isinstance(data, dict) else None
        if not isinstance(items, list) or len(items) != expected:
            received = len(items) if isinstance(items, list) else 0
            raise ValueError(f"Isaacus API returned {received} embeddings for {expected} input texts.")
        # NOTE(review): items appear to carry an `index` giving their position in
        # the input; confirm against the API reference. Sort only when every item
        # has one, otherwise keep the response order as before.
        if all(isinstance(item.get("index"), int) for item in items):
            items = sorted(items, key=lambda item: item["index"])
        return [item["embedding"] for item in items]
def _fake_post(*_args, **kwargs):
    """
    Stand-in for `requests.post`.

    Returns a response whose JSON holds, for each text in the request payload,
    a 4-element vector filled with that text's length as a float.
    """
    texts = kwargs.get("json", {}).get("texts", [])

    class _Resp:
        def raise_for_status(self): ...

        def json(self):
            return {"embeddings": [{"embedding": [float(len(t))] * 4} for t in texts]}

    return _Resp()


def test_text_embedder_runs_and_returns_vector():
    with patch("requests.post", _fake_post):
        emb = IsaacusTextEmbedder(api_key=Secret.from_token("x"), model="kanon-2-embedder")
        out = emb.run("hello")

    assert "embedding" in out
    assert isinstance(out["embedding"], list)
    # The fake encodes the text length, so the exact vector is known.
    assert out["embedding"] == [5.0] * 4


def test_document_embedder_sets_embeddings_on_documents():
    with patch("requests.post", _fake_post):
        docs = [Document(content="a"), Document(content="bb"), Document(content="")]
        emb = IsaacusDocumentEmbedder(api_key=Secret.from_token("x"), batch_size=2, model="kanon-2-embedder")
        out = emb.run(docs)

    docs2 = out["documents"]
    assert len(docs2) == 3
    # Exact values prove each vector landed on the document it was computed for.
    assert docs2[0].embedding == [1.0] * 4
    assert docs2[1].embedding == [2.0] * 4
    # empty doc is skipped and keeps its default embedding
    assert docs2[2].embedding is None