datastax
diff --git a/‎libs/knowledge-store/ragstack_knowledge_store/__init__.py
Lines changed: 3 additions & 2 deletions b/‎libs/knowledge-store/ragstack_knowledge_store/__init__.py
Lines changed: 3 additions & 2 deletions
diff --git a/‎libs/knowledge-store/ragstack_knowledge_store/base.py
Lines changed: 362 additions & 0 deletions b/‎libs/knowledge-store/ragstack_knowledge_store/base.py
Lines changed: 362 additions & 0 deletions
@@ -1,3 +1,4 @@
-from .knowledge_store import KnowledgeStore
+from .cassandra import CassandraKnowledgeStore
+from .base import KnowledgeStore
 
-__all__ = ["KnowledgeStore"]
+__all__ = ["CassandraKnowledgeStore", "KnowledgeStore"]
@@ -0,0 +1,362 @@
+from __future__ import annotations
+
+from abc import abstractmethod
+from typing import (
+    Any,
+    AsyncIterable,
+    ClassVar,
+    Collection,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+)
+
+from langchain_core.callbacks import (
+    AsyncCallbackManagerForRetrieverRun,
+    CallbackManagerForRetrieverRun,
+)
+from langchain_core.documents import Document
+from langchain_core.load import Serializable
+from langchain_core.runnables import run_in_executor
+from langchain_core.vectorstores import VectorStore, VectorStoreRetriever
+from pydantic import Field
+
+
+def _has_next(iterator: Iterator) -> bool:
+    """Return whether ``iterator`` can still produce an element.
+
+    Warning: when the iterator is not exhausted this consumes (and discards)
+    one element from it, so only call it once the remaining elements are no
+    longer needed.
+
+    Args:
+        iterator: The iterator to probe.
+
+    Returns:
+        True if an element was available, False if the iterator is exhausted.
+    """
+    # A fresh object() can never be produced by the iterator, so an identity
+    # check distinguishes "exhausted" from any legitimate value, including
+    # None and other falsy elements.
+    sentinel = object()
+    return next(iterator, sentinel) is not sentinel
+
+
+class Node(Serializable):
+    """Node in the KnowledgeStore graph."""
+
+    # Explicit ``None`` default: a bare ``Optional[str]`` annotation is only
+    # implicitly optional under pydantic v1 and is a required field under
+    # pydantic v2, which would contradict "generated ... if not set" below.
+    id: Optional[str] = None
+    """Unique ID for the node. Shall be generated by the KnowledgeStore if not set."""
+    metadata: dict = Field(default_factory=dict)
+    """Metadata for the node. May contain information used to link this node
+    with other nodes."""
+
+
+class TextNode(Node):
+    """Node in the KnowledgeStore graph whose content is a piece of text."""
+
+    text: str
+    """Text contained by the node"""
+
+
+def _texts_to_nodes(
+    texts: Iterable[str],
+    metadatas: Optional[Iterable[dict]],
+    ids: Optional[Iterable[str]],
+) -> Iterator[Node]:
+    """Lazily pair texts with their metadata and ids, yielding a TextNode each.
+
+    Args:
+        texts: The text of each node.
+        metadatas: Optional per-text metadata, consumed in step with ``texts``.
+            When not supplied (``None`` or an empty collection) every node
+            gets a fresh empty dict.
+        ids: Optional per-text ids, consumed in step with ``texts``. When not
+            supplied (``None`` or an empty collection) every node gets
+            ``id=None``.
+
+    Yields:
+        One ``TextNode`` per element of ``texts``.
+
+    Raises:
+        ValueError: If ``metadatas`` or ``ids`` is supplied and does not have
+            the same number of elements as ``texts``. Because this is a
+            generator, the error surfaces while the result is being iterated,
+            not when the function is called.
+    """
+    # Falsy inputs (None, empty collections) are treated as "not supplied".
+    metadatas_it = iter(metadatas) if metadatas else None
+    ids_it = iter(ids) if ids else None
+    for text in texts:
+        # Compare against None explicitly: an iterator object may define
+        # __len__/__bool__, so its truthiness is not a reliable "supplied" flag.
+        try:
+            _metadata = next(metadatas_it) if metadatas_it is not None else {}
+        except StopIteration:
+            # ``from None`` keeps the internal StopIteration out of the traceback.
+            raise ValueError("texts iterable longer than metadatas") from None
+        try:
+            _id = next(ids_it) if ids_it is not None else None
+        except StopIteration:
+            raise ValueError("texts iterable longer than ids") from None
+        yield TextNode(
+            id=_id,
+            metadata=_metadata,
+            text=text,
+        )
+    # Any leftover element means the side iterable was longer than ``texts``.
+    # _has_next consumes an element, which is fine since iteration is over.
+    if ids_it is not None and _has_next(ids_it):
+        raise ValueError("ids iterable longer than texts")
+    if metadatas_it is not None and _has_next(metadatas_it):
+        raise ValueError("metadatas iterable longer than texts")
+
+
+def _documents_to_nodes(
+    documents: Iterable[Document], ids: Optional[Iterable[str]]
+) -> Iterator[Node]:
+    """Lazily convert documents (and optional ids) into TextNodes.
+
+    Args:
+        documents: The documents to convert. Each node takes its text from
+            ``page_content`` and its metadata from ``metadata``.
+        ids: Optional per-document ids, consumed in step with ``documents``.
+            When not supplied (``None`` or an empty collection) every node
+            gets ``id=None``.
+
+    Yields:
+        One ``TextNode`` per element of ``documents``.
+
+    Raises:
+        ValueError: If ``ids`` is supplied and does not have the same number
+            of elements as ``documents``. Because this is a generator, the
+            error surfaces while the result is being iterated, not when the
+            function is called.
+    """
+    # Falsy inputs (None, empty collections) are treated as "not supplied".
+    ids_it = iter(ids) if ids else None
+    for doc in documents:
+        # Compare against None explicitly: an iterator object may define
+        # __len__/__bool__, so its truthiness is not a reliable "supplied" flag.
+        try:
+            _id = next(ids_it) if ids_it is not None else None
+        except StopIteration:
+            # ``from None`` keeps the internal StopIteration out of the traceback.
+            raise ValueError("documents iterable longer than ids") from None
+        yield TextNode(
+            id=_id,
+            metadata=doc.metadata,
+            text=doc.page_content,
+        )
+    # A leftover id means ``ids`` was longer than ``documents``.
+    if ids_it is not None and _has_next(ids_it):
+        raise ValueError("ids iterable longer than documents")
+
+
+class KnowledgeStore(VectorStore):
+    """A hybrid vector-and-graph knowledge store.
+
+    Document chunks support vector-similarity search as well as edges linking
+    chunks based on structural and semantic properties.
+
+    Subclasses must implement ``add_nodes`` and ``traversing_retrieve``; the
+    text/document ingestion helpers, the async variants and the similarity
+    search methods defined here are all expressed in terms of those two.
+    """
+
+    @abstractmethod
+    def add_nodes(
+        self,
+        nodes: Iterable[Node],
+        **kwargs: Any,
+    ) -> List[str]:
+        """Add nodes to the knowledge store.
+
+        Args:
+            nodes: the nodes to add.
+            **kwargs: Implementation-specific options.
+
+        Returns:
+            A list of strings (presumably the ids of the added nodes; the
+            exact contract is defined by the implementation).
+        """
+
+    async def aadd_nodes(
+        self,
+        nodes: Iterable[Node],
+        **kwargs: Any,
+    ) -> List[str]:
+        """Add nodes to the knowledge store without blocking the event loop.
+
+        The default implementation runs the synchronous ``add_nodes`` through
+        ``run_in_executor``.
+
+        Args:
+            nodes: the nodes to add.
+            **kwargs: Forwarded to ``add_nodes``.
+
+        Returns:
+            Whatever ``add_nodes`` returns.
+        """
+        return await run_in_executor(None, self.add_nodes, nodes, **kwargs)
+
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[Iterable[dict]] = None,
+        *,
+        ids: Optional[Iterable[str]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Wrap texts into text nodes and add them through ``add_nodes``.
+
+        Args:
+            texts: The texts to add.
+            metadatas: Optional metadata, one entry per text.
+            ids: Optional ids, one entry per text.
+            **kwargs: Forwarded to ``add_nodes``.
+
+        Returns:
+            Whatever ``add_nodes`` returns.
+        """
+        # The conversion is lazy: length mismatches between texts, metadatas
+        # and ids are only detected while add_nodes consumes the nodes.
+        nodes = _texts_to_nodes(texts, metadatas, ids)
+        return self.add_nodes(nodes, **kwargs)
+
+    async def aadd_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[Iterable[dict]] = None,
+        *,
+        ids: Optional[Iterable[str]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Async counterpart of ``add_texts``, delegating to ``aadd_nodes``.
+
+        Args:
+            texts: The texts to add.
+            metadatas: Optional metadata, one entry per text.
+            ids: Optional ids, one entry per text.
+            **kwargs: Forwarded to ``aadd_nodes``.
+
+        Returns:
+            Whatever ``aadd_nodes`` returns.
+        """
+        nodes = _texts_to_nodes(texts, metadatas, ids)
+        return await self.aadd_nodes(nodes, **kwargs)
+
+    def add_documents(
+        self,
+        documents: Iterable[Document] = None,
+        *,
+        ids: Optional[Iterable[str]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Wrap documents into text nodes and add them through ``add_nodes``.
+
+        Args:
+            documents: The documents to add. Despite the ``None`` default, a
+                value must be supplied: ``None`` cannot be iterated and fails
+                once the nodes are consumed.
+            ids: Optional ids, one entry per document.
+            **kwargs: Forwarded to ``add_nodes``.
+
+        Returns:
+            Whatever ``add_nodes`` returns.
+        """
+        nodes = _documents_to_nodes(documents, ids)
+        return self.add_nodes(nodes, **kwargs)
+
+    async def aadd_documents(
+        self,
+        documents: Iterable[Document] = None,
+        *,
+        ids: Optional[Iterable[str]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Async counterpart of ``add_documents``, delegating to ``aadd_nodes``.
+
+        Args:
+            documents: The documents to add. Despite the ``None`` default, a
+                value must be supplied: ``None`` cannot be iterated and fails
+                once the nodes are consumed.
+            ids: Optional ids, one entry per document.
+            **kwargs: Forwarded to ``aadd_nodes``.
+
+        Returns:
+            Whatever ``aadd_nodes`` returns.
+        """
+        nodes = _documents_to_nodes(documents, ids)
+        return await self.aadd_nodes(nodes, **kwargs)
+
+    @abstractmethod
+    def traversing_retrieve(
+        self,
+        query: str,
+        *,
+        k: int = 4,
+        depth: int = 1,
+        **kwargs: Any,
+    ) -> Iterable[Document]:
+        """Retrieve documents from traversing this knowledge store.
+
+        First, `k` nodes are retrieved using a search for the `query` string.
+        Then, additional nodes are discovered up to the given `depth` from those
+        starting nodes.
+
+        Args:
+            query: The query string.
+            k: The number of Documents to return from the initial search.
+                Defaults to 4.
+            depth: The maximum depth of edges to traverse. Defaults to 1.
+        Returns:
+            Retrieved documents.
+        """
+
+    async def atraversing_retrieve(
+        self,
+        query: str,
+        *,
+        k: int = 4,
+        depth: int = 1,
+        **kwargs: Any,
+    ) -> AsyncIterable[Document]:
+        """Retrieve documents from traversing this knowledge store.
+
+        First, `k` nodes are retrieved using a search for the `query` string.
+        Then, additional nodes are discovered up to the given `depth` from those
+        starting nodes.
+
+        The default implementation runs the synchronous ``traversing_retrieve``
+        through ``run_in_executor`` and then yields the collected documents.
+
+        Args:
+            query: The query string.
+            k: The number of Documents to return from the initial search.
+                Defaults to 4.
+            depth: The maximum depth of edges to traverse. Defaults to 1.
+        Returns:
+            Retrieved documents.
+        """
+
+        def _collect() -> List[Document]:
+            # traversing_retrieve may return a lazy iterable; materialise it
+            # here so the iteration itself also happens inside the executor
+            # rather than on the event-loop thread.
+            return list(self.traversing_retrieve(query, k=k, depth=depth, **kwargs))
+
+        for doc in await run_in_executor(None, _collect):
+            yield doc
+
+    def similarity_search(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Document]:
+        """Return the `k` documents found by a traversal of depth 0.
+
+        Args:
+            query: The query string.
+            k: The number of Documents to return. Defaults to 4.
+            **kwargs: Accepted for interface compatibility; not forwarded to
+                ``traversing_retrieve``.
+
+        Returns:
+            The retrieved documents as a list.
+        """
+        return list(self.traversing_retrieve(query, k=k, depth=0))
+
+    async def asimilarity_search(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Document]:
+        """Async counterpart of ``similarity_search``.
+
+        Args:
+            query: The query string.
+            k: The number of Documents to return. Defaults to 4.
+            **kwargs: Accepted for interface compatibility; not forwarded to
+                ``atraversing_retrieve``.
+
+        Returns:
+            The retrieved documents as a list.
+        """
+        return [doc async for doc in self.atraversing_retrieve(query, k=k, depth=0)]
+
+    def search(self, query: str, search_type: str, **kwargs: Any) -> List[Document]:
+        """Dispatch a query to the search method selected by `search_type`.
+
+        Args:
+            query: The query string.
+            search_type: One of "similarity", "similarity_score_threshold",
+                "mmr" or "traversal".
+            **kwargs: Forwarded to the selected search method.
+
+        Returns:
+            The matching documents.
+
+        Raises:
+            ValueError: If `search_type` is not one of the supported values.
+        """
+        if search_type == "similarity":
+            return self.similarity_search(query, **kwargs)
+        elif search_type == "similarity_score_threshold":
+            docs_and_similarities = self.similarity_search_with_relevance_scores(
+                query, **kwargs
+            )
+            # Callers of search() only get the documents, not the scores.
+            return [doc for doc, _ in docs_and_similarities]
+        elif search_type == "mmr":
+            return self.max_marginal_relevance_search(query, **kwargs)
+        elif search_type == "traversal":
+            return list(self.traversing_retrieve(query, **kwargs))
+        else:
+            raise ValueError(
+                f"search_type of {search_type} not allowed. Expected "
+                "search_type to be 'similarity', 'similarity_score_threshold', "
+                "'mmr' or 'traversal'."
+            )
+
+    async def asearch(
+        self, query: str, search_type: str, **kwargs: Any
+    ) -> List[Document]:
+        """Async counterpart of ``search``.
+
+        Args:
+            query: The query string.
+            search_type: One of "similarity", "similarity_score_threshold",
+                "mmr" or "traversal".
+            **kwargs: Forwarded to the selected async search method.
+
+        Returns:
+            The matching documents.
+
+        Raises:
+            ValueError: If `search_type` is not one of the supported values.
+        """
+        if search_type == "similarity":
+            return await self.asimilarity_search(query, **kwargs)
+        elif search_type == "similarity_score_threshold":
+            docs_and_similarities = await self.asimilarity_search_with_relevance_scores(
+                query, **kwargs
+            )
+            # Callers of asearch() only get the documents, not the scores.
+            return [doc for doc, _ in docs_and_similarities]
+        elif search_type == "mmr":
+            return await self.amax_marginal_relevance_search(query, **kwargs)
+        elif search_type == "traversal":
+            return [doc async for doc in self.atraversing_retrieve(query, **kwargs)]
+        else:
+            raise ValueError(
+                f"search_type of {search_type} not allowed. Expected "
+                "search_type to be 'similarity', 'similarity_score_threshold', "
+                "'mmr' or 'traversal'."
+            )
+
+    def as_retriever(self, **kwargs: Any) -> "KnowledgeStoreRetriever":
+        """Return KnowledgeStoreRetriever initialized from this KnowledgeStore.
+
+        Args:
+            search_type (Optional[str]): Defines the type of search that
+                the Retriever should perform.
+                Can be "traversal" (default), "similarity", "mmr", or
+                "similarity_score_threshold".
+            search_kwargs (Optional[Dict]): Keyword arguments to pass to the
+                search function. Can include things like:
+                    k: Amount of documents to return (Default: 4)
+                    depth: The maximum depth of edges to traverse (Default: 1)
+                    score_threshold: Minimum relevance threshold
+                        for similarity_score_threshold
+                    fetch_k: Amount of documents to pass to MMR algorithm (Default: 20)
+                    lambda_mult: Diversity of results returned by MMR;
+                        1 for minimum diversity and 0 for maximum. (Default: 0.5)
+        Returns:
+            Retriever for this KnowledgeStore.
+
+        Examples:
+
+        .. code-block:: python
+
+            # Retrieve documents traversing edges
+            docsearch.as_retriever(
+                search_type="traversal",
+                search_kwargs={'k': 6, 'depth': 3}
+            )
+
+            # Retrieve more documents with higher diversity
+            # Useful if your dataset has many similar documents
+            docsearch.as_retriever(
+                search_type="mmr",
+                search_kwargs={'k': 6, 'lambda_mult': 0.25}
+            )
+
+            # Fetch more documents for the MMR algorithm to consider
+            # But only return the top 5
+            docsearch.as_retriever(
+                search_type="mmr",
+                search_kwargs={'k': 5, 'fetch_k': 50}
+            )
+
+            # Only retrieve documents that have a relevance score
+            # Above a certain threshold
+            docsearch.as_retriever(
+                search_type="similarity_score_threshold",
+                search_kwargs={'score_threshold': 0.8}
+            )
+
+            # Only get the single most similar document from the dataset
+            # (the default "traversal" search type would also follow edges)
+            docsearch.as_retriever(
+                search_type="similarity",
+                search_kwargs={'k': 1}
+            )
+
+        """
+        return KnowledgeStoreRetriever(vectorstore=self, **kwargs)
+
+
+class KnowledgeStoreRetriever(VectorStoreRetriever):
+    """Retriever backed by a KnowledgeStore.
+
+    Adds a "traversal" search type on top of the search types handled by
+    ``VectorStoreRetriever``; every other search type is delegated to the
+    parent class.
+    """
+
+    vectorstore: KnowledgeStore
+    """The KnowledgeStore queried by this retriever."""
+    search_type: str = "traversal"
+    """Which kind of search to run; "traversal" unless overridden."""
+    allowed_search_types: ClassVar[Collection[str]] = (
+        "similarity",
+        "similarity_score_threshold",
+        "mmr",
+        "traversal",
+    )
+
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+        """Fetch documents for `query` according to the configured search type."""
+        # Anything that is not a traversal is handled by the parent retriever.
+        if self.search_type != "traversal":
+            return super()._get_relevant_documents(query, run_manager=run_manager)
+        found = self.vectorstore.traversing_retrieve(query, **self.search_kwargs)
+        return list(found)
+
+    async def _aget_relevant_documents(
+        self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun
+    ) -> List[Document]:
+        """Async variant of ``_get_relevant_documents``."""
+        # Anything that is not a traversal is handled by the parent retriever.
+        if self.search_type != "traversal":
+            return await super()._aget_relevant_documents(
+                query, run_manager=run_manager
+            )
+        found: List[Document] = []
+        async for doc in self.vectorstore.atraversing_retrieve(
+            query, **self.search_kwargs
+        ):
+            found.append(doc)
+        return found