datastax
diff --git a/‎libs/knowledge-store/README.md
Lines changed: 111 additions & 4 deletions b/‎libs/knowledge-store/README.md
Lines changed: 111 additions & 4 deletions
diff --git a/‎libs/knowledge-store/pyproject.toml
Lines changed: 2 additions & 2 deletions b/‎libs/knowledge-store/pyproject.toml
Lines changed: 2 additions & 2 deletions
diff --git a/‎libs/knowledge-store/ragstack_knowledge_store/concurrency.py
Lines changed: 30 additions & 8 deletions b/‎libs/knowledge-store/ragstack_knowledge_store/concurrency.py
Lines changed: 30 additions & 8 deletions
diff --git a/‎libs/knowledge-store/ragstack_knowledge_store/content.py
Lines changed: 1 addition & 16 deletions b/‎libs/knowledge-store/ragstack_knowledge_store/content.py
Lines changed: 1 addition & 16 deletions
@@ -2,10 +2,117 @@
 
 Hybrid Knowledge Store combining vector similarity and edges between chunks.
 
-## Documentation
+## Usage
 
-[DataStax RAGStack Documentation](https://docs.datastax.com/en/ragstack/docs/index.html)
+1. Pre-process your documents to populate `metadata` information.
+1. Create a Hybrid `KnowledgeStore` and add your LangChain `Document`s.
+1. Retrieve documents from the `KnowledgeStore`.
 
-[Quickstart](https://docs.datastax.com/en/ragstack/docs/quickstart.html)
+### Populate Metadata
 
-[Examples](https://docs.datastax.com/en/ragstack/docs/examples/index.html)
+The Knowledge Store makes use of the following metadata fields on each `Document`:
+
+- `content_id`: If assigned, this specifies the unique ID of the `Document`.
+  If not assigned, one will be generated.
+  This should be set if you may re-ingest the same document so that it is overwritten rather than being duplicated.
+- `parent_content_id`: If this `Document` is a chunk of a larger document, you may reference the parent content here.
+- `keywords`: A list of strings representing keywords present in this `Document`.
+- `hrefs`: A list of strings containing the URLs which this `Document` links to.
+- `urls`: A list of strings containing the URLs associated with this `Document`.
+  If one webpage is divided into multiple chunks, each chunk's `Document` would have the same URL.
+  One webpage may have multiple URLs if it is available in multiple ways.
+
+#### Keywords
+
+To link documents with common keywords, assign the `keywords` metadata of each `Document`.
+
+There are various ways to assign keywords to each `Document`, such as TF-IDF across the documents.
+One easy option is to use [KeyBERT](https://maartengr.github.io/KeyBERT/index.html).
+
+Once installed with `pip install keybert`, you can add keywords to a list `documents` as follows:
+
+```python
+from keybert import KeyBERT
+
+kw_model = KeyBERT()
+keywords = kw_model.extract_keywords([doc.page_content for doc in documents],
+                                     stop_words='english')
+
+for (doc, kws) in zip(documents, keywords):
+    doc.metadata["keywords"] = [kw for (kw, _distance) in kws]
+```
+
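+Rather than taking all the top keywords, you could also keep only those whose score (the second element of each tuple, bound to `_distance` above) passes a threshold. Note that KeyBERT reports this value as a cosine similarity to the document, so a higher value means a more relevant keyword.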
+
+#### Hyperlinks
+
+To capture hyperlinks, populate the `hrefs` and `urls` metadata fields of each `Document`.
+
+```python
+import re
+link_re = re.compile("href=\"([^\"]+)")
+for doc in documents:
+    doc.metadata["content_id"] = doc.metadata["source"]
+    doc.metadata["hrefs"] = list(link_re.findall(doc.page_content))
+    doc.metadata["urls"] = [doc.metadata["source"]]
+```
+
+### Store
+
+```python
+import cassio
+from langchain_openai import OpenAIEmbeddings
+from ragstack_knowledge_store import KnowledgeStore
+
+cassio.init(auto=True)
+
+knowledge_store = KnowledgeStore(embeddings=OpenAIEmbeddings())
+
+# Store the documents
+knowledge_store.add_documents(documents)
+```
+
+### Retrieve
+
+```python
+from langchain_openai import ChatOpenAI
+
+llm = ChatOpenAI(model="gpt-4o")
+
+# Retrieve and generate using the relevant snippets of the blog.
+from langchain_core.runnables import RunnablePassthrough
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+
+# Depth 0 - don't traverse edges; equivalent to vector-only search.
+# Depth 1 - vector search plus 1 level of edges
+retriever = knowledge_store.as_retriever(k=4, depth=1)
+
+template = """You are a helpful technical support bot. You should provide complete answers explaining the options the user has available to address their problem. Answer the question based only on the following context:
+{context}
+
+Question: {question}
+"""
+prompt = ChatPromptTemplate.from_template(template)
+
+def format_docs(docs):
+    formatted = "\n\n".join(f"From {doc.metadata['content_id']}: {doc.page_content}" for doc in docs)
+    return formatted
+
+
+rag_chain = (
+    {"context": retriever | format_docs, "question": RunnablePassthrough()}
+    | prompt
+    | llm
+    | StrOutputParser()
+)
+```
+
+## Development
+
+```shell
+poetry install --with=dev
+
+# Run Tests
+poetry run pytest
+```
@@ -23,9 +23,9 @@ langchain-community = "^0.2"
 ipykernel = "^6.29.4"
 langchain-openai = "^0.1.7"
 testcontainers = "~3.7.1"
-# https://github.com/psf/requests/issues/6707
-requests = "<=2.31.0"
+precisely = "^0.1.9"
 setuptools = "^70.0.0"
+python-dotenv = "^1.0.1"
 
 [build-system]
 requires = ["poetry-core"]
 
@@ -1,7 +1,7 @@
 import contextlib
 import threading
 from types import TracebackType
-from typing import Any, Optional, Tuple, Type
+from typing import Any, Callable, NamedTuple, Optional, Sequence, Tuple, Type
 
 from cassandra.cluster import ResponseFuture, Session
 from cassandra.query import PreparedStatement
@@ -15,26 +15,47 @@ def __init__(self, session: Session, *, concurrency: int = 20) -> None:
         self._semaphore = threading.Semaphore(concurrency)
         self._completion = threading.Condition()
 
+        self._pending = 0
+
         self._error = None
 
-    def _handle_result(self, _result: Any):
-        self._semaphore.release()
-        with self._completion:
-            self._completion.notify()
+    def _handle_result(self,
+                       result: Sequence[NamedTuple],
+                       future: ResponseFuture,
+                       callback: Optional[Callable[[Sequence[NamedTuple]], Any]]):
+        if callback is not None:
+            callback(result)
+
+        if future.has_more_pages:
+            future.start_fetching_next_page()
+        else:
+            self._semaphore.release()
+            with self._completion:
+                self._pending -= 1
+                if self._pending == 0:
+                    self._completion.notify()
 
     def _handle_error(self, error):
         with self._completion:
             self._error = error
             self._completion.notify()
 
-    def execute(self, query: PreparedStatement, parameters: Optional[Tuple] = None):
+    def execute(self,
+                query: PreparedStatement,
+                parameters: Optional[Tuple] = None,
+                callback: Optional[Callable[[Sequence[NamedTuple]], Any]] = None):
         with self._completion:
+            self._pending += 1
             if self._error is not None:
                 return
 
         self._semaphore.acquire()
         future: ResponseFuture = self._session.execute_async(query, parameters)
-        future.add_callbacks(self._handle_result, self._handle_error)
+        future.add_callbacks(self._handle_result, self._handle_error,
+                             callback_kwargs={
+                                 "future": future,
+                                 "callback": callback,
+                             })
 
     def __enter__(self) -> "ConcurrentQueries":
         return super().__enter__()
@@ -46,7 +67,8 @@ def __exit__(
         _exc_traceback: Optional[TracebackType],
     ) -> bool:
         with self._completion:
-            self._completion.wait()
+            while self._error is None and self._pending > 0:
+                self._completion.wait()
 
         if self._error is not None:
             raise self._error
 
@@ -1,8 +1,4 @@
 from enum import Enum
-from typing import Optional, Set
-
-from langchain_core.pydantic_v1 import BaseModel, Field
-
 
 class Kind(str, Enum):
     document = "document"
@@ -25,15 +21,4 @@ class Kind(str, Enum):
     """An image within a document."""
 
     table = "table"
-    """A table within a document."""
-
-
-class Content(BaseModel):
-    source_id: str
-    content_id: str
-    parent_id: Optional[str] = None
-    kind: Kind
-    keywords: Set[str] = Field(default_factory=set)
-    urls: Set[str] = Field(default_factory=set)
-    links: Set[str] = Field(default_factory=set)
-    text_content: Optional[str] = None
+    """A table within a document."""