Skip to content

Commit 02b21cb

Browse files
authored
feat: Pull in knowledge store improvements (#446)
- Improved concurrency of keyword linking. - HREF->URL linking. - Update README with more details on how to use. Other changes: - Use `python-dotenv` to load `.env` to simplify test execution. - Remove pinned `requests` since `docker` has fixed the issue.
1 parent 8db06d9 commit 02b21cb

File tree

7 files changed

+294
-60
lines changed

7 files changed

+294
-60
lines changed

libs/knowledge-store/README.md

Lines changed: 111 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,117 @@
22

33
Hybrid Knowledge Store combining vector similarity and edges between chunks.
44

5-
## Documentation
5+
## Usage
66

7-
[DataStax RAGStack Documentation](https://docs.datastax.com/en/ragstack/docs/index.html)
7+
1. Pre-process your documents to populate `metadata` information.
8+
1. Create a Hybrid `KnowledgeStore` and add your LangChain `Document`s.
9+
1. Retrieve documents from the `KnowledgeStore`.
810

9-
[Quickstart](https://docs.datastax.com/en/ragstack/docs/quickstart.html)
11+
### Populate Metadata
1012

11-
[Examples](https://docs.datastax.com/en/ragstack/docs/examples/index.html)
13+
The Knowledge Store makes use of the following metadata fields on each `Document`:
14+
15+
- `content_id`: If assigned, this specifies the unique ID of the `Document`.
16+
If not assigned, one will be generated.
17+
This should be set if you may re-ingest the same document so that it is overwritten rather than being duplicated.
18+
- `parent_content_id`: If this `Document` is a chunk of a larger document, you may reference the parent content here.
19+
- `keywords`: A list of strings representing keywords present in this `Document`.
20+
- `hrefs`: A list of strings containing the URLs which this `Document` links to.
21+
- `urls`: A list of strings containing the URLs associated with this `Document`.
22+
If one webpage is divided into multiple chunks, each chunk's `Document` would have the same URL.
23+
One webpage may have multiple URLs if it is available in multiple ways.
24+
25+
#### Keywords
26+
27+
To link documents with common keywords, assign the `keywords` metadata of each `Document`.
28+
29+
There are various ways to assign keywords to each `Document`, such as TF-IDF across the documents.
30+
One easy option is to use [KeyBERT](https://maartengr.github.io/KeyBERT/index.html).
31+
32+
Once installed with `pip install keybert`, you can add keywords to a list `documents` as follows:
33+
34+
```python
35+
from keybert import KeyBERT
36+
37+
kw_model = KeyBERT()
38+
keywords = kw_model.extract_keywords([doc.page_content for doc in documents],
39+
stop_words='english')
40+
41+
for (doc, kws) in zip(documents, keywords):
42+
doc.metadata["keywords"] = [kw for (kw, _distance) in kws]
43+
```
44+
45+
Rather than taking all the top keywords, you could also limit to those with less than a certain `_distance` to the document.
46+
47+
#### Hyperlinks
48+
49+
To capture hyperlinks, populate the `hrefs` and `urls` metadata fields of each `Document`.
50+
51+
```python
52+
import re
53+
link_re = re.compile("href=\"([^\"]+)")
54+
for doc in documents:
55+
doc.metadata["content_id"] = doc.metadata["source"]
56+
doc.metadata["hrefs"] = list(link_re.findall(doc.page_content))
57+
doc.metadata["urls"] = [doc.metadata["source"]]
58+
```
59+
60+
### Store
61+
62+
```python
63+
import cassio
64+
from langchain_openai import OpenAIEmbeddings
65+
from ragstack_knowledge_store import KnowledgeStore
66+
67+
cassio.init(auto=True)
68+
69+
knowledge_store = KnowledgeStore(embeddings=OpenAIEmbeddings())
70+
71+
# Store the documents
72+
knowledge_store.add_documents(documents)
73+
```
74+
75+
### Retrieve
76+
77+
```python
78+
from langchain_openai import ChatOpenAI
79+
80+
llm = ChatOpenAI(model="gpt-4o")
81+
82+
# Retrieve and generate using the relevant snippets from the knowledge store.
83+
from langchain_core.runnables import RunnablePassthrough
84+
from langchain_core.output_parsers import StrOutputParser
85+
from langchain_core.prompts import ChatPromptTemplate
86+
87+
# Depth 0 - don't traverse edges; equivalent to vector-only search.
88+
# Depth 1 - vector search plus 1 level of edges
89+
retriever = knowledge_store.as_retriever(k=4, depth=1)
90+
91+
template = """You are a helpful technical support bot. You should provide complete answers explaining the options the user has available to address their problem. Answer the question based only on the following context:
92+
{context}
93+
94+
Question: {question}
95+
"""
96+
prompt = ChatPromptTemplate.from_template(template)
97+
98+
def format_docs(docs):
99+
formatted = "\n\n".join(f"From {doc.metadata['content_id']}: {doc.page_content}" for doc in docs)
100+
return formatted
101+
102+
103+
rag_chain = (
104+
{"context": retriever | format_docs, "question": RunnablePassthrough()}
105+
| prompt
106+
| llm
107+
| StrOutputParser()
108+
)
109+
```
110+
111+
## Development
112+
113+
```shell
114+
poetry install --with=dev
115+
116+
# Run Tests
117+
poetry run pytest
118+
```

libs/knowledge-store/pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ langchain-community = "^0.2"
2323
ipykernel = "^6.29.4"
2424
langchain-openai = "^0.1.7"
2525
testcontainers = "~3.7.1"
26-
# https://github.com/psf/requests/issues/6707
27-
requests = "<=2.31.0"
26+
precisely = "^0.1.9"
2827
setuptools = "^70.0.0"
28+
python-dotenv = "^1.0.1"
2929

3030
[build-system]
3131
requires = ["poetry-core"]

libs/knowledge-store/ragstack_knowledge_store/concurrency.py

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import contextlib
22
import threading
33
from types import TracebackType
4-
from typing import Any, Optional, Tuple, Type
4+
from typing import Any, Callable, NamedTuple, Optional, Sequence, Tuple, Type
55

66
from cassandra.cluster import ResponseFuture, Session
77
from cassandra.query import PreparedStatement
@@ -15,26 +15,47 @@ def __init__(self, session: Session, *, concurrency: int = 20) -> None:
1515
self._semaphore = threading.Semaphore(concurrency)
1616
self._completion = threading.Condition()
1717

18+
self._pending = 0
19+
1820
self._error = None
1921

20-
def _handle_result(self, _result: Any):
21-
self._semaphore.release()
22-
with self._completion:
23-
self._completion.notify()
22+
def _handle_result(self,
23+
result: Sequence[NamedTuple],
24+
future: ResponseFuture,
25+
callback: Optional[Callable[[Sequence[NamedTuple]], Any]]):
26+
if callback is not None:
27+
callback(result)
28+
29+
if future.has_more_pages:
30+
future.start_fetching_next_page()
31+
else:
32+
self._semaphore.release()
33+
with self._completion:
34+
self._pending -= 1
35+
if self._pending == 0:
36+
self._completion.notify()
2437

2538
def _handle_error(self, error):
2639
with self._completion:
2740
self._error = error
2841
self._completion.notify()
2942

30-
def execute(self, query: PreparedStatement, parameters: Optional[Tuple] = None):
43+
def execute(self,
44+
query: PreparedStatement,
45+
parameters: Optional[Tuple] = None,
46+
callback: Optional[Callable[[Sequence[NamedTuple]], Any]] = None):
3147
with self._completion:
48+
self._pending += 1
3249
if self._error is not None:
3350
return
3451

3552
self._semaphore.acquire()
3653
future: ResponseFuture = self._session.execute_async(query, parameters)
37-
future.add_callbacks(self._handle_result, self._handle_error)
54+
future.add_callbacks(self._handle_result, self._handle_error,
55+
callback_kwargs={
56+
"future": future,
57+
"callback": callback,
58+
})
3859

3960
def __enter__(self) -> "ConcurrentQueries":
4061
return super().__enter__()
@@ -46,7 +67,8 @@ def __exit__(
4667
_exc_traceback: Optional[TracebackType],
4768
) -> bool:
4869
with self._completion:
49-
self._completion.wait()
70+
while self._error is None and self._pending > 0:
71+
self._completion.wait()
5072

5173
if self._error is not None:
5274
raise self._error
Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,4 @@
11
from enum import Enum
2-
from typing import Optional, Set
3-
4-
from langchain_core.pydantic_v1 import BaseModel, Field
5-
62

73
class Kind(str, Enum):
84
document = "document"
@@ -25,15 +21,4 @@ class Kind(str, Enum):
2521
"""An image within a document."""
2622

2723
table = "table"
28-
"""A table within a document."""
29-
30-
31-
class Content(BaseModel):
32-
source_id: str
33-
content_id: str
34-
parent_id: Optional[str] = None
35-
kind: Kind
36-
keywords: Set[str] = Field(default_factory=set)
37-
urls: Set[str] = Field(default_factory=set)
38-
links: Set[str] = Field(default_factory=set)
39-
text_content: Optional[str] = None
24+
"""A table within a document."""

0 commit comments

Comments
 (0)