Skip to content

Commit 94cd787

Browse files
authored
feat: Eliminate the targets table (#588)
* feat: Eliminate the targets table. The targets table seems to be unnecessary: an SAI index supporting CONTAINS seems to be sufficient. Eliminating it removes the need to create the denormalized copy of the data, speeding up insertion and reducing storage needs.
1 parent 8c8014d commit 94cd787

File tree

5 files changed

+23
-64
lines changed

5 files changed

+23
-64
lines changed

libs/knowledge-store/notebooks/astra_support.ipynb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,6 @@
258258
"graph_store = CassandraGraphStore(\n",
259259
" embeddings,\n",
260260
" node_table=f\"{SITE_PREFIX}_nodes\",\n",
261-
" targets_table=f\"{SITE_PREFIX}_targets\",\n",
262261
")"
263262
]
264263
},

libs/knowledge-store/ragstack_knowledge_store/graph_store.py

Lines changed: 23 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,6 @@ def __init__(
127127
embedding: EmbeddingModel,
128128
*,
129129
node_table: str = "graph_nodes",
130-
targets_table: str = "graph_targets",
131130
session: Optional[Session] = None,
132131
keyspace: Optional[str] = None,
133132
setup_mode: SetupMode = SetupMode.SYNC,
@@ -141,12 +140,8 @@ def __init__(
141140
if not _CQL_IDENTIFIER_PATTERN.fullmatch(node_table):
142141
raise ValueError(f"Invalid node table name: {node_table}")
143142

144-
if not _CQL_IDENTIFIER_PATTERN.fullmatch(targets_table):
145-
raise ValueError(f"Invalid node table name: {targets_table}")
146-
147143
self._embedding = embedding
148144
self._node_table = node_table
149-
self._targets_table = targets_table
150145
self._session = session
151146
self._keyspace = keyspace
152147

@@ -163,16 +158,8 @@ def __init__(
163158
f"""
164159
INSERT INTO {keyspace}.{node_table} (
165160
content_id, kind, text_content, text_embedding, link_to_tags,
166-
metadata_blob, links_blob
167-
) VALUES (?, '{Kind.passage}', ?, ?, ?, ?, ?)
168-
""" # noqa: S608
169-
)
170-
171-
self._insert_tag = session.prepare(
172-
f"""
173-
INSERT INTO {keyspace}.{targets_table} (
174-
target_content_id, kind, tag, target_text_embedding, target_link_to_tags
175-
) VALUES (?, ?, ?, ?, ?)
161+
link_from_tags, metadata_blob, links_blob
162+
) VALUES (?, '{Kind.passage}', ?, ?, ?, ?, ?, ?)
176163
""" # noqa: S608
177164
)
178165

@@ -236,56 +223,44 @@ def __init__(
236223

237224
self._query_targets_embeddings_by_kind_and_tag_and_embedding = session.prepare(
238225
f"""
239-
SELECT target_content_id, target_text_embedding, tag, target_link_to_tags
240-
FROM {keyspace}.{targets_table}
241-
WHERE kind = ? AND tag = ?
242-
ORDER BY target_text_embedding ANN of ?
226+
SELECT
227+
content_id AS target_content_id,
228+
text_embedding AS target_text_embedding,
229+
link_to_tags AS target_link_to_tags
230+
FROM {keyspace}.{node_table}
231+
WHERE link_from_tags CONTAINS (?, ?)
232+
ORDER BY text_embedding ANN of ?
243233
LIMIT ?
244-
""" # noqa: S608
234+
"""
245235
)
246236

247237
self._query_targets_by_kind_and_value = session.prepare(
248238
f"""
249-
SELECT target_content_id, kind, tag
250-
FROM {keyspace}.{targets_table}
251-
WHERE kind = ? AND tag = ?
252-
""" # noqa: S608
239+
SELECT
240+
content_id AS target_content_id
241+
FROM {keyspace}.{node_table}
242+
WHERE link_from_tags CONTAINS (?, ?)
243+
"""
253244
)
254245

255246
def _apply_schema(self) -> None:
256247
"""Apply the schema to the database."""
257248
embedding_dim = len(self._embedding.embed_query("Test Query"))
258-
self._session.execute(
259-
f"""CREATE TABLE IF NOT EXISTS {self._keyspace}.{self._node_table} (
249+
self._session.execute(f"""
250+
CREATE TABLE IF NOT EXISTS {self._keyspace}.{self._node_table} (
260251
content_id TEXT,
261252
kind TEXT,
262253
text_content TEXT,
263254
text_embedding VECTOR<FLOAT, {embedding_dim}>,
264255
265256
link_to_tags SET<TUPLE<TEXT, TEXT>>,
257+
link_from_tags SET<TUPLE<TEXT, TEXT>>,
266258
metadata_blob TEXT,
267259
links_blob TEXT,
268260
269261
PRIMARY KEY (content_id)
270262
)
271-
"""
272-
)
273-
274-
self._session.execute(
275-
f"""CREATE TABLE IF NOT EXISTS {self._keyspace}.{self._targets_table} (
276-
target_content_id TEXT,
277-
kind TEXT,
278-
tag TEXT,
279-
280-
-- text_embedding of target node. allows MMR to be applied without
281-
-- fetching nodes.
282-
target_text_embedding VECTOR<FLOAT, {embedding_dim}>,
283-
target_link_to_tags SET<TUPLE<TEXT, TEXT>>,
284-
285-
PRIMARY KEY ((kind, tag), target_content_id)
286-
)
287-
"""
288-
)
263+
""")
289264

290265
# Index on text_embedding (for similarity search)
291266
self._session.execute(f"""
@@ -294,12 +269,11 @@ def _apply_schema(self) -> None:
294269
USING 'StorageAttachedIndex';
295270
""")
296271

297-
# Index on target_text_embedding (for similarity search)
298272
self._session.execute(f"""
299-
CREATE CUSTOM INDEX IF NOT EXISTS {self._targets_table}_target_text_embedding_index
300-
ON {self._keyspace}.{self._targets_table}(target_text_embedding)
273+
CREATE CUSTOM INDEX IF NOT EXISTS {self._node_table}_link_from_tags
274+
ON {self._keyspace}.{self._node_table}(link_from_tags)
301275
USING 'StorageAttachedIndex';
302-
""") # noqa: E501
276+
""")
303277

304278
def _concurrent_queries(self) -> ConcurrentQueries:
305279
return ConcurrentQueries(self._session)
@@ -348,17 +322,12 @@ def add_nodes(
348322
text,
349323
text_embedding,
350324
link_to_tags,
325+
link_from_tags,
351326
metadata_blob,
352327
links_blob,
353328
),
354329
)
355330

356-
for kind, value in link_from_tags:
357-
cq.execute(
358-
self._insert_tag,
359-
parameters=(node_id, kind, value, text_embedding, link_to_tags),
360-
)
361-
362331
return node_ids
363332

364333
def _nodes_with_ids(

libs/knowledge-store/tests/integration_tests/test_graph_store.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,11 @@ def _make_graph_store() -> GraphStore:
5151
name = secrets.token_hex(8)
5252

5353
node_table = f"nodes_{name}"
54-
targets_table = f"targets_{name}"
5554
return GraphStore(
5655
embedding,
5756
session=session,
5857
keyspace=KEYSPACE,
5958
node_table=node_table,
60-
targets_table=targets_table,
6159
)
6260

6361
yield _make_graph_store

libs/langchain/ragstack_langchain/graph_store/cassandra.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ def __init__(
3939
embedding: Embeddings,
4040
*,
4141
node_table: str = "graph_nodes",
42-
targets_table: str = "graph_targets",
4342
session: Optional[Session] = None,
4443
keyspace: Optional[str] = None,
4544
setup_mode: SetupMode = SetupMode.SYNC,
@@ -63,7 +62,6 @@ def __init__(
6362
self.store = graph_store.GraphStore(
6463
embedding=_EmbeddingModelAdapter(embedding),
6564
node_table=node_table,
66-
targets_table=targets_table,
6765
session=session,
6866
keyspace=keyspace,
6967
setup_mode=_setup_mode,

libs/langchain/tests/integration_tests/test_graph_store.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ def __init__(self, session: Session, keyspace: str, embedding: Embeddings) -> No
2020
self.keyspace = keyspace
2121
self.uid = secrets.token_hex(8)
2222
self.node_table = f"nodes_{self.uid}"
23-
self.targets_table = f"targets_{self.uid}"
2423
self.embedding = embedding
2524
self._store = None
2625

@@ -39,17 +38,13 @@ def store(
3938
session=self.session,
4039
keyspace=self.keyspace,
4140
node_table=self.node_table,
42-
targets_table=self.targets_table,
4341
ids=ids,
4442
)
4543

4644
return self._store
4745

4846
def drop(self):
4947
self.session.execute(f"DROP TABLE IF EXISTS {self.keyspace}.{self.node_table};")
50-
self.session.execute(
51-
f"DROP TABLE IF EXISTS {self.keyspace}.{self.targets_table};"
52-
)
5348

5449

5550
@pytest.fixture(scope="session")

0 commit comments

Comments
 (0)