@@ -27,6 +27,9 @@
 
 CONTENT_ID = "content_id"
 
+CONTENT_COLUMNS = "content_id, kind, text_content, attributes_blob, metadata_s, links_blob"
+
+SELECT_CQL_TEMPLATE = "SELECT {columns} FROM {table_name} {where_clause} {limit_clause};"
 
 @dataclass
 class Node:
@@ -105,8 +108,10 @@ def _deserialize_links(json_blob: Optional[str]) -> Set[Link]:
 
 
 def _row_to_node(row: Any) -> Node:
-    metadata_s = row.get("metadata_s", {})
-    attributes_blob = row.get("attributes_blob")
+    metadata_s = row.metadata_s
+    if metadata_s is None:
+        metadata_s = {}
+    attributes_blob = row.attributes_blob
     attributes_dict = _deserialize_metadata(attributes_blob) if attributes_blob is not None else {}
     links = _deserialize_links(row.links_blob)
     return Node(
@@ -164,7 +169,7 @@ def __init__(
         self._keyspace = keyspace
 
         self._metadata_indexing_policy = self._normalize_metadata_indexing_policy(
-            metadata_indexing
+            metadata_indexing=metadata_indexing,
         )
 
         if setup_mode == SetupMode.SYNC:
@@ -187,15 +192,15 @@ def __init__(
 
         self._query_by_id = session.prepare(
             f"""
-            SELECT content_id, kind, text_content, attributes_blob, links_blob
+            SELECT {CONTENT_COLUMNS}
             FROM {keyspace}.{node_table}
             WHERE content_id = ?
             """  # noqa: S608
         )
 
         self._query_by_embedding = session.prepare(
             f"""
-            SELECT content_id, kind, text_content, attributes_blob, links_blob
+            SELECT {CONTENT_COLUMNS}
             FROM {keyspace}.{node_table}
             ORDER BY text_embedding ANN OF ?
             LIMIT ?
@@ -307,6 +312,25 @@ def _apply_schema(self) -> None:
     def _concurrent_queries(self) -> ConcurrentQueries:
         return ConcurrentQueries(self._session)
 
+    def _parse_metadata(self, metadata: Dict[str, Any], is_query: bool) -> Tuple[str, Dict[str, str]]:
+        attributes_dict = {
+            k: self._coerce_string(v)
+            for k, v in metadata.items()
+            if not _is_metadata_field_indexed(k, self._metadata_indexing_policy)
+        }
+        if is_query and len(attributes_dict) > 0:
+            raise ValueError("Non-indexed metadata fields cannot be used in queries.")
+        attributes_blob = _serialize_metadata(attributes_dict)
+
+        metadata_indexed_dict = {
+            k: v
+            for k, v in metadata.items()
+            if _is_metadata_field_indexed(k, self._metadata_indexing_policy)
+        }
+        metadata_s = {k: self._coerce_string(v) for k, v in metadata_indexed_dict.items()}
+        return (attributes_blob, metadata_s)
+
+
     # TODO: Async (aadd_nodes)
     def add_nodes(
         self,
@@ -342,19 +366,7 @@ def add_nodes(
                 if tag.direction in {"out", "bidir"}:
                     link_to_tags.add((tag.kind, tag.tag))
 
-                attributes_dict = {
-                    k: self._coerce_string(v)
-                    for k, v in metadata.items()
-                    if not _is_metadata_field_indexed(k, self._metadata_indexing_policy)
-                }
-                attributes_blob = _serialize_metadata(attributes_dict)
-
-                metadata_indexed_dict = {
-                    k: v
-                    for k, v in metadata.items()
-                    if _is_metadata_field_indexed(k, self._metadata_indexing_policy)
-                }
-                metadata_s = {k: self._coerce_string(v) for k, v in metadata_indexed_dict.items()}
+                attributes_blob, metadata_s = self._parse_metadata(metadata=metadata, is_query=False)
 
                 links_blob = _serialize_links(links)
                 cq.execute(
@@ -380,7 +392,7 @@ def _nodes_with_ids(
         results: Dict[str, Optional[Node]] = {}
         with self._concurrent_queries() as cq:
 
-            def add_nodes(rows: Iterable[Any]) -> None:
+            def node_callback(rows: Iterable[Any]) -> None:
                 # Should always be exactly one row here. We don't need to check
                 # 1. The query is for a `ID == ?` query on the primary key.
                 # 2. If it doesn't exist, the `get_result` method below will
@@ -393,7 +405,7 @@ def add_nodes(rows: Iterable[Any]) -> None:
                     # Mark this node ID as being fetched.
                     results[node_id] = None
                     cq.execute(
-                        self._query_by_id, parameters=(node_id,), callback=add_nodes
+                        self._query_by_id, parameters=(node_id,), callback=node_callback
                     )
 
         def get_result(node_id: str) -> Node:
@@ -643,6 +655,18 @@ def similarity_search(
         for row in self._session.execute(self._query_by_embedding, (embedding, k)):
             yield _row_to_node(row)
 
+    def metadata_search(self, metadata: Dict[str, Any] = {}, n: Optional[int] = 5) -> Iterable[Node]:
+        query, params = self._get_metadata_search_cql(metadata=metadata, n=n)
+
+        prepared_query = self._session.prepare(query)
+
+        for row in self._session.execute(prepared_query, params):
+            yield _row_to_node(row)
+
+    def get_node(self, id: str) -> Node:
+        return self._nodes_with_ids(ids=[id])[0]
+
+
     def _get_outgoing_tags(
         self,
         source_ids: Iterable[str],
@@ -755,28 +779,6 @@ def _normalize_metadata_indexing_policy(
             )
         return (mode, fields)
 
-    def _split_metadata_fields(self, md_dict: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Split the *indexed* part of the metadata in separate parts,
-        one per Cassandra column.
-
-        Currently: everything gets cast to a string and goes to a single table
-        column. This means:
-        - strings are fine
-        - floats and integers v: they are cast to str(v)
-        - booleans: 'true'/'false' (JSON style)
-        - None => 'null' (JSON style)
-        - anything else v => str(v), no questions asked
-
-        Caveat: one gets strings back when reading metadata
-        """
-
-        # TODO: more care about types here
-        stringy_part = {k: self._coerce_string(v) for k, v in md_dict.items()}
-        return {
-            "metadata_s": stringy_part,
-        }
-
     @staticmethod
     def _coerce_string(value: Any) -> str:
         if isinstance(value, str):
@@ -794,4 +796,39 @@ def _coerce_string(value: Any) -> str:
             return json.dumps(value)
         else:
             # when all else fails ...
-            return str(value)
+            return str(value)
+
+    def _extract_where_clause_blocks(
+        self, metadata: Dict[str, Any]
+    ) -> Tuple[str, List[Any]]:
+
+        attributes_blob, metadata_s = self._parse_metadata(metadata=metadata, is_query=True)
+
+        if len(metadata_s) == 0:
+            return "", []
+
+        wc_blocks: List[str] = []
+        vals_list: List[Any] = []
+
+        for k, v in sorted(metadata_s.items()):
+            wc_blocks.append(f"metadata_s['{k}'] = ?")
+            vals_list.append(v)
+
+        where_clause = "WHERE " + " AND ".join(wc_blocks)
+        return where_clause, vals_list
+
+
+    def _get_metadata_search_cql(self, n: int, metadata: Dict[str, Any]) -> Tuple[str, Tuple[Any, ...]]:
+        where_clause, get_cql_vals = self._extract_where_clause_blocks(metadata=metadata)
+        limit_clause = "LIMIT ?"
+        limit_cql_vals = [n]
+        select_vals = tuple(list(get_cql_vals) + limit_cql_vals)
+        #
+        select_cql = SELECT_CQL_TEMPLATE.format(
+            columns=CONTENT_COLUMNS,
+            table_name=f"{self._keyspace}.{self._node_table}",
+            where_clause=where_clause,
+            limit_clause=limit_clause,
+
+        )
+        return select_cql, select_vals
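
Usage sketch (illustrative, not part of the diff): the two public methods this change adds, metadata_search and get_node, can be exercised roughly as below. The store class name GraphStore, its constructor arguments, and the keyspace/table values are assumptions made for the example; only the method calls, the ValueError on non-indexed fields, and the shape of the generated CQL follow from the code above.

    # Hypothetical usage; "GraphStore" and its constructor arguments are assumed,
    # not taken from this diff.
    from cassandra.cluster import Cluster

    session = Cluster(["127.0.0.1"]).connect()
    store = GraphStore(session=session, keyspace="my_keyspace", node_table="my_nodes")

    # metadata_search accepts only *indexed* metadata fields; _parse_metadata(is_query=True)
    # raises ValueError for non-indexed ones. Internally it prepares a statement like:
    #   SELECT content_id, kind, text_content, attributes_blob, metadata_s, links_blob
    #   FROM my_keyspace.my_nodes WHERE metadata_s['topic'] = ? LIMIT ?;
    for node in store.metadata_search(metadata={"topic": "cassandra"}, n=10):
        print(node)

    # get_node fetches a single node by its content_id via _nodes_with_ids.
    node = store.get_node(id="doc-0001")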