
Commit 9efa3a3

cbornet and bjchambers authored

Add type checking of test-utils and knowledge-store (#577)

* Fix some mypy issues in test-utils and knowledge-store
* Changes following review
* Run mypy in CI
* suggestion
* Fix lint
* Fix issue with lambda callback typing
* Check type of test-utils in CI

Co-authored-by: Ben Chambers <[email protected]>

1 parent 22f9aeb · commit 9efa3a3

File tree

18 files changed: +211 additions, -110 deletions


.github/changes-filter.yaml
Lines changed: 1 addition & 0 deletions

```diff
@@ -17,6 +17,7 @@ notebooks:
   - "./.github/workflows/_run_e2e_tests.yml"
 integration_tests:
   - "libs/colbert/**"
+  - "libs/knowledge-store/**"
   - "libs/llamaindex/**"
   - "libs/langchain/**"
   - "./.github/actions/**"
```
.github/workflows/ci-unit-tests.yml
Lines changed: 5 additions & 0 deletions

```diff
@@ -89,6 +89,11 @@ jobs:
       - name: "Type check (knowledge-graph)"
         run: tox -e type -c libs/knowledge-graph && rm -rf libs/knowledge-graph/.tox
 
+      - name: "Type check (knowledge-store)"
+        run: tox -e type -c libs/knowledge-store && rm -rf libs/knowledge-store/.tox
+
+      - name: "Type check (test-utils)"
+        run: tox -e type -c libs/tests-utils && rm -rf libs/tests-utils/.tox
 
   unit-tests:
     name: Unit Tests (Python ${{ matrix.python-version }})
```
libs/knowledge-store/pyproject.toml
Lines changed: 14 additions & 4 deletions

```diff
@@ -38,11 +38,21 @@ requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.mypy]
-strict = true
-warn_unreachable = true
-pretty = true
-show_column_numbers = true
+disallow_any_generics = true
+disallow_incomplete_defs = true
+disallow_untyped_calls = true
+disallow_untyped_decorators = true
+disallow_untyped_defs = true
+follow_imports = "normal"
+ignore_missing_imports = true
+no_implicit_reexport = true
+show_error_codes = true
 show_error_context = true
+strict_equality = true
+strict_optional = true
+warn_redundant_casts = true
+warn_return_any = true
+warn_unused_ignores = true
 
 [tool.pytest.ini_options]
 testpaths = ["tests"]
```

libs/knowledge-store/ragstack_knowledge_store/_mmr_helper.py
Lines changed: 22 additions & 19 deletions

```diff
@@ -2,15 +2,16 @@
 from typing import Dict, Iterable, List, Optional
 
 import numpy as np
+from numpy.typing import NDArray
 
 from ragstack_knowledge_store.math import cosine_similarity
 
 
-def _emb_to_ndarray(embedding: List[float]) -> np.ndarray:
-    embedding = np.array(embedding, dtype=np.float32)
-    if embedding.ndim == 1:
-        embedding = np.expand_dims(embedding, axis=0)
-    return embedding
+def _emb_to_ndarray(embedding: List[float]) -> NDArray[np.float32]:
+    emb_array = np.array(embedding, dtype=np.float32)
+    if emb_array.ndim == 1:
+        emb_array = np.expand_dims(emb_array, axis=0)
+    return emb_array
 
 
 NEG_INF = float("-inf")
@@ -23,10 +24,10 @@ class _Candidate:
     weighted_redundancy: float
     score: float = dataclasses.field(init=False)
 
-    def __post_init__(self):
+    def __post_init__(self) -> None:
         self.score = self.weighted_similarity - self.weighted_redundancy
 
-    def update_redundancy(self, new_weighted_redundancy: float):
+    def update_redundancy(self, new_weighted_redundancy: float) -> None:
         if new_weighted_redundancy > self.weighted_redundancy:
             self.weighted_redundancy = new_weighted_redundancy
             self.score = self.weighted_similarity - self.weighted_redundancy
@@ -47,7 +48,7 @@ class MmrHelper:
     dimensions: int
     """Dimensions of the embedding."""
 
-    query_embedding: np.ndarray
+    query_embedding: NDArray[np.float32]
     """Embedding of the query as a (1,dim) ndarray."""
 
     lambda_mult: float
@@ -64,7 +65,7 @@ class MmrHelper:
 
     selected_ids: List[str]
     """List of selected IDs (in selection order)."""
-    selected_embeddings: np.ndarray
+    selected_embeddings: NDArray[np.float32]
     """(N, dim) ndarray with a row for each selected node."""
 
     candidate_id_to_index: Dict[str, int]
@@ -74,7 +75,7 @@ class MmrHelper:
 
     Same order as rows in `candidate_embeddings`.
     """
-    candidate_embeddings: np.ndarray
+    candidate_embeddings: NDArray[np.float32]
     """(N, dim) ndarray with a row for each candidate."""
 
     best_score: float
@@ -113,12 +114,12 @@ def candidate_ids(self) -> Iterable[str]:
         """Return the IDs of the candidates."""
         return self.candidate_id_to_index.keys()
 
-    def _already_selected_embeddings(self) -> np.ndarray:
+    def _already_selected_embeddings(self) -> NDArray[np.float32]:
        """Return the selected embeddings sliced to the already assigned values."""
         selected = len(self.selected_ids)
         return np.vsplit(self.selected_embeddings, [selected])[0]
 
-    def _pop_candidate(self, candidate_id: str) -> np.ndarray:
+    def _pop_candidate(self, candidate_id: str) -> NDArray[np.float32]:
         """Pop the candidate with the given ID.
 
         Returns:
@@ -127,7 +128,7 @@ def _pop_candidate(self, candidate_id: str) -> np.ndarray:
         # Get the embedding for the id.
         index = self.candidate_id_to_index.pop(candidate_id)
         assert self.candidates[index].id == candidate_id
-        embedding = self.candidate_embeddings[index].copy()
+        embedding: NDArray[np.float32] = self.candidate_embeddings[index].copy()
 
         # Swap that index with the last index in the candidates and
         # candidate_embeddings.
@@ -186,19 +187,21 @@ def pop_best(self) -> Optional[str]:
 
         return selected_id
 
-    def add_candidates(self, candidates: Dict[str, List[float]]):
+    def add_candidates(self, candidates: Dict[str, List[float]]) -> None:
         """Add candidates to the consideration set."""
         # Determine the keys to actually include.
         # These are the candidates that aren't already selected
        # or under consideration.
-        include_ids = set(candidates.keys())
-        include_ids.difference_update(self.selected_ids)
-        include_ids.difference_update(self.candidate_id_to_index.keys())
-        include_ids = list(include_ids)
+        include_ids_set = set(candidates.keys())
+        include_ids_set.difference_update(self.selected_ids)
+        include_ids_set.difference_update(self.candidate_id_to_index.keys())
+        include_ids = list(include_ids_set)
 
         # Now, build up a matrix of the remaining candidate embeddings.
         # And add them to the
-        new_embeddings = np.ndarray((len(include_ids), self.dimensions))
+        new_embeddings: NDArray[np.float32] = np.ndarray(
+            (len(include_ids), self.dimensions)
+        )
         offset = self.candidate_embeddings.shape[0]
         for index, candidate_id in enumerate(include_ids):
             if candidate_id in include_ids:
```
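The typing changes above swap bare `np.ndarray` annotations for `numpy.typing.NDArray[np.float32]` and stop re-binding the `List[float]` parameter to an array, which mypy flags as an incompatible assignment. A standalone sketch of the same pattern, using an illustrative function name rather than the module's own:

```python
from typing import List

import numpy as np
from numpy.typing import NDArray


def to_row_matrix(embedding: List[float]) -> NDArray[np.float32]:
    """Return the embedding as a (1, dim) float32 array."""
    # Bind a new name instead of re-assigning the List[float] parameter.
    arr = np.array(embedding, dtype=np.float32)
    if arr.ndim == 1:
        arr = np.expand_dims(arr, axis=0)
    return arr


print(to_row_matrix([0.1, 0.2, 0.3]).shape)  # (1, 3)
```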

libs/knowledge-store/ragstack_knowledge_store/_utils.py
Lines changed: 7 additions & 5 deletions

```diff
@@ -1,18 +1,20 @@
+from __future__ import annotations
+
 import sys
 
 try:
     # Try importing the function from itertools (Python 3.12+)
-    from itertools import batched
+    from itertools import batched  # type: ignore[attr-defined]
 except ImportError:
     from itertools import islice
-    from typing import Iterable, Iterator, TypeVar
+    from typing import Any, Iterable, Iterator, TypeVar
 
     # Fallback implementation for older Python versions
 
     T = TypeVar("T")
 
     # This is equivalent to `itertools.batched`, but that is only available in 3.12
-    def batched(iterable: Iterable[T], n: int) -> Iterator[Iterator[T]]:
+    def batched(iterable: Iterable[T], n: int) -> Iterator[tuple[T, ...]]:
         if n < 1:
             raise ValueError("n must be at least one")
         it = iter(iterable)
@@ -24,12 +26,12 @@ def batched(iterable: Iterable[T], n: int) -> Iterator[Iterator[T]]:
 
 if sys.version_info >= (3, 10):
 
-    def strict_zip(*iterables):
+    def strict_zip(*iterables: Iterable[Any]) -> zip[tuple[Any, ...]]:
         return zip(*iterables, strict=True)
 
 else:
 
-    def strict_zip(*iterables):
+    def strict_zip(*iterables: Iterable[T]) -> zip[tuple[T]]:
         # Custom implementation for Python versions older than 3.10
         if not iterables:
             return
```
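The fallback `batched` now declares that it yields tuples, matching the real `itertools.batched` added in Python 3.12, and the new `from __future__ import annotations` import lets annotations such as `tuple[T, ...]` and `zip[...]` be written on older interpreters. A self-contained sketch of that fallback shape, with an illustrative name and assuming Python 3.8+:

```python
from __future__ import annotations

from itertools import islice
from typing import Iterable, Iterator, TypeVar

T = TypeVar("T")


def batched_fallback(iterable: Iterable[T], n: int) -> Iterator[tuple[T, ...]]:
    """Yield successive n-sized tuples, mirroring itertools.batched from Python 3.12."""
    if n < 1:
        raise ValueError("n must be at least one")
    it = iter(iterable)
    # Keep slicing n items off the iterator until it is exhausted.
    while batch := tuple(islice(it, n)):
        yield batch


print(list(batched_fallback("ABCDEFG", 3)))  # [('A', 'B', 'C'), ('D', 'E', 'F'), ('G',)]
```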

libs/knowledge-store/ragstack_knowledge_store/concurrency.py
Lines changed: 23 additions & 11 deletions

```diff
@@ -2,31 +2,43 @@
 import logging
 import threading
 from types import TracebackType
-from typing import Any, Callable, NamedTuple, Optional, Sequence, Tuple, Type
+from typing import (
+    Any,
+    Callable,
+    Literal,
+    NamedTuple,
+    Optional,
+    Protocol,
+    Sequence,
+    Tuple,
+    Type,
+)
 
 from cassandra.cluster import ResponseFuture, Session
 from cassandra.query import PreparedStatement
 
 logger = logging.getLogger(__name__)
 
 
-class ConcurrentQueries(contextlib.AbstractContextManager):
+class _Callback(Protocol):
+    def __call__(self, rows: Sequence[Any], /) -> None: ...
+
+
+class ConcurrentQueries(contextlib.AbstractContextManager["ConcurrentQueries"]):
     """Context manager for concurrent queries."""
 
     def __init__(self, session: Session) -> None:
         self._session = session
         self._completion = threading.Condition()
-
         self._pending = 0
-
-        self._error = None
+        self._error: Optional[BaseException] = None
 
     def _handle_result(
         self,
         result: Sequence[NamedTuple],
         future: ResponseFuture,
         callback: Optional[Callable[[Sequence[NamedTuple]], Any]],
-    ):
+    ) -> None:
         if callback is not None:
             callback(result)
 
@@ -38,7 +50,7 @@ def _handle_result(
         if self._pending == 0:
             self._completion.notify()
 
-    def _handle_error(self, error, future: ResponseFuture):
+    def _handle_error(self, error: BaseException, future: ResponseFuture) -> None:
         logger.error(
             "Error executing query: %s",
             future.query,
@@ -51,9 +63,9 @@ def _handle_error(self, error, future: ResponseFuture):
     def execute(
         self,
         query: PreparedStatement,
-        parameters: Optional[Tuple] = None,
-        callback: Optional[Callable[[Sequence[NamedTuple]], Any]] = None,
-    ):
+        parameters: Optional[Tuple[Any, ...]] = None,
+        callback: Optional[_Callback] = None,
+    ) -> None:
         """Execute a query concurrently.
 
         Because this is done concurrently, it expects a callback if you need
@@ -93,7 +105,7 @@ def __exit__(
         _exc_type: Optional[Type[BaseException]],
         _exc_inst: Optional[BaseException],
         _exc_traceback: Optional[TracebackType],
-    ) -> bool:
+    ) -> Literal[False]:
         with self._completion:
             while self._error is None and self._pending > 0:
                 self._completion.wait()
```