|
12 | 12 | from raglite._search import vector_search |
13 | 13 |
|
14 | 14 |
|
15 | | -def update_query_adapter( # noqa: PLR0915, C901 |
| 15 | +def update_query_adapter( # noqa: C901, PLR0915 |
16 | 16 | *, |
17 | 17 | max_triplets: int = 4096, |
18 | 18 | max_triplets_per_eval: int = 64, |
@@ -78,59 +78,63 @@ def update_query_adapter( # noqa: PLR0915, C901 |
78 | 78 | evals = session.exec( |
79 | 79 | select(Eval).order_by(Eval.id).limit(max(8, max_triplets // max_triplets_per_eval)) |
80 | 80 | ).all() |
81 | | - if len(evals) * max_triplets_per_eval < len(chunk_embedding.embedding): |
82 | | - error_message = "First run `insert_evals()` to generate sufficient evals." |
| 81 | + # Raise an error if there aren't enough evals to compute the query adapter.
| 82 | + embedding_dim = len(chunk_embedding.embedding) |
| 83 | + required_evals = np.ceil(embedding_dim / max_triplets_per_eval) - len(evals) |
| 84 | + if required_evals > 0: |
| 85 | + error_message = f"First run `insert_evals()` to generate {required_evals} more evals." |
83 | 86 | raise ValueError(error_message) |
84 | 87 | # Loop over the evals to generate (q, p, n) triplets. |
85 | | - Q = np.zeros((0, len(chunk_embedding.embedding))) # noqa: N806 |
| 88 | + Q = np.zeros((0, embedding_dim)) # noqa: N806 |
86 | 89 | P = np.zeros_like(Q) # noqa: N806 |
87 | 90 | N = np.zeros_like(Q) # noqa: N806 |
88 | 91 | for eval_ in tqdm( |
89 | 92 | evals, desc="Extracting triplets from evals", unit="eval", dynamic_ncols=True |
90 | 93 | ): |
91 | 94 | # Embed the question. |
92 | | - question_embedding = embed_strings([eval_.question], config=config)[0] |
| 95 | + question_embedding = embed_strings([eval_.question], config=config) |
93 | 96 | # Retrieve chunks that would be used to answer the question. |
94 | 97 | chunk_ids, _ = vector_search( |
95 | | - question_embedding, num_results=optimize_top_k, config=config_no_query_adapter |
| 98 | + question_embedding[0], num_results=optimize_top_k, config=config_no_query_adapter |
96 | 99 | ) |
97 | 100 | retrieved_chunks = session.exec(select(Chunk).where(col(Chunk.id).in_(chunk_ids))).all() |
98 | | - # Extract (q, p, n) triplets by comparing the retrieved chunks with the eval. |
| 101 | + retrieved_chunks = sorted(retrieved_chunks, key=lambda chunk: chunk_ids.index(chunk.id)) |
| 102 | + # Extract (q, p, n) triplets from the eval. |
99 | 103 | num_triplets = 0 |
100 | 104 | for i, retrieved_chunk in enumerate(retrieved_chunks): |
101 | | - # Select irrelevant chunks. |
| 105 | + # Only loop over irrelevant chunks. |
102 | 106 | if retrieved_chunk.id not in eval_.chunk_ids: |
103 | | - # Look up all positive chunks (each represented by the mean of its multi-vector |
104 | | - # embedding) that are ranked lower than this negative one (represented by the |
105 | | - # embedding in the multi-vector embedding that best matches the query). |
106 | | - p_mean = [ |
107 | | - np.mean(chunk.embedding_matrix, axis=0, keepdims=True) |
108 | | - for chunk in retrieved_chunks[i + 1 :] |
109 | | - if chunk is not None and chunk.id in eval_.chunk_ids |
| 107 | + continue |
| 108 | + irrelevant_chunk = retrieved_chunk |
| 109 | + # Grab the negative chunk embedding of this irrelevant chunk. |
| 110 | + n_top = irrelevant_chunk.embedding_matrix[ |
| 111 | + [np.argmax(irrelevant_chunk.embedding_matrix @ question_embedding.T)] |
| 112 | + ] |
| 113 | + # Grab the positive chunk embeddings that are ranked lower than the negative one. |
| 114 | + p_top = [ |
| 115 | + chunk.embedding_matrix[ |
| 116 | + [np.argmax(chunk.embedding_matrix @ question_embedding.T)] |
110 | 117 | ] |
111 | | - n_top = retrieved_chunk.embedding_matrix[ |
112 | | - np.argmax(retrieved_chunk.embedding_matrix @ question_embedding.T), |
113 | | - np.newaxis, |
114 | | - :, |
115 | | - ] |
116 | | - # Filter out any (p, n, q) triplets for which the mean positive embedding ranks |
117 | | - # higher than the top negative one. |
118 | | - p_mean = [p_e for p_e in p_mean if (n_top - p_e) @ question_embedding.T > 0] |
119 | | - if not p_mean: |
120 | | - continue |
121 | | - # Stack the (p, n, q) triplets. |
122 | | - p = np.vstack(p_mean) |
123 | | - n = np.repeat(n_top, p.shape[0], axis=0) |
124 | | - q = np.repeat(question_embedding, p.shape[0], axis=0) |
125 | | - num_triplets += p.shape[0] |
126 | | - # Append the (query, positive, negative) tuples to the Q, P, N matrices. |
127 | | - Q = np.vstack([Q, q]) # noqa: N806 |
128 | | - P = np.vstack([P, p]) # noqa: N806 |
129 | | - N = np.vstack([N, n]) # noqa: N806 |
130 | | - # Check if we have sufficient triplets for this eval. |
131 | | - if num_triplets >= max_triplets_per_eval: |
132 | | - break |
133 | | - # Check if we have sufficient triplets to compute the query adapter. |
| 118 | + for chunk in retrieved_chunks[i + 1 :] # Chunks that are ranked lower. |
| 119 | + if chunk is not None and chunk.id in eval_.chunk_ids |
| 120 | + ] |
| 121 | + # Ensure that we only have (q, p, n) triplets for which p is ranked lower than n. |
| 122 | + p_top = [p for p in p_top if (n_top - p) @ question_embedding.T > 0] |
| 123 | + if not p_top: |
| 124 | + continue |
| 125 | + # Stack the (q, p, n) triplets. |
| 126 | + p = np.vstack(p_top) |
| 127 | + n = np.repeat(n_top, p.shape[0], axis=0) |
| 128 | + q = np.repeat(question_embedding, p.shape[0], axis=0) |
| 129 | + num_triplets += p.shape[0] |
| 130 | + # Append the (q, p, n) triplets to the Q, P, N matrices. |
| 131 | + Q = np.vstack([Q, q]) # noqa: N806 |
| 132 | + P = np.vstack([P, p]) # noqa: N806 |
| 133 | + N = np.vstack([N, n]) # noqa: N806 |
| 134 | + # Stop if we have enough triplets for this eval. |
| 135 | + if num_triplets >= max_triplets_per_eval: |
| 136 | + break |
| 137 | + # Stop if we have enough triplets to compute the query adapter. |
134 | 138 | if Q.shape[0] > max_triplets: |
135 | 139 | Q, P, N = Q[:max_triplets, :], P[:max_triplets, :], N[:max_triplets, :] # noqa: N806 |
136 | 140 | break |
|
0 commit comments