fix: prioritize named speakers in KG retrieval

hammertoe · hammertoe · commit a3a007752942 · 2026-03-14T17:55:51.000-04:00
diff --git a/lib/kg_agent_loop.py b/lib/kg_agent_loop.py
@@ -11,6 +11,7 @@
 from google import genai
 from google.genai import types
 
+from lib.id_generators import normalize_label
 from lib.kg_hybrid_graph_rag import kg_hybrid_graph_rag_with_bills as kg_hybrid_graph_rag
 from lib.utils.config import config
 
@@ -83,6 +84,72 @@ def _format_tool_result_summary(result: dict[str, Any]) -> str:
     return ", ".join(out)
 
 
+def _augment_query_with_speakers(
+    *, postgres: Any, query: str, user_message: str, max_speakers: int = 2
+) -> str:
+    """Append speakers named in the user's message to a retrieval query.
+
+    Looks up ``speakers`` rows whose ``normalized_name`` occurs inside the
+    normalized ``user_message`` (longest names first) and appends each
+    speaker's display name to ``query`` unless the query already contains it.
+
+    Args:
+        postgres: Client exposing ``execute_query(sql, params)``.
+        query: Tool-call query to augment.
+        user_message: Raw user message to scan for speaker names.
+        max_speakers: Maximum number of speaker rows to fetch.
+
+    Returns:
+        The stripped query, followed by any missing speaker names; the
+        stripped query unchanged when nothing applies.
+    """
+    base = (query or "").strip()
+    limit = int(max_speakers)
+    if not base or limit <= 0:
+        return base
+
+    message_norm = normalize_label(user_message)
+    if not message_norm:
+        return base
+
+    # strpos() instead of LIKE '%' || name || '%': with bound parameters a
+    # bare '%' is read as a (malformed) placeholder by pyformat-style drivers,
+    # and LIKE would treat '%' / '_' inside a stored name as wildcards. The
+    # empty-name guard stops a blank normalized_name from matching everything.
+    rows = postgres.execute_query(
+        """
+        SELECT full_name, normalized_name
+        FROM speakers
+        WHERE normalized_name <> ''
+          AND strpos(%s, normalized_name) > 0
+        ORDER BY length(normalized_name) DESC
+        LIMIT %s
+        """,
+        (message_norm, limit),
+    )
+    if not rows:
+        return base
+
+    query_norm = normalize_label(base)
+    additions: list[str] = []
+    seen: set[str] = set()
+    for full_name, normalized_name in rows:
+        candidate = (full_name or normalized_name or "").strip()
+        if not candidate:
+            continue
+        candidate_norm = normalize_label(candidate)
+        # Skip names the query already carries and duplicate speaker rows.
+        if candidate_norm in query_norm or candidate_norm in seen:
+            continue
+        seen.add(candidate_norm)
+        additions.append(candidate)
+
+    if not additions:
+        return base
+
+    return f"{base} {' '.join(additions)}".strip()
+
+
 def _truncate_text(text: str, max_len: int = 300) -> str:
     """Truncate text to max_len with ellipsis."""
     if not text or len(text) <= max_len:
@@ -794,10 +836,16 @@ async def run(self, *, user_message: str, history: list[dict[str, str]]) -> dict
                         self.progress_callback(
                             "searching", "Finding relevant debates (graph + citations)..."
                         )
+                    base_query = str(fc.args.get("query", ""))
+                    resolved_query = _augment_query_with_speakers(
+                        postgres=self.postgres,
+                        query=base_query,
+                        user_message=user_message,
+                    )
                     tool_result = kg_hybrid_graph_rag(
                         postgres=self.postgres,
                         embedding_client=self.embedding_client,
-                        query=str(fc.args.get("query", "")),
+                        query=resolved_query,
                         hops=int(fc.args.get("hops", 1)),
                         seed_k=int(fc.args.get("seed_k", 12)),
                         max_edges=int(fc.args.get("max_edges", 90)),
diff --git a/lib/kg_hybrid_graph_rag.py b/lib/kg_hybrid_graph_rag.py
@@ -676,6 +676,24 @@ def _retrieve_seed_nodes(
         except Exception:
             pass
 
+    query_lower = query.lower()
+    query_terms = _query_terms(query_lower)
+    if len(query_terms) >= 2:
+
+        def _candidate_in_query(candidate: dict[str, Any]) -> bool:
+            label = str(candidate.get("label") or "").lower().strip()
+            if label and label in query_lower:
+                return True
+            for alias in candidate.get("aliases") or []:
+                alias_str = str(alias or "").lower().strip()
+                if alias_str and alias_str in query_lower:
+                    return True
+            return False
+
+        matched = [c for c in fused_candidates if _candidate_in_query(c)]
+        if matched:
+            return matched[:seed_k]
+
     return fused_candidates[:seed_k]
 
 
@@ -693,8 +711,9 @@ def _retrieve_edges_hops_1(
         rows = postgres.execute_query(
             f"""
             SELECT id, source_id, predicate, predicate_raw, target_id,
-                   youtube_video_id, earliest_timestamp_str, earliest_seconds,
-                   utterance_ids, evidence, speaker_ids, confidence, edge_rank_score
+                   source_kind, source_ref_id, youtube_video_id,
+                   earliest_timestamp_str, earliest_seconds,
+                   evidence_ids, utterance_ids, evidence, speaker_ids, confidence, edge_rank_score
             FROM kg_edges
             WHERE source_id IN ({placeholders}) OR target_id IN ({placeholders})
             ORDER BY edge_rank_score DESC NULLS LAST, confidence DESC NULLS LAST, earliest_seconds ASC
@@ -706,8 +725,9 @@ def _retrieve_edges_hops_1(
         rows = postgres.execute_query(
             f"""
             SELECT id, source_id, predicate, predicate_raw, target_id,
-                   youtube_video_id, earliest_timestamp_str, earliest_seconds,
-                   utterance_ids, evidence, speaker_ids, confidence
+                   source_kind, source_ref_id, youtube_video_id,
+                   earliest_timestamp_str, earliest_seconds,
+                   evidence_ids, utterance_ids, evidence, speaker_ids, confidence
             FROM kg_edges
             WHERE source_id IN ({placeholders}) OR target_id IN ({placeholders})
             ORDER BY confidence DESC NULLS LAST, earliest_seconds ASC
@@ -717,23 +737,50 @@ def _retrieve_edges_hops_1(
         )
     out: list[dict[str, Any]] = []
     for row in rows:
+        if len(row) >= 15 and str(row[5]) in {"transcript", "bill"}:  # 15 cols, or 16 with edge_rank_score
+            source_kind = str(row[5])
+            source_ref_id = str(row[6] or "")
+            youtube_video_id = row[7]
+            earliest_timestamp_str = row[8]
+            earliest_seconds = row[9]
+            evidence_ids = row[10] or []
+            legacy_utterance_ids = row[11] or []
+            evidence = row[12]
+            speaker_ids = row[13] or []
+            confidence = row[14]
+            edge_rank_score = row[15] if len(row) > 15 else None  # the second SELECT omits this column
+        else:
+            # Legacy row shape before provenance cutover.
+            source_kind = "transcript"
+            source_ref_id = str(row[5] or "")
+            youtube_video_id = row[5]
+            earliest_timestamp_str = row[6]
+            earliest_seconds = row[7]
+            legacy_utterance_ids = row[8] or []
+            evidence_ids = legacy_utterance_ids
+            evidence = row[9]
+            speaker_ids = row[10] or []
+            confidence = row[11] if len(row) > 11 else None
+            edge_rank_score = row[12] if len(row) > 12 else None
+
         out.append(
             {
                 "id": row[0],
                 "source_id": row[1],
                 "predicate": row[2],
                 "predicate_raw": row[3],
                 "target_id": row[4],
-                "youtube_video_id": row[5],
-                "earliest_timestamp_str": row[6],
-                "earliest_seconds": int(row[7] or 0),
-                "utterance_ids": row[8] or [],
-                "evidence": row[9],
-                "speaker_ids": row[10] or [],
-                "confidence": float(row[11]) if row[11] is not None else None,
-                "edge_rank_score": float(row[12])
-                if len(row) > 12 and row[12] is not None
-                else None,
+                "source_kind": str(source_kind),
+                "source_ref_id": str(source_ref_id or ""),
+                "youtube_video_id": youtube_video_id,
+                "earliest_timestamp_str": earliest_timestamp_str,
+                "earliest_seconds": int(earliest_seconds or 0),
+                "evidence_ids": evidence_ids,
+                "utterance_ids": legacy_utterance_ids or [],
+                "evidence": evidence,
+                "speaker_ids": speaker_ids or [],
+                "confidence": float(confidence) if confidence is not None else None,
+                "edge_rank_score": float(edge_rank_score) if edge_rank_score is not None else None,
             }
         )
     return out
@@ -758,16 +805,88 @@ def _hydrate_nodes(
     return [{"id": r[0], "label": r[1], "type": r[2]} for r in rows]
 
 
+def _hydrate_bill_citations_from_ids(
+    *,
+    postgres: Any,
+    bill_citation_ids: list[str],
+) -> list[dict[str, Any]]:
+    """Resolve ``bill:<bill_id>:<chunk_index>`` ids into bill citation dicts.
+
+    Ids are processed in first-seen order; duplicates, ids that do not have
+    exactly three ``:``-separated parts with a ``bill`` prefix, ids whose chunk
+    index is not an integer, and ids with no matching ``bill_excerpts`` row are
+    skipped.
+
+    Args:
+        postgres: Client exposing ``execute_query(sql, params)``.
+        bill_citation_ids: Evidence ids to hydrate.
+
+    Returns:
+        One dict per resolved id, with ``matched_terms`` empty and ``score``
+        fixed at ``1.0``.
+    """
+    out: list[dict[str, Any]] = []
+    # dict.fromkeys de-duplicates while keeping first-seen order.
+    for cid in dict.fromkeys(bill_citation_ids or []):
+        parts = cid.split(":")
+        if len(parts) != 3 or parts[0] != "bill":
+            continue
+        bill_id = parts[1]
+        try:
+            chunk_index = int(parts[2])
+        except ValueError:
+            # int() on a str only fails with ValueError; anything else is a real bug.
+            continue
+
+        # NOTE(review): one query per id; batch if edge evidence lists grow large.
+        rows = postgres.execute_query(
+            """
+            SELECT b.id, b.bill_number, b.title, be.text, be.source_url, be.chunk_index, be.page_number
+            FROM bill_excerpts be
+            JOIN bills b ON b.id = be.bill_id
+            WHERE be.bill_id = %s AND be.chunk_index = %s
+            LIMIT 1
+            """,
+            (bill_id, chunk_index),
+        )
+        if not rows:
+            continue
+
+        row = rows[0]
+        page_number = int(row[6]) if row[6] is not None else None
+        source_url = _url_with_page_fragment(str(row[4] or ""), page_number)
+        out.append(
+            {
+                "citation_id": cid,
+                "bill_id": str(row[0] or ""),
+                "bill_number": row[1] or "",
+                "bill_title": row[2] or "",
+                "excerpt": row[3] or "",
+                "source_url": source_url,
+                "chunk_index": int(row[5] or 0),
+                "page_number": page_number,
+                "matched_terms": [],
+                "score": 1.0,
+            }
+        )
+    return out
+
+
 def _hydrate_citations(
     *,
     postgres: Any,
-    utterance_ids: list[str],
+    evidence_ids: list[str],
     max_citations: int,
 ) -> list[dict[str, Any]]:
-    if not utterance_ids:
+    if not evidence_ids:
         return []
-    utterance_ids = utterance_ids[:max_citations]
-    placeholders = ",".join(["%s"] * len(utterance_ids))
+
+    transcript_ids = [eid for eid in evidence_ids if not str(eid).startswith("bill:")]
+    transcript_ids = transcript_ids[:max_citations]
+    if not transcript_ids:
+        return []
+
+    placeholders = ",".join(["%s"] * len(transcript_ids))
     rows = postgres.execute_query(
         f"""
         SELECT s.id, s.text, s.seconds_since_start, s.timestamp_str,
@@ -800,7 +909,7 @@ def _hydrate_citations(
         LEFT JOIN speakers sp ON s.speaker_id = sp.id
         WHERE s.id IN ({placeholders})
         """,
-        tuple(utterance_ids),
+        tuple(transcript_ids),
     )
 
     order_paper_idx = _load_order_paper_speaker_index(postgres=postgres)
@@ -1009,11 +1118,15 @@ def kg_hybrid_graph_rag(
         e["target_label"] = target.get("label")
         e["target_type"] = target.get("type")
 
-    utterance_ids: list[str] = []
+    evidence_ids: list[str] = []
+    bill_evidence_ids: list[str] = []
     for e in edges:
-        for uid in e.get("utterance_ids", []) or []:
-            if uid not in utterance_ids:
-                utterance_ids.append(uid)
+        edge_evidence_ids = e.get("evidence_ids") or e.get("utterance_ids") or []
+        for evidence_id in edge_evidence_ids:
+            if evidence_id not in evidence_ids:
+                evidence_ids.append(evidence_id)
+            if str(evidence_id).startswith("bill:") and evidence_id not in bill_evidence_ids:
+                bill_evidence_ids.append(evidence_id)
 
     edges_filtered: int = 0
     edge_rank_filter_skipped_no_scores = False
@@ -1035,9 +1148,13 @@ def kg_hybrid_graph_rag(
 
     citations = _hydrate_citations(
         postgres=postgres,
-        utterance_ids=utterance_ids,
+        evidence_ids=evidence_ids,
         max_citations=max_citations,
     )
+    bill_citations_from_edges = _hydrate_bill_citations_from_ids(
+        postgres=postgres,
+        bill_citation_ids=bill_evidence_ids,
+    )
 
     debug_info: dict[str, Any] = {
         "seed_count": len(seeds),
@@ -1060,6 +1177,7 @@ def kg_hybrid_graph_rag(
         "nodes": nodes,
         "edges": edges,
         "citations": citations,
+        "bill_citations_from_edges": bill_citations_from_edges,
         "debug": debug_info,
     }
 
@@ -1231,6 +1349,16 @@ def kg_hybrid_graph_rag_with_bills(
         query_embedding=query_embedding,
     )
 
+    for edge_citation in result.get("bill_citations_from_edges", []):
+        cid = str(edge_citation.get("citation_id") or "")
+        if not cid:
+            continue
+        exists = any(str(c.get("citation_id") or "") == cid for c in bill_citations)
+        if not exists:
+            bill_citations.append(edge_citation)
+
+    bill_citations = bill_citations[:max_bill_citations]
+
     result["bill_citations"] = bill_citations
     result["debug"]["bill_citation_count"] = len(bill_citations)
 
diff --git a/tests/test_kg_agent_loop_unit.py b/tests/test_kg_agent_loop_unit.py
@@ -63,6 +63,11 @@ def execute_update(self, _sql: str, _params: Any = None):
         return None
 
 
+class _FakePostgresSpeakerMatch(_FakePostgres):  # fake DB: every query returns one speaker row
+    def execute_query(self, _sql: str, _params: Any = None):
+        return [("Tamaisha Eytle Harvey", "tamaisha eytle harvey")]  # (full_name, normalized_name)
+
+
 class _FakeEmbedding:
     def generate_query_embedding(self, _query: str) -> list[float]:
         return [0.0] * 768
@@ -86,6 +91,22 @@ def test_system_prompt_includes_current_date_and_recency_guidance() -> None:
     assert "When the user asks for recent" in prompt
 
 
+def test_augment_query_with_speakers_appends_name() -> None:
+    """A speaker named only in the user message ends up in the augmented query."""
+    from lib.kg_agent_loop import _augment_query_with_speakers
+
+    tool_query = "Future Barbados health tech"
+    message = "What did Tamaisha Eytle Harvey say about Future Barbados?"
+
+    result = _augment_query_with_speakers(
+        user_message=message,
+        query=tool_query,
+        postgres=_FakePostgresSpeakerMatch(),
+    )
+
+    assert "Tamaisha Eytle Harvey" in result
+
+
 def test_agent_loop_runs_tool_then_answers():
     from lib.kg_agent_loop import KGAgentLoop
 
diff --git a/tests/test_kg_hybrid_graph_rag_unit.py b/tests/test_kg_hybrid_graph_rag_unit.py