Skip to content

Commit a414960

Browse files
committed
fix: stabilize transcript identity and preserve ingestion context
1 parent f6ec9c2 commit a414960

File tree

9 files changed

+237
-87
lines changed

lib/chat_agent_v2.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ def _looks_like_utterance_id(citation_id: str) -> bool:
114114
return False
115115
if normalized.startswith("utt_"):
116116
normalized = normalized[4:]
117-
return re.match(r"^[A-Za-z0-9_-]+:\d+$", normalized) is not None
117+
return re.match(r"^[A-Za-z0-9_-]+:\d+(?:_\d+)?$", normalized) is not None
118118

119119

120120
def _merge_cite_utterance_ids(
@@ -136,7 +136,7 @@ def _merge_cite_utterance_ids(
136136

137137
suffix_counts: dict[str, int] = {}
138138
for known_id in known_ids:
139-
match = re.search(r":(\d+)$", known_id)
139+
match = re.search(r":(\d+)(?:_\d+)?$", known_id)
140140
if not match:
141141
continue
142142
seconds = match.group(1)
@@ -166,7 +166,7 @@ def _merge_cite_utterance_ids(
166166
(
167167
known_id
168168
for known_id in known_ids
169-
if known_id.endswith(f":{seconds}")
169+
if re.search(rf":{re.escape(seconds)}(?:_\d+)?$", known_id)
170170
),
171171
None,
172172
)

lib/processors/paragraph_splitter.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,13 +83,21 @@ def split_paragraph_into_sentences(
8383
youtube_video_id: str,
8484
video_date: str | None = None,
8585
video_title: str | None = None,
86+
existing_sentence_ids: set[str] | None = None,
8687
) -> list[dict[str, Any]]:
8788
"""Split a paragraph into individual sentences with IDs."""
8889
sentences = []
90+
seen_ids = existing_sentence_ids if existing_sentence_ids is not None else set()
8991

9092
for i, entry in enumerate(paragraph.sentences):
9193
start_seconds = parse_timestamp_to_seconds(entry["start"])
92-
sentence_id = f"{youtube_video_id}:{start_seconds}"
94+
base_sentence_id = f"{youtube_video_id}:{start_seconds}"
95+
sentence_id = base_sentence_id
96+
suffix = 2
97+
while sentence_id in seen_ids:
98+
sentence_id = f"{base_sentence_id}_{suffix}"
99+
suffix += 1
100+
seen_ids.add(sentence_id)
93101

94102
sentences.append(
95103
{

lib/processors/three_tier_transcription.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def process_transcript_to_three_tier(
3737
"speakers": self._extract_speakers(transcripts),
3838
"legislation": self._extract_legislation(transcripts),
3939
}
40+
used_sentence_ids: set[str] = set()
4041

4142
for paragraph in paragraphs:
4243
para_dict = paragraph.to_dict()
@@ -48,16 +49,18 @@ def process_transcript_to_three_tier(
4849
three_tier_data["paragraphs"].append(para_dict)
4950

5051
sentences = split_paragraph_into_sentences(
51-
paragraph, youtube_video_id, video_date, video_title
52+
paragraph,
53+
youtube_video_id,
54+
video_date,
55+
video_title,
56+
existing_sentence_ids=used_sentence_ids,
5257
)
5358

5459
three_tier_data["sentences"].extend(sentences)
5560

5661
return three_tier_data
5762

58-
def _extract_speakers(
59-
self, transcripts: list[dict[str, Any]]
60-
) -> list[dict[str, Any]]:
63+
def _extract_speakers(self, transcripts: list[dict[str, Any]]) -> list[dict[str, Any]]:
6164
"""Extract unique speakers from transcripts."""
6265
speakers_map: dict[str, dict[str, Any]] = {}
6366

@@ -91,9 +94,7 @@ def speaker_id_to_base_name(speaker_id: str) -> str:
9194
speakers_list = list(speakers_map.values())
9295
return sorted(speakers_list, key=lambda x: x.get("first_appearance", ""))
9396

94-
def _extract_legislation(
95-
self, transcripts: list[dict[str, Any]]
96-
) -> list[dict[str, Any]]:
97+
def _extract_legislation(self, transcripts: list[dict[str, Any]]) -> list[dict[str, Any]]:
9798
"""Extract legislation mentions from transcripts."""
9899
leg_map: dict[str, dict[str, Any]] = {}
99100

@@ -122,9 +123,7 @@ def _extract_legislation(
122123
}
123124
leg_map[name]["mentions"] += 1
124125

125-
legislation_list = sorted(
126-
leg_map.values(), key=lambda x: x["mentions"], reverse=True
127-
)
126+
legislation_list = sorted(leg_map.values(), key=lambda x: x["mentions"], reverse=True)
128127

129128
return legislation_list
130129

lib/transcripts/ingestor.py

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from lib.db.postgres_client import PostgresClient
77
from lib.embeddings.google_client import GoogleEmbeddingClient
8-
from lib.id_generators import generate_entity_id
8+
from lib.id_generators import generate_bill_id, generate_entity_id
99
from lib.roles import infer_role_kind, normalize_person_name, normalize_role_label
1010
from lib.processors.paragraph_splitter import (
1111
group_transcripts_into_paragraphs,
@@ -61,36 +61,36 @@ def ingest_transcript_json(
6161
speakers = transcript_data.get("speakers", []) or []
6262
for s in speakers:
6363
self._upsert_speaker(s)
64-
self._upsert_speaker_video_roles_for_video(
65-
s, youtube_video_id=youtube_video_id
66-
)
64+
self._upsert_speaker_video_roles_for_video(s, youtube_video_id=youtube_video_id)
65+
66+
legislation = transcript_data.get("legislation", []) or []
67+
for item in legislation:
68+
self._upsert_bill_from_legislation(item)
6769

6870
transcripts = transcript_data.get("transcripts", []) or []
6971
paragraphs = group_transcripts_into_paragraphs(youtube_video_id, transcripts)
7072
paragraph_texts = [p.get_text() for p in paragraphs]
7173

7274
paragraph_embeddings: list[list[float]] = []
7375
if embed_paragraphs and paragraph_texts:
74-
paragraph_embeddings = self.embedding_client.generate_embeddings_batch(
75-
paragraph_texts
76-
)
76+
paragraph_embeddings = self.embedding_client.generate_embeddings_batch(paragraph_texts)
7777

7878
sentence_entities_count = 0
7979
paragraph_entities_count = 0
8080
entity_ids_seen: set[str] = set()
8181
entity_texts_by_id: dict[str, tuple[str, str]] = {}
82+
used_sentence_ids: set[str] = set()
8283

8384
for idx, paragraph in enumerate(paragraphs):
8485
emb = paragraph_embeddings[idx] if idx < len(paragraph_embeddings) else None
85-
self._insert_paragraph(
86-
paragraph, title=title, upload_date=upload_date, embedding=emb
87-
)
86+
self._insert_paragraph(paragraph, title=title, upload_date=upload_date, embedding=emb)
8887

8988
sentences = split_paragraph_into_sentences(
9089
paragraph,
9190
youtube_video_id=youtube_video_id,
9291
video_date=upload_date,
9392
video_title=title,
93+
existing_sentence_ids=used_sentence_ids,
9494
)
9595
paragraph_entity_ids: set[str] = set()
9696

@@ -391,6 +391,37 @@ def _insert_sentence(self, sentence: dict[str, Any]) -> None:
391391
),
392392
)
393393

394+
def _upsert_bill_from_legislation(self, legislation_item: dict[str, Any]) -> None:
395+
bill_name = str(legislation_item.get("name") or "").strip()
396+
if not bill_name:
397+
return
398+
399+
bill_id_raw = str(legislation_item.get("id") or "").strip()
400+
bill_id = bill_id_raw or generate_bill_id(bill_name)
401+
description = str(legislation_item.get("description") or "").strip()
402+
source = str(legislation_item.get("source") or "").strip()
403+
404+
self.postgres.execute_update(
405+
"""
406+
INSERT INTO bills (id, bill_number, title, description, status, source_text)
407+
VALUES (%s, %s, %s, %s, %s, %s)
408+
ON CONFLICT (id) DO UPDATE SET
409+
bill_number = EXCLUDED.bill_number,
410+
title = EXCLUDED.title,
411+
description = EXCLUDED.description,
412+
source_text = EXCLUDED.source_text,
413+
updated_at = NOW()
414+
""",
415+
(
416+
bill_id,
417+
bill_name,
418+
bill_name,
419+
description,
420+
"",
421+
source,
422+
),
423+
)
424+
394425
def _extract_entities_from_text(self, text: str) -> list[tuple[str, str]]:
395426
return []
396427

@@ -407,9 +438,7 @@ def _upsert_entity(self, entity_id: str, text: str, entity_type: str) -> None:
407438
(entity_id, text, entity_type),
408439
)
409440

410-
def _insert_sentence_entity(
411-
self, sentence_id: str, entity_id: str, entity_type: str
412-
) -> None:
441+
def _insert_sentence_entity(self, sentence_id: str, entity_id: str, entity_type: str) -> None:
413442
self.postgres.execute_update(
414443
"""
415444
INSERT INTO sentence_entities (sentence_id, entity_id, entity_type, relationship_type)

scripts/migrate_transcripts.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ def migrate_paragraphs_and_sentences(
119119

120120
paragraph_texts = [p.get_text() for p in paragraphs]
121121
paragraph_embeddings = embedding_client.generate_embeddings_batch(paragraph_texts)
122+
used_sentence_ids: set[str] = set()
122123

123124
for i, paragraph in enumerate(paragraphs):
124125
embedding = paragraph_embeddings[i]
@@ -152,7 +153,13 @@ def migrate_paragraphs_and_sentences(
152153
)
153154

154155
for paragraph in paragraphs:
155-
sentences = split_paragraph_into_sentences(paragraph, video_id, video_date, video_title)
156+
sentences = split_paragraph_into_sentences(
157+
paragraph,
158+
video_id,
159+
video_date,
160+
video_title,
161+
existing_sentence_ids=used_sentence_ids,
162+
)
156163

157164
for sentence in sentences:
158165
postgres_client.execute_update(

tests/test_chat_agent_v2_unit.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,23 @@ def test_merge_cite_utterance_ids_should_resolve_utt_seconds_to_unique_known_id(
111111
assert got == ["AEOFDga2dh8:10848"]
112112

113113

114+
def test_merge_cite_utterance_ids_should_resolve_unique_seconds_with_suffix_ids() -> None:
115+
retrieval = {
116+
"citations": [
117+
{"utterance_id": "AEOFDga2dh8:10848_2"},
118+
{"utterance_id": "otherVid:220"},
119+
]
120+
}
121+
122+
got = _merge_cite_utterance_ids(
123+
answer="Education Bill [cite](#src:utt_10848)",
124+
cite_utterance_ids=[],
125+
retrieval=retrieval,
126+
)
127+
128+
assert got == ["AEOFDga2dh8:10848_2"]
129+
130+
114131
def test_merge_cite_utterance_ids_should_keep_well_formed_unknown_ids_for_db_fallback() -> None:
115132
retrieval = {
116133
"citations": [

tests/test_paragraph_splitter.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,31 @@ def test_split_paragraph_to_sentences(sample_transcripts):
115115
print("✅ Paragraph to sentence splitting works")
116116

117117

118+
def test_split_paragraph_to_sentences_should_avoid_id_collisions_for_same_second():
119+
"""Sentence IDs should be unique even when timestamps collide."""
120+
transcripts = [
121+
{
122+
"start": "00:00:10",
123+
"text": "First sentence at second ten.",
124+
"voice_id": 1,
125+
"speaker_id": "s_speaker_1",
126+
},
127+
{
128+
"start": "00:00:10",
129+
"text": "Second sentence also at second ten.",
130+
"voice_id": 1,
131+
"speaker_id": "s_speaker_1",
132+
},
133+
]
134+
135+
paragraphs = group_transcripts_into_paragraphs("video123", transcripts)
136+
sentences = split_paragraph_into_sentences(paragraphs[0], "video123")
137+
138+
assert len(sentences) == 2
139+
assert sentences[0]["id"] == "video123:10"
140+
assert sentences[1]["id"] == "video123:10_2"
141+
142+
118143
def test_empty_transcripts():
119144
"""Test handling of empty transcript list."""
120145
paragraphs = group_transcripts_into_paragraphs("Syxyah7QIaM", [])

tests/test_transcript_ingestion_unit.py

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,5 +86,54 @@ def test_transcript_ingestor_upserts_speaker_video_roles() -> None:
8686

8787
ingestor.ingest_transcript_json(transcript_data, youtube_video_id="test_video")
8888

89-
query = postgres.execute_update.call_args_list[2][0][0]
90-
assert "INSERT INTO speaker_video_roles" in query
89+
role_queries = [
90+
call.args[0]
91+
for call in postgres.execute_update.call_args_list
92+
if "INSERT INTO speaker_video_roles" in call.args[0]
93+
]
94+
assert len(role_queries) >= 1
95+
96+
97+
def test_transcript_ingestor_should_upsert_legislation_into_bills() -> None:
98+
postgres = Mock()
99+
embeddings = Mock()
100+
embeddings.generate_embeddings_batch.return_value = [[0.0] * 768]
101+
102+
ingestor = TranscriptIngestor(
103+
postgres=postgres,
104+
embedding_client=embeddings,
105+
)
106+
107+
transcript_data = {
108+
"video_metadata": {
109+
"title": "Test Video",
110+
"upload_date": "20260106",
111+
"duration": "0:01:00",
112+
},
113+
"speakers": [],
114+
"transcripts": [
115+
{
116+
"start": "00:00:10",
117+
"text": "The Road Traffic Bill is important.",
118+
"voice_id": 1,
119+
"speaker_id": "s_speaker_1",
120+
}
121+
],
122+
"legislation": [
123+
{
124+
"id": "L_ROAD_TRAFFIC_BILL_1",
125+
"name": "Road Traffic Bill",
126+
"description": "Modernizes road traffic penalties",
127+
"source": "audio",
128+
}
129+
],
130+
}
131+
132+
ingestor.ingest_transcript_json(transcript_data, youtube_video_id="test_video")
133+
134+
bill_queries = [
135+
call.args[0]
136+
for call in postgres.execute_update.call_args_list
137+
if "INSERT INTO bills" in call.args[0]
138+
]
139+
assert len(bill_queries) == 1

0 commit comments

Comments (0)