Skip to content

Commit a414960

Browse files
committed
fix: stabilize transcript identity and preserve ingestion context
1 parent f6ec9c2 commit a414960

File tree

9 files changed

+237
-87
lines changed

lib/chat_agent_v2.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ def _looks_like_utterance_id(citation_id: str) -> bool:
114114
return False
115115
if normalized.startswith("utt_"):
116116
normalized = normalized[4:]
117-
return re.match(r"^[A-Za-z0-9_-]+:\d+$", normalized) is not None
117+
return re.match(r"^[A-Za-z0-9_-]+:\d+(?:_\d+)?$", normalized) is not None
118118

119119

120120
def _merge_cite_utterance_ids(
@@ -136,7 +136,7 @@ def _merge_cite_utterance_ids(
136136

137137
suffix_counts: dict[str, int] = {}
138138
for known_id in known_ids:
139-
match = re.search(r":(\d+)$", known_id)
139+
match = re.search(r":(\d+)(?:_\d+)?$", known_id)
140140
if not match:
141141
continue
142142
seconds = match.group(1)
@@ -166,7 +166,7 @@ def _merge_cite_utterance_ids(
166166
(
167167
known_id
168168
for known_id in known_ids
169-
if known_id.endswith(f":{seconds}")
169+
if re.search(rf":{re.escape(seconds)}(?:_\d+)?$", known_id)
170170
),
171171
None,
172172
)

lib/processors/paragraph_splitter.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,13 +83,21 @@ def split_paragraph_into_sentences(
8383
youtube_video_id: str,
8484
video_date: str | None = None,
8585
video_title: str | None = None,
86+
existing_sentence_ids: set[str] | None = None,
8687
) -> list[dict[str, Any]]:
8788
"""Split a paragraph into individual sentences with IDs."""
8889
sentences = []
90+
seen_ids = existing_sentence_ids if existing_sentence_ids is not None else set()
8991

9092
for i, entry in enumerate(paragraph.sentences):
9193
start_seconds = parse_timestamp_to_seconds(entry["start"])
92-
sentence_id = f"{youtube_video_id}:{start_seconds}"
94+
base_sentence_id = f"{youtube_video_id}:{start_seconds}"
95+
sentence_id = base_sentence_id
96+
suffix = 2
97+
while sentence_id in seen_ids:
98+
sentence_id = f"{base_sentence_id}_{suffix}"
99+
suffix += 1
100+
seen_ids.add(sentence_id)
93101

94102
sentences.append(
95103
{

lib/processors/three_tier_transcription.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def process_transcript_to_three_tier(
3737
"speakers": self._extract_speakers(transcripts),
3838
"legislation": self._extract_legislation(transcripts),
3939
}
40+
used_sentence_ids: set[str] = set()
4041

4142
for paragraph in paragraphs:
4243
para_dict = paragraph.to_dict()
@@ -48,16 +49,18 @@ def process_transcript_to_three_tier(
4849
three_tier_data["paragraphs"].append(para_dict)
4950

5051
sentences = split_paragraph_into_sentences(
51-
paragraph, youtube_video_id, video_date, video_title
52+
paragraph,
53+
youtube_video_id,
54+
video_date,
55+
video_title,
56+
existing_sentence_ids=used_sentence_ids,
5257
)
5358

5459
three_tier_data["sentences"].extend(sentences)
5560

5661
return three_tier_data
5762

58-
def _extract_speakers(
59-
self, transcripts: list[dict[str, Any]]
60-
) -> list[dict[str, Any]]:
63+
def _extract_speakers(self, transcripts: list[dict[str, Any]]) -> list[dict[str, Any]]:
6164
"""Extract unique speakers from transcripts."""
6265
speakers_map: dict[str, dict[str, Any]] = {}
6366

@@ -91,9 +94,7 @@ def speaker_id_to_base_name(speaker_id: str) -> str:
9194
speakers_list = list(speakers_map.values())
9295
return sorted(speakers_list, key=lambda x: x.get("first_appearance", ""))
9396

94-
def _extract_legislation(
95-
self, transcripts: list[dict[str, Any]]
96-
) -> list[dict[str, Any]]:
97+
def _extract_legislation(self, transcripts: list[dict[str, Any]]) -> list[dict[str, Any]]:
9798
"""Extract legislation mentions from transcripts."""
9899
leg_map: dict[str, dict[str, Any]] = {}
99100

@@ -122,9 +123,7 @@ def _extract_legislation(
122123
}
123124
leg_map[name]["mentions"] += 1
124125

125-
legislation_list = sorted(
126-
leg_map.values(), key=lambda x: x["mentions"], reverse=True
127-
)
126+
legislation_list = sorted(leg_map.values(), key=lambda x: x["mentions"], reverse=True)
128127

129128
return legislation_list
130129

lib/transcripts/ingestor.py

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from lib.db.postgres_client import PostgresClient
77
from lib.embeddings.google_client import GoogleEmbeddingClient
8-
from lib.id_generators import generate_entity_id
8+
from lib.id_generators import generate_bill_id, generate_entity_id
99
from lib.roles import infer_role_kind, normalize_person_name, normalize_role_label
1010
from lib.processors.paragraph_splitter import (
1111
group_transcripts_into_paragraphs,
@@ -61,36 +61,36 @@ def ingest_transcript_json(
6161
speakers = transcript_data.get("speakers", []) or []
6262
for s in speakers:
6363
self._upsert_speaker(s)
64-
self._upsert_speaker_video_roles_for_video(
65-
s, youtube_video_id=youtube_video_id
66-
)
64+
self._upsert_speaker_video_roles_for_video(s, youtube_video_id=youtube_video_id)
65+
66+
legislation = transcript_data.get("legislation", []) or []
67+
for item in legislation:
68+
self._upsert_bill_from_legislation(item)
6769

6870
transcripts = transcript_data.get("transcripts", []) or []
6971
paragraphs = group_transcripts_into_paragraphs(youtube_video_id, transcripts)
7072
paragraph_texts = [p.get_text() for p in paragraphs]
7173

7274
paragraph_embeddings: list[list[float]] = []
7375
if embed_paragraphs and paragraph_texts:
74-
paragraph_embeddings = self.embedding_client.generate_embeddings_batch(
75-
paragraph_texts
76-
)
76+
paragraph_embeddings = self.embedding_client.generate_embeddings_batch(paragraph_texts)
7777

7878
sentence_entities_count = 0
7979
paragraph_entities_count = 0
8080
entity_ids_seen: set[str] = set()
8181
entity_texts_by_id: dict[str, tuple[str, str]] = {}
82+
used_sentence_ids: set[str] = set()
8283

8384
for idx, paragraph in enumerate(paragraphs):
8485
emb = paragraph_embeddings[idx] if idx < len(paragraph_embeddings) else None
85-
self._insert_paragraph(
86-
paragraph, title=title, upload_date=upload_date, embedding=emb
87-
)
86+
self._insert_paragraph(paragraph, title=title, upload_date=upload_date, embedding=emb)
8887

8988
sentences = split_paragraph_into_sentences(
9089
paragraph,
9190
youtube_video_id=youtube_video_id,
9291
video_date=upload_date,
9392
video_title=title,
93+
existing_sentence_ids=used_sentence_ids,
9494
)
9595
paragraph_entity_ids: set[str] = set()
9696

@@ -391,6 +391,37 @@ def _insert_sentence(self, sentence: dict[str, Any]) -> None:
391391
),
392392
)
393393

394+
def _upsert_bill_from_legislation(self, legislation_item: dict[str, Any]) -> None:
395+
bill_name = str(legislation_item.get("name") or "").strip()
396+
if not bill_name:
397+
return
398+
399+
bill_id_raw = str(legislation_item.get("id") or "").strip()
400+
bill_id = bill_id_raw or generate_bill_id(bill_name)
401+
description = str(legislation_item.get("description") or "").strip()
402+
source = str(legislation_item.get("source") or "").strip()
403+
404+
self.postgres.execute_update(
405+
"""
406+
INSERT INTO bills (id, bill_number, title, description, status, source_text)
407+
VALUES (%s, %s, %s, %s, %s, %s)
408+
ON CONFLICT (id) DO UPDATE SET
409+
bill_number = EXCLUDED.bill_number,
410+
title = EXCLUDED.title,
411+
description = EXCLUDED.description,
412+
source_text = EXCLUDED.source_text,
413+
updated_at = NOW()
414+
""",
415+
(
416+
bill_id,
417+
bill_name,
418+
bill_name,
419+
description,
420+
"",
421+
source,
422+
),
423+
)
424+
394425
def _extract_entities_from_text(self, text: str) -> list[tuple[str, str]]:
395426
return []
396427

@@ -407,9 +438,7 @@ def _upsert_entity(self, entity_id: str, text: str, entity_type: str) -> None:
407438
(entity_id, text, entity_type),
408439
)
409440

410-
def _insert_sentence_entity(
411-
self, sentence_id: str, entity_id: str, entity_type: str
412-
) -> None:
441+
def _insert_sentence_entity(self, sentence_id: str, entity_id: str, entity_type: str) -> None:
413442
self.postgres.execute_update(
414443
"""
415444
INSERT INTO sentence_entities (sentence_id, entity_id, entity_type, relationship_type)

scripts/migrate_transcripts.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ def migrate_paragraphs_and_sentences(
119119

120120
paragraph_texts = [p.get_text() for p in paragraphs]
121121
paragraph_embeddings = embedding_client.generate_embeddings_batch(paragraph_texts)
122+
used_sentence_ids: set[str] = set()
122123

123124
for i, paragraph in enumerate(paragraphs):
124125
embedding = paragraph_embeddings[i]
@@ -152,7 +153,13 @@ def migrate_paragraphs_and_sentences(
152153
)
153154

154155
for paragraph in paragraphs:
155-
sentences = split_paragraph_into_sentences(paragraph, video_id, video_date, video_title)
156+
sentences = split_paragraph_into_sentences(
157+
paragraph,
158+
video_id,
159+
video_date,
160+
video_title,
161+
existing_sentence_ids=used_sentence_ids,
162+
)
156163

157164
for sentence in sentences:
158165
postgres_client.execute_update(

tests/test_chat_agent_v2_unit.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,23 @@ def test_merge_cite_utterance_ids_should_resolve_utt_seconds_to_unique_known_id(
111111
assert got == ["AEOFDga2dh8:10848"]
112112

113113

114+
def test_merge_cite_utterance_ids_should_resolve_unique_seconds_with_suffix_ids() -> None:
115+
retrieval = {
116+
"citations": [
117+
{"utterance_id": "AEOFDga2dh8:10848_2"},
118+
{"utterance_id": "otherVid:220"},
119+
]
120+
}
121+
122+
got = _merge_cite_utterance_ids(
123+
answer="Education Bill [cite](#src:utt_10848)",
124+
cite_utterance_ids=[],
125+
retrieval=retrieval,
126+
)
127+
128+
assert got == ["AEOFDga2dh8:10848_2"]
129+
130+
114131
def test_merge_cite_utterance_ids_should_keep_well_formed_unknown_ids_for_db_fallback() -> None:
115132
retrieval = {
116133
"citations": [

tests/test_paragraph_splitter.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,31 @@ def test_split_paragraph_to_sentences(sample_transcripts):
115115
print("✅ Paragraph to sentence splitting works")
116116

117117

118+
def test_split_paragraph_to_sentences_should_avoid_id_collisions_for_same_second():
119+
"""Sentence IDs should be unique even when timestamps collide."""
120+
transcripts = [
121+
{
122+
"start": "00:00:10",
123+
"text": "First sentence at second ten.",
124+
"voice_id": 1,
125+
"speaker_id": "s_speaker_1",
126+
},
127+
{
128+
"start": "00:00:10",
129+
"text": "Second sentence also at second ten.",
130+
"voice_id": 1,
131+
"speaker_id": "s_speaker_1",
132+
},
133+
]
134+
135+
paragraphs = group_transcripts_into_paragraphs("video123", transcripts)
136+
sentences = split_paragraph_into_sentences(paragraphs[0], "video123")
137+
138+
assert len(sentences) == 2
139+
assert sentences[0]["id"] == "video123:10"
140+
assert sentences[1]["id"] == "video123:10_2"
141+
142+
118143
def test_empty_transcripts():
119144
"""Test handling of empty transcript list."""
120145
paragraphs = group_transcripts_into_paragraphs("Syxyah7QIaM", [])

tests/test_transcript_ingestion_unit.py

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,5 +86,54 @@ def test_transcript_ingestor_upserts_speaker_video_roles() -> None:
8686

8787
ingestor.ingest_transcript_json(transcript_data, youtube_video_id="test_video")
8888

89-
query = postgres.execute_update.call_args_list[2][0][0]
90-
assert "INSERT INTO speaker_video_roles" in query
89+
role_queries = [
90+
call.args[0]
91+
for call in postgres.execute_update.call_args_list
92+
if "INSERT INTO speaker_video_roles" in call.args[0]
93+
]
94+
assert len(role_queries) >= 1
95+
96+
97+
def test_transcript_ingestor_should_upsert_legislation_into_bills() -> None:
98+
postgres = Mock()
99+
embeddings = Mock()
100+
embeddings.generate_embeddings_batch.return_value = [[0.0] * 768]
101+
102+
ingestor = TranscriptIngestor(
103+
postgres=postgres,
104+
embedding_client=embeddings,
105+
)
106+
107+
transcript_data = {
108+
"video_metadata": {
109+
"title": "Test Video",
110+
"upload_date": "20260106",
111+
"duration": "0:01:00",
112+
},
113+
"speakers": [],
114+
"transcripts": [
115+
{
116+
"start": "00:00:10",
117+
"text": "The Road Traffic Bill is important.",
118+
"voice_id": 1,
119+
"speaker_id": "s_speaker_1",
120+
}
121+
],
122+
"legislation": [
123+
{
124+
"id": "L_ROAD_TRAFFIC_BILL_1",
125+
"name": "Road Traffic Bill",
126+
"description": "Modernizes road traffic penalties",
127+
"source": "audio",
128+
}
129+
],
130+
}
131+
132+
ingestor.ingest_transcript_json(transcript_data, youtube_video_id="test_video")
133+
134+
bill_queries = [
135+
call.args[0]
136+
for call in postgres.execute_update.call_args_list
137+
if "INSERT INTO bills" in call.args[0]
138+
]
139+
assert len(bill_queries) == 1

0 commit comments

Comments (0)