55
66from lib .db .postgres_client import PostgresClient
77from lib .embeddings .google_client import GoogleEmbeddingClient
8- from lib .id_generators import generate_entity_id
8+ from lib .id_generators import generate_bill_id , generate_entity_id
99from lib .roles import infer_role_kind , normalize_person_name , normalize_role_label
1010from lib .processors .paragraph_splitter import (
1111 group_transcripts_into_paragraphs ,
@@ -61,36 +61,36 @@ def ingest_transcript_json(
6161 speakers = transcript_data .get ("speakers" , []) or []
6262 for s in speakers :
6363 self ._upsert_speaker (s )
64- self ._upsert_speaker_video_roles_for_video (
65- s , youtube_video_id = youtube_video_id
66- )
64+ self ._upsert_speaker_video_roles_for_video (s , youtube_video_id = youtube_video_id )
65+
66+ legislation = transcript_data .get ("legislation" , []) or []
67+ for item in legislation :
68+ self ._upsert_bill_from_legislation (item )
6769
6870 transcripts = transcript_data .get ("transcripts" , []) or []
6971 paragraphs = group_transcripts_into_paragraphs (youtube_video_id , transcripts )
7072 paragraph_texts = [p .get_text () for p in paragraphs ]
7173
7274 paragraph_embeddings : list [list [float ]] = []
7375 if embed_paragraphs and paragraph_texts :
74- paragraph_embeddings = self .embedding_client .generate_embeddings_batch (
75- paragraph_texts
76- )
76+ paragraph_embeddings = self .embedding_client .generate_embeddings_batch (paragraph_texts )
7777
7878 sentence_entities_count = 0
7979 paragraph_entities_count = 0
8080 entity_ids_seen : set [str ] = set ()
8181 entity_texts_by_id : dict [str , tuple [str , str ]] = {}
82+ used_sentence_ids : set [str ] = set ()
8283
8384 for idx , paragraph in enumerate (paragraphs ):
8485 emb = paragraph_embeddings [idx ] if idx < len (paragraph_embeddings ) else None
85- self ._insert_paragraph (
86- paragraph , title = title , upload_date = upload_date , embedding = emb
87- )
86+ self ._insert_paragraph (paragraph , title = title , upload_date = upload_date , embedding = emb )
8887
8988 sentences = split_paragraph_into_sentences (
9089 paragraph ,
9190 youtube_video_id = youtube_video_id ,
9291 video_date = upload_date ,
9392 video_title = title ,
93+ existing_sentence_ids = used_sentence_ids ,
9494 )
9595 paragraph_entity_ids : set [str ] = set ()
9696
@@ -391,6 +391,37 @@ def _insert_sentence(self, sentence: dict[str, Any]) -> None:
391391 ),
392392 )
393393
394+ def _upsert_bill_from_legislation (self , legislation_item : dict [str , Any ]) -> None :
395+ bill_name = str (legislation_item .get ("name" ) or "" ).strip ()
396+ if not bill_name :
397+ return
398+
399+ bill_id_raw = str (legislation_item .get ("id" ) or "" ).strip ()
400+ bill_id = bill_id_raw or generate_bill_id (bill_name )
401+ description = str (legislation_item .get ("description" ) or "" ).strip ()
402+ source = str (legislation_item .get ("source" ) or "" ).strip ()
403+
404+ self .postgres .execute_update (
405+ """
406+ INSERT INTO bills (id, bill_number, title, description, status, source_text)
407+ VALUES (%s, %s, %s, %s, %s, %s)
408+ ON CONFLICT (id) DO UPDATE SET
409+ bill_number = EXCLUDED.bill_number,
410+ title = EXCLUDED.title,
411+ description = EXCLUDED.description,
412+ source_text = EXCLUDED.source_text,
413+ updated_at = NOW()
414+ """ ,
415+ (
416+ bill_id ,
417+ bill_name ,
418+ bill_name ,
419+ description ,
420+ "" ,
421+ source ,
422+ ),
423+ )
424+
394425 def _extract_entities_from_text (self , text : str ) -> list [tuple [str , str ]]:
395426 return []
396427
@@ -407,9 +438,7 @@ def _upsert_entity(self, entity_id: str, text: str, entity_type: str) -> None:
407438 (entity_id , text , entity_type ),
408439 )
409440
410- def _insert_sentence_entity (
411- self , sentence_id : str , entity_id : str , entity_type : str
412- ) -> None :
441+ def _insert_sentence_entity (self , sentence_id : str , entity_id : str , entity_type : str ) -> None :
413442 self .postgres .execute_update (
414443 """
415444 INSERT INTO sentence_entities (sentence_id, entity_id, entity_type, relationship_type)
0 commit comments