Implement castchat: AI-powered podcast exploration (Phases 1-4)

Copilot · crossjam · Copilot · commit c83331ba15f0 · 2026-01-24T22:40:32.000Z
Co-authored-by: crossjam &lt;208062+crossjam@users.noreply.github.com&gt;
diff --git a/pyproject.toml b/pyproject.toml
@@ -90,6 +90,12 @@ transcription-full = [
     "retrocast[transcription-mlx,transcription-cuda,transcription-diarization]",
 ]
 
+# AI-powered chat interface for exploring transcribed podcast content
+castchat = [
+    "chromadb>=0.5.23",  # vector store used by retrocast.chromadb_manager
+    "pydantic-ai>=0.0.14",  # agent framework used by retrocast.castchat_agent
+]
+
 [project.scripts]
 retrocast = "retrocast.cli:cli"
 
diff --git a/src/retrocast/castchat_agent.py b/src/retrocast/castchat_agent.py
@@ -0,0 +1,192 @@
+"""PydanticAI agent for interactive podcast transcript exploration."""
+
+from typing import Any
+
+from loguru import logger
+from pydantic_ai import Agent, RunContext
+from pydantic_ai.models.anthropic import AnthropicModel
+
+from retrocast.chromadb_manager import ChromaDBManager
+
+# Upper bound on segments returned by a single tool call.
+_MAX_TOOL_RESULTS = 10
+
+
+def _clamp_max_results(max_results: int) -> int:
+    """Clamp a requested result count to the range 1.._MAX_TOOL_RESULTS."""
+    return max(1, min(max_results, _MAX_TOOL_RESULTS))
+
+
+def _format_timestamp(start_time: float) -> str:
+    """Render an offset in seconds as M:SS, or H:MM:SS from one hour onwards."""
+    minutes, seconds = divmod(max(0, int(start_time)), 60)
+    hours, minutes = divmod(minutes, 60)
+    if hours:
+        return f"{hours}:{minutes:02d}:{seconds:02d}"
+    return f"{minutes}:{seconds:02d}"
+
+
+def _format_segment(index: int, result: dict[str, Any], *, include_podcast: bool) -> str:
+    """Format one search hit as a numbered entry.
+
+    Args:
+        index: 1-based position of the hit in the result list
+        result: Search hit with "text" and "metadata" keys
+        include_podcast: Whether to lead the entry with the podcast title
+
+    Returns:
+        The formatted entry, terminated by a blank line
+    """
+    metadata = result.get("metadata") or {}
+    # ChromaDBManager.index_transcriptions stores missing values as "", so a
+    # plain .get() default would never apply; fall back with `or` instead.
+    episode = metadata.get("episode_title") or "Unknown Episode"
+
+    if include_podcast:
+        podcast = metadata.get("podcast_title") or "Unknown Podcast"
+        lines = [f"{index}. **{podcast}**", f"   Episode: {episode}"]
+    else:
+        lines = [f"{index}. Episode: {episode}"]
+
+    time_line = f"   Time: {_format_timestamp(metadata.get('start_time') or 0)}"
+    speaker = metadata.get("speaker")
+    if speaker:
+        time_line += f" | Speaker: {speaker}"
+    lines.append(time_line)
+
+    lines.append(f"   > {result.get('text', '')}")
+    return "\n".join(lines) + "\n\n"
+
+
+def create_castchat_agent(
+    chroma_manager: ChromaDBManager, model_name: str = "claude-sonnet-4-20250514"
+) -> Agent:
+    """Create a PydanticAI agent for exploring podcast transcripts.
+
+    Args:
+        chroma_manager: ChromaDB manager instance for searching transcripts
+        model_name: Anthropic model name to use
+
+    Returns:
+        Configured PydanticAI Agent
+    """
+    model = AnthropicModel(model_name)
+
+    # Create agent with system prompt
+    agent = Agent(
+        model=model,
+        system_prompt=(
+            "You are an AI assistant helping users explore their podcast archive. "
+            "You have access to transcribed podcast episodes and can search through "
+            "them to answer questions about topics, guests, discussions, and specific "
+            "content mentioned across episodes. When searching, provide context about "
+            "which podcast and episode the information came from, and include timestamps "
+            "when relevant. Be conversational and helpful."
+        ),
+    )
+
+    @agent.tool
+    def search_transcripts(ctx: RunContext[Any], query: str, max_results: int = 5) -> str:
+        """Search podcast transcription segments for relevant content.
+
+        Use this tool to find information in podcast transcripts. It performs
+        semantic search across all transcribed episodes to find relevant segments
+        based on the query.
+
+        Args:
+            ctx: Run context (automatically provided)
+            query: The search query describing what to look for
+            max_results: Maximum number of results to return (default 5, clamped to 1-10)
+
+        Returns:
+            Formatted string with search results including episode info and timestamps
+        """
+        logger.debug(f"Searching transcripts with query: {query}")
+
+        # Tool errors are reported to the model as text rather than raised, so a
+        # failed search does not abort the conversation.
+        try:
+            results = chroma_manager.search(query, n_results=_clamp_max_results(max_results))
+
+            if not results:
+                return "No relevant segments found in the transcription archive."
+
+            entries = [
+                _format_segment(i, result, include_podcast=True)
+                for i, result in enumerate(results, 1)
+            ]
+            return "Found relevant segments:\n\n" + "".join(entries)
+
+        except Exception as e:
+            logger.error(f"Error searching transcripts: {e}")
+            return f"Error searching transcripts: {str(e)}"
+
+    @agent.tool
+    def search_podcast(
+        ctx: RunContext[Any], podcast_title: str, query: str, max_results: int = 5
+    ) -> str:
+        """Search transcripts within a specific podcast.
+
+        Use this tool when the user asks about a specific podcast by name.
+        This filters search results to only that podcast.
+
+        Args:
+            ctx: Run context (automatically provided)
+            podcast_title: The name of the podcast to search within
+            query: The search query describing what to look for
+            max_results: Maximum number of results to return (default 5, clamped to 1-10)
+
+        Returns:
+            Formatted string with search results for that podcast
+        """
+        logger.debug(f"Searching podcast '{podcast_title}' with query: {query}")
+
+        try:
+            results = chroma_manager.search(
+                query,
+                n_results=_clamp_max_results(max_results),
+                podcast_filter=podcast_title,
+            )
+
+            if not results:
+                return (
+                    f"No relevant segments found in '{podcast_title}'. "
+                    "The podcast might not be in the archive or might not match exactly."
+                )
+
+            # The podcast is named once in the header, so entries omit it
+            entries = [
+                _format_segment(i, result, include_podcast=False)
+                for i, result in enumerate(results, 1)
+            ]
+            return f"Found segments in **{podcast_title}**:\n\n" + "".join(entries)
+
+        except Exception as e:
+            logger.error(f"Error searching podcast transcripts: {e}")
+            return f"Error searching podcast: {str(e)}"
+
+    @agent.tool
+    def get_collection_info(ctx: RunContext[Any]) -> str:
+        """Get information about the indexed transcript collection.
+
+        Use this tool when users ask about what's available in their archive
+        or how many episodes have been transcribed.
+
+        Args:
+            ctx: Run context (automatically provided)
+
+        Returns:
+            Summary of indexed content
+        """
+        try:
+            count = chroma_manager.get_collection_count()
+            return (
+                f"The transcript archive contains {count:,} indexed segments "
+                "from transcribed podcast episodes. You can search across all "
+                "of them or filter by specific podcast titles."
+            )
+        except Exception as e:
+            logger.error(f"Error getting collection info: {e}")
+            return f"Error accessing collection info: {str(e)}"
+
+    return agent
diff --git a/src/retrocast/chromadb_manager.py b/src/retrocast/chromadb_manager.py
@@ -0,0 +1,211 @@
+"""ChromaDB integration for podcast transcription search and RAG."""
+
+from pathlib import Path
+from typing import Any, cast
+
+import chromadb
+from chromadb.config import Settings
+from loguru import logger
+
+from retrocast.datastore import Datastore
+
+
+class ChromaDBManager:
+    """Manages ChromaDB collections for transcription segment indexing."""
+
+    def __init__(self, persist_directory: Path):
+        """Initialize ChromaDB client with persistent storage.
+
+        Args:
+            persist_directory: Directory path for ChromaDB persistence
+        """
+        self.persist_directory = persist_directory
+        self.persist_directory.mkdir(parents=True, exist_ok=True)
+
+        logger.debug(f"Initializing ChromaDB at {persist_directory}")
+        self.client = chromadb.PersistentClient(
+            path=str(persist_directory),
+            settings=Settings(
+                anonymized_telemetry=False,
+                allow_reset=True,
+            ),
+        )
+        self.collection_name = "transcription_segments"
+        self.collection = self._get_or_create_collection()
+
+    def _get_or_create_collection(self) -> chromadb.Collection:
+        """Open the segment collection, creating it if it does not exist yet."""
+        return self.client.get_or_create_collection(
+            name=self.collection_name,
+            metadata={"description": "Podcast transcription segments with timestamps"},
+        )
+
+    def index_transcriptions(self, datastore: Datastore, batch_size: int = 100) -> int:
+        """Index all transcription segments from the database into ChromaDB.
+
+        Segments are upserted under a stable ID derived from the transcription ID
+        and segment index, so re-running the index refreshes existing entries
+        instead of colliding with them. Segments without text are skipped.
+
+        Args:
+            datastore: Datastore instance for querying transcription data
+            batch_size: Number of segments to process per batch
+
+        Returns:
+            Number of segments indexed
+
+        Raises:
+            ValueError: If batch_size is less than 1
+        """
+        if batch_size < 1:
+            raise ValueError(f"batch_size must be at least 1, got {batch_size}")
+
+        logger.info("Starting transcription indexing...")
+
+        # Query all transcription segments with metadata
+        query = """
+            SELECT
+                ts.transcription_id,
+                ts.segment_index,
+                ts.start_time,
+                ts.end_time,
+                ts.text,
+                ts.speaker,
+                t.podcast_title,
+                t.episode_title,
+                t.episode_url,
+                t.media_path,
+                t.language,
+                t.backend,
+                t.model_size
+            FROM transcription_segments ts
+            JOIN transcriptions t ON ts.transcription_id = t.transcription_id
+            ORDER BY ts.transcription_id, ts.segment_index
+        """
+
+        segments = list(datastore.db.execute(query).fetchall())
+        total_segments = len(segments)
+
+        if total_segments == 0:
+            logger.warning("No transcription segments found in database")
+            return 0
+
+        logger.info(f"Found {total_segments} segments to index")
+
+        # Process in batches
+        indexed_count = 0
+        for offset in range(0, total_segments, batch_size):
+            documents: list[str] = []
+            metadatas: list[dict[str, Any]] = []
+            ids: list[str] = []
+
+            for segment in segments[offset : offset + batch_size]:
+                text = segment[4]
+                # Blank segments carry nothing to embed and would only add noise
+                if not text or not str(text).strip():
+                    continue
+
+                # Stable per-segment ID, which is what makes the upsert idempotent
+                ids.append(f"t{segment[0]}_s{segment[1]}")
+
+                # The text to be embedded and searched
+                documents.append(str(text))
+
+                # Metadata for context and filtering; NULL columns become "" / 0.0
+                metadatas.append(
+                    {
+                        "transcription_id": str(segment[0]),
+                        "segment_index": str(segment[1]),
+                        "start_time": float(segment[2] or 0.0),
+                        "end_time": float(segment[3] or 0.0),
+                        "speaker": str(segment[5] or ""),
+                        "podcast_title": str(segment[6] or ""),
+                        "episode_title": str(segment[7] or ""),
+                        "episode_url": str(segment[8] or ""),
+                        "media_path": str(segment[9] or ""),
+                        "language": str(segment[10] or ""),
+                        "backend": str(segment[11] or ""),
+                        "model_size": str(segment[12] or ""),
+                    }
+                )
+
+            if not ids:
+                continue
+
+            # Upsert rather than add so that re-indexing updates existing segments
+            self.collection.upsert(documents=documents, metadatas=cast(Any, metadatas), ids=ids)
+            indexed_count += len(ids)
+
+            logger.debug(f"Indexed {indexed_count}/{total_segments} segments")
+
+        logger.info(f"Successfully indexed {indexed_count} segments")
+        return indexed_count
+
+    def search(
+        self, query: str, n_results: int = 5, podcast_filter: str | None = None
+    ) -> list[dict[str, Any]]:
+        """Search transcription segments using semantic similarity.
+
+        Args:
+            query: The search query text
+            n_results: Maximum number of results to return
+            podcast_filter: Optional podcast title to filter results
+
+        Returns:
+            List of matching segments with metadata; empty when n_results is
+            less than 1 or nothing has been indexed yet
+        """
+        if n_results < 1:
+            return []
+
+        # Never request more neighbours than the collection holds
+        available = self.collection.count()
+        if available == 0:
+            return []
+
+        where_filter: Any = None
+        if podcast_filter:
+            where_filter = {"podcast_title": {"$eq": podcast_filter}}
+
+        results: Any = self.collection.query(
+            query_texts=[query], n_results=min(n_results, available), where=where_filter
+        )
+
+        documents = self._first_row(results, "documents")
+        metadatas = self._first_row(results, "metadatas")
+        distances = self._first_row(results, "distances")
+        ids = self._first_row(results, "ids")
+
+        # Format results for easier consumption
+        return [
+            {
+                "text": doc,
+                "metadata": (metadatas[i] if metadatas else None) or {},
+                "distance": distances[i] if distances else None,
+                "id": ids[i] if ids else None,
+            }
+            for i, doc in enumerate(documents)
+        ]
+
+    @staticmethod
+    def _first_row(results: Any, key: str) -> list[Any]:
+        """Return the values for the first (only) query text, or [] if absent."""
+        rows = results.get(key)
+        if not rows or rows[0] is None:
+            return []
+        return list(rows[0])
+
+    def get_collection_count(self) -> int:
+        """Get the number of segments in the collection.
+
+        Returns:
+            Number of indexed segments
+        """
+        return self.collection.count()
+
+    def reset(self) -> None:
+        """Clear all data from the collection."""
+        logger.warning(f"Resetting collection '{self.collection_name}'")
+        self.client.delete_collection(name=self.collection_name)
+        self.collection = self._get_or_create_collection()
+        logger.info("Collection reset complete")
diff --git a/src/retrocast/cli.py b/src/retrocast/cli.py