From afd2300ddccc8cea1335edff737d228e0c9a369f Mon Sep 17 00:00:00 2001 From: Tharsis Souza Date: Thu, 10 Oct 2024 22:08:30 -0300 Subject: [PATCH 01/49] small steps --- README.md | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index bcea4e01..345f92fd 100644 --- a/README.md +++ b/README.md @@ -26,12 +26,12 @@ This sample collection is also [available at audio.com](https://audio.com/thatup ## Features ✨ -- Generate engaging, AI-powered conversational content from multiple sources (websites, youtube, and PDFs) -- Customize transcript and audio generation tailored to your needs (e.g. style, structure, length) -- Generate podcasts from pre-existing/edited transcripts -- Support for multiple advanced text-to-speech models (OpenAI and ElevenLabs) for natural-sounding audio -- Seamlessly integrate CLI and Python package for streamlined automated workflows -- Support for multiple languages, enabling global content creation (very experimental, currently!) +- Generate AI-powered conversational content from multiple sources (websites, YouTube, and PDFs) +- Customizable transcript and audio generation (e.g. style, language, structure, length) +- Create podcasts from pre-existing or edited transcripts +- Support for advanced text-to-speech models (OpenAI and ElevenLabs) +- Seamless CLI and Python package integration for automated workflows +- Multi-language support for global content creation (experimental!) ## Updates 🚀 @@ -42,21 +42,16 @@ This sample collection is also [available at audio.com](https://audio.com/thatup ## Quickstart 💻 -### Setup -Before installing, ensure you have Python 3.11 or higher installed on your system. +### Prerequisites +- Python 3.11 or higher +- `$ pip install ffmpeg` (for audio processing) +### Installation 1. Install from PyPI - `$ pip install podcastfy` 2. Set up your [API keys](usage/config.md) -3. 
Ensure you have ffmpeg installed on your system, required for audio processing -``` -sudo apt update -sudo apt install ffmpeg -``` - ### Python ```python from podcastfy.client import generate_podcast @@ -74,7 +69,7 @@ python -m podcastfy.client --url --url - [CLI](usage/cli.md) -Try [HuggingFace 🤗 space app](https://huggingface.co/spaces/thatupiso/Podcastfy.ai_demo) for a simple use case (URLs -> Audio). WARNING: This UI App was not as thoroughly tested as the Python package. +Experience Podcastfy with our [HuggingFace](https://huggingface.co/spaces/thatupiso/Podcastfy.ai_demo) 🤗 Spaces app for a simple URL-to-Audio demo. (Note: This UI app is less extensively tested than the Python package.) ## Customization 🔧 @@ -82,7 +77,7 @@ Podcastfy offers a range of [Conversation Customization](usage/conversation_cust ## Contributing 🤝 -Contributions are welcome! Please feel free to submit an [Issue](https://github.com/souzatharsis/podcastfy/issues) or a Pull Request. But even more excitingly feel free to fork the repo and create your own app! I am curious about your use cases! Please let me know if I could be of help. +We welcome contributions! Please submit [Issues](https://github.com/souzatharsis/podcastfy/issues) or Pull Requests. Feel free to fork the repo and create your own applications. We're excited to learn about your use cases! 
## Example Use Cases 🎧🎶 From fa67e7fb538c232e64701e4a7b9b0f98e5482635 Mon Sep 17 00:00:00 2001 From: bruno Date: Sun, 13 Oct 2024 17:09:46 +0200 Subject: [PATCH 02/49] small steps --- podcastfy/character.py | 37 +++ podcastfy/content_generator.py | 37 ++- podcastfy/core/podcast.py | 458 +++++++++++++++++++++++++++++++++ podcastfy/tts_backends.py | 62 +++++ 4 files changed, 588 insertions(+), 6 deletions(-) create mode 100644 podcastfy/character.py create mode 100644 podcastfy/core/podcast.py create mode 100644 podcastfy/tts_backends.py diff --git a/podcastfy/character.py b/podcastfy/character.py new file mode 100644 index 00000000..92419bf9 --- /dev/null +++ b/podcastfy/character.py @@ -0,0 +1,37 @@ +from typing import Dict, Any, Optional + +from pydantic import BaseModel + + +class TTSConfig(BaseModel): + voice: str + backend: str + extra_args: Dict[str, Any] + +class Character: + """Represents a character in the podcast.""" + + def __init__(self, name: str, role: str, tts_configs: Dict[str, TTSConfig] = {}, default_description_for_llm: str = ""): + # note: in the future the last two arguments are not optional + self.name = name + self.role = role + self.tts_configs = tts_configs + self.default_description_for_llm = default_description_for_llm + self.preferred_tts = next(iter(tts_configs.keys())) # Set first TTS as default + + def set_preferred_tts(self, tts_name: str): + if tts_name not in self.tts_configs: + raise ValueError(f"TTS backend '{tts_name}' not configured for this character") + self.preferred_tts = tts_name + + def to_prompt(self) -> str: + """Convert the character information to a prompt for the LLM.""" + return f"Character: {self.name}\nRole: {self.role}\n{self.default_description_for_llm.format(name=self.name)}" + + def get_tts_args(self, tts_name: Optional[str] = None) -> Dict[str, Any]: + """Get the TTS arguments for this character.""" + tts_name = tts_name or self.preferred_tts + tts_config = self.tts_configs[tts_name] + return { + "voice": 
tts_config["voice"], + **tts_config["extra_args"]} diff --git a/podcastfy/content_generator.py b/podcastfy/content_generator.py index 6f2c7498..e9796c9f 100644 --- a/podcastfy/content_generator.py +++ b/podcastfy/content_generator.py @@ -7,13 +7,16 @@ """ import os -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, List #from langchain_google_vertexai import ChatVertexAI from langchain_google_genai import ChatGoogleGenerativeAI from langchain_core.prompts import ChatPromptTemplate from langchain_core.output_parsers import StrOutputParser from langchain import hub + +from podcastfy.character import Character +from podcastfy.core.podcast import LLMBackend from podcastfy.utils.config_conversation import load_conversation_config from podcastfy.utils.config import load_config import logging @@ -51,7 +54,7 @@ def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = self.chain = (self.prompt_template | self.llm | self.parser) - def generate_qa_content(self, input_texts: str, output_filepath: Optional[str] = None) -> str: + def generate_qa_content(self, input_texts: str, output_filepath: Optional[str] = None, characters: List[Character] = None) -> str: """ Generate Q&A content based on input texts. @@ -65,6 +68,7 @@ def generate_qa_content(self, input_texts: str, output_filepath: Optional[str] = Raises: Exception: If there's an error in generating content. 
""" + assert len(characters) == 2, "The number of characters should be 2 for this implementation" try: @@ -72,8 +76,8 @@ def generate_qa_content(self, input_texts: str, output_filepath: Optional[str] = "input_text": input_texts, "word_count": self.config_conversation.get('word_count'), "conversation_style": ", ".join(self.config_conversation.get('conversation_style', [])), - "roles_person1": self.config_conversation.get('roles_person1'), - "roles_person2": self.config_conversation.get('roles_person2'), + "roles_person1": characters[0].role, + "roles_person2": characters[1].role, "dialogue_structure": ", ".join(self.config_conversation.get('dialogue_structure', [])), "podcast_name": self.config_conversation.get('podcast_name'), "podcast_tagline": self.config_conversation.get('podcast_tagline'), @@ -95,6 +99,22 @@ def generate_qa_content(self, input_texts: str, output_filepath: Optional[str] = logger.error(f"Error generating content: {str(e)}") raise + +class DefaultPodcastifyTranscriptEngine(LLMBackend): + def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None): + """ + Initialize the DefaultPodcastifyTranscriptEngine. + + Args: + api_key (str): API key for Google's Generative AI. + conversation_config (Optional[Dict[str, Any]]): Custom conversation configuration. + """ + self.content_generator = ContentGenerator(api_key, conversation_config) + + def generate_text(self, input_text: str, characters: List[Character]) -> str: + return self.content_generator.generate_qa_content(input_text, output_filepath=None, characters=characters) + + def main(seed: int = 42) -> None: """ Generate Q&A content based on input text from input_text.txt using the Gemini API. 
@@ -115,7 +135,7 @@ def main(seed: int = 42) -> None: raise ValueError("GEMINI_API_KEY not found in configuration") # Initialize ContentGenerator - content_generator = ContentGenerator(api_key) + content_generator = DefaultPodcastifyTranscriptEngine(api_key) # Read input text from file input_text = "" @@ -126,7 +146,12 @@ def main(seed: int = 42) -> None: input_text += file.read() + "\n\n" # Generate Q&A content - response = content_generator.generate_qa_content(input_text) + config_conv = load_conversation_config() + characters = [ + Character(name="Speaker 1", role=config_conv.get('roles_person1')), + Character(name="Speaker 2", role=config_conv.get('roles_person2')), + ] + response = content_generator.generate_text(input_text, characters) # Print the generated Q&A content print("Generated Q&A Content:") diff --git a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py new file mode 100644 index 00000000..646fbdbf --- /dev/null +++ b/podcastfy/core/podcast.py @@ -0,0 +1,458 @@ +import logging +from abc import ABC, abstractmethod +from enum import Enum +from pathlib import Path +from typing import List, Optional, Dict, Any, Callable, Tuple, Union, Sequence, Type, NamedTuple +from pydub import AudioSegment as PydubAudioSegment +from functools import wraps +import asyncio +from concurrent.futures import ThreadPoolExecutor, as_completed +from contextlib import contextmanager + +from podcastfy.character import Character, TTSConfig + +class PodcastState(Enum): + """Enum representing the different states of a podcast during creation.""" + INITIALIZED = 0 # Initial state when the Podcast object is created + TRANSCRIPT_BUILT = 1 # State after the transcript has been generated + AUDIO_SEGMENTS_BUILT = 2 # State after individual audio segments have been created + STITCHED = 3 # Final state after all audio segments have been combined + + +class LLMBackend(ABC): + """Abstract base class for Language Model backends.""" + + @abstractmethod + def generate_text(self, prompt: 
str, characters: List['Character']) -> List[Tuple[Character, str]]: + """ + Generate text based on a given prompt. + + Args: + prompt (str): The input prompt for text generation. + + Returns: + List[Tuple[str, str]]: A list of tuples containing speaker and text. + """ + pass + + +class SyncTTSBackend(ABC): + """Protocol for synchronous Text-to-Speech backends.""" + + name: str + + @abstractmethod + def text_to_speech(self, text: str, character: 'Character') -> Path: + """ + Convert text to speech synchronously. + + Args: + text (str): The text to convert to speech. + character (Character): The character for which to generate speech. + + Returns: + Path: Path to the generated audio file. + """ + pass + + +class AsyncTTSBackend(ABC): + """Protocol for asynchronous Text-to-Speech backends.""" + + name: str + + @abstractmethod + async def async_text_to_speech(self, text: str, character: 'Character') -> Path: + """ + Convert text to speech asynchronously. + + Args: + text (str): The text to convert to speech. + character (Character): The character for which to generate speech. + + Returns: + Path: Path to the generated audio file. 
+ """ + pass + + +class TranscriptSegment: + """Represents a segment of the podcast transcript.""" + + def __init__(self, text: str, speaker: Character, tts_args: Optional[Dict[str, Any]] = None): + self.text = text + self.speaker = speaker + self.tts_args = tts_args or {} + + +class Transcript: + """Represents the full transcript of a podcast.""" + + def __init__(self, segments: List[TranscriptSegment], metadata: Dict[str, Any]): + self.segments = segments + self.metadata = metadata + + def save(self, filepath: str, format: str = "plaintext"): + """Save the transcript to a file.""" + with open(filepath, 'w') as f: + f.write(str(self)) + + def __str__(self) -> str: + """Convert the transcript to a string representation.""" + lines = [] + for segment in self.segments: + lines.append(f"{segment.speaker.name}: {segment.text}") + + metadata_str = "\n".join([f"{key}: {value}" for key, value in self.metadata.items()]) + + return f"Metadata:\n{metadata_str}\n\nTranscript:\n" + "\n".join(lines) + + +class AudioSegment: + """Represents an audio segment of the podcast.""" + + def __init__(self, filepath: Path, length_ms: int, transcript_segment: Optional[TranscriptSegment] = None): + self.filepath = filepath + self.length_ms = length_ms + self.transcript_segment = transcript_segment + self._audio: Optional[PydubAudioSegment] = None + + @property + def audio(self) -> PydubAudioSegment: + """Lazy-load the audio segment.""" + if self._audio is None: + self._audio = PydubAudioSegment.from_file(self.filepath) + if len(self._audio) != self.length_ms: + raise ValueError( + f"Audio file length ({len(self._audio)}ms) does not match specified length ({self.length_ms}ms)") + return self._audio + + +class AudioManager: + def __init__(self, tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]], n_jobs: int = 1): + self.tts_backends = tts_backends + self.n_jobs = n_jobs + self.audio_segments = [] + self.final_audio = None + + async def _async_build_audio_segments(self, 
transcript: Transcript) -> List[AudioSegment]: + async def process_segment(segment: TranscriptSegment): + tts_backend = self.tts_backends[segment.speaker.preferred_tts] + audio_file = await tts_backend.async_text_to_speech(segment.text, segment.speaker) + return AudioSegment(audio_file, len(PydubAudioSegment.from_file(audio_file)), segment) + + semaphore = asyncio.Semaphore(self.n_jobs) + + async def bounded_process_segment(segment): + async with semaphore: + return await process_segment(segment) + + tasks = [asyncio.create_task(bounded_process_segment(segment)) for segment in transcript.segments] + return await asyncio.gather(*tasks) + + def _sync_build_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: + def process_segment(segment: TranscriptSegment): + tts_backend = self.tts_backends[segment.speaker.preferred_tts] + audio_file = tts_backend.text_to_speech(segment.text, segment.speaker) + return AudioSegment(audio_file, len(PydubAudioSegment.from_file(audio_file)), segment) + + with ThreadPoolExecutor(max_workers=self.n_jobs) as executor: + return list(executor.map(process_segment, transcript.segments)) + + def create_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: + if any(isinstance(backend, AsyncTTSBackend) for backend in self.tts_backends.values()): + return asyncio.run(self._async_build_audio_segments(transcript)) + else: + return self._sync_build_audio_segments(transcript) + + def stitch_audio_segments(self): + self.final_audio = sum([segment.audio for segment in self.audio_segments]) + + +def podcast_stage(func): + """Decorator to manage podcast stage transitions.""" + + @wraps(func) + def wrapper(self, *args, **kwargs): + current_method = self._next_stage_methods[self.state] + if current_method != func and not self._reworking: + print(f"Cannot execute {func.__name__} in current state {self.state.name}. 
Skipping.") + return + + try: + result = func(self, *args, **kwargs) + next_state = next((state for state, method in self._next_stage_methods.items() if method == func), None) + self.state = next_state or self.state + return result + except Exception as e: + print(f"Error in {func.__name__}: {str(e)}") + raise + + return wrapper + + +class Podcast: + """Main class for podcast creation and management.""" + + def __init__(self, content: str, llm_backend: LLMBackend, + tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], + characters: List[Character], default_tts_n_jobs: int = 1): + """ + Initialize a new Podcast instance. + + Args: + content (str): The raw content to be processed into a podcast. + llm_backend (LLMBackend): The language model backend for generating the transcript. + tts_backends (Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]]): Dictionary of available TTS backends. + characters (List[Character]): List of characters participating in the podcast. + default_tts_n_jobs (int, optional): The default number of concurrent jobs for TTS processing. + Defaults to 1. + + Raises: + ValueError: If a character's preferred TTS backend is not available. 
+ """ + self.content = content + self.llm_backend = llm_backend + self.tts_backends = {backend.name: backend for backend in tts_backends} + self.characters = {char.name: char for char in characters} + self.default_tts_n_jobs = default_tts_n_jobs + self.state = PodcastState.INITIALIZED + self._reworking = False + self.audio_manager = AudioManager(self.tts_backends, self.default_tts_n_jobs) + + # Initialize attributes with null values + self.transcript = None + self.audio_segments = [] + self.audio = None + + # Define the sequence of methods to be called for each stage + self._next_stage_methods: Dict[PodcastState, Callable[[], None]] = { + PodcastState.INITIALIZED: self.build_transcript, + PodcastState.TRANSCRIPT_BUILT: self.build_audio_segments, + PodcastState.AUDIO_SEGMENTS_BUILT: self.stitch_audio_segments, + } + + @classmethod + def from_transcript(cls, transcript: Union[Sequence[Tuple[str, str]], Transcript], + tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], characters: List[Character], + default_tts_n_jobs: int = 1) -> 'Podcast': + """ + Create a Podcast instance from a pre-existing transcript. + + Args: + transcript (Union[Sequence[Tuple[str, str]], Transcript]): Pre-existing transcript. + tts_backends (Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]]): Dictionary of available TTS backends. + characters (List[Character]): List of characters participating in the podcast. + default_tts_n_jobs (int, optional): The default number of concurrent jobs for TTS processing. + Defaults to 1. + + Returns: + Podcast: A new Podcast instance with the transcript built and ready for audio generation. 
+ """ + podcast = cls("", None, list(tts_backends.values()), characters, default_tts_n_jobs=default_tts_n_jobs) + if isinstance(transcript, Transcript): + podcast.transcript = transcript + else: + raise ValueError("Transcript must be a Transcript instance") # unimplemented + podcast.state = PodcastState.TRANSCRIPT_BUILT + return podcast + + def reset_to_state(self, state: PodcastState): + """Reset the podcast to a specific state.""" + self.state = state + self.transcript = None if state.value < PodcastState.TRANSCRIPT_BUILT.value else self.transcript + self.audio_segments = [] if state.value < PodcastState.AUDIO_SEGMENTS_BUILT.value else self.audio_segments + self.audio = None if state.value < PodcastState.STITCHED.value else self.audio + + @contextmanager + def rework(self, target_state: PodcastState, auto_finalize: bool = True): + """Context manager for reworking the podcast from a specific state.""" + original_state = self.state + self._reworking = True + + if target_state.value < self.state.value: + print(f"Rewinding from {self.state.name} to {target_state.name}") + self.reset_to_state(target_state) + + try: + yield + finally: + self._reworking = False + if self.state.value < original_state.value: + print( + f"Warning: Podcast is now in an earlier state ({self.state.name}) than before reworking ({original_state.name}). 
You may want to call finalize() to rebuild.") + if auto_finalize: + self.finalize() + + @podcast_stage + def build_transcript(self) -> None: + """Build the podcast transcript using the LLM backend.""" + character_prompts = "\n\n".join([char.to_prompt() for char in self.characters.values()]) + full_prompt = f"{self.content}\n\nCharacters:\n{character_prompts}" + generated_segments = self.llm_backend.generate_text(full_prompt, list(self.characters.values())) + + segments = [TranscriptSegment(text, speaker, self.characters[speaker]) + for speaker, text in generated_segments if speaker in self.characters] + + self.transcript = Transcript(segments, {"source": "Generated content"}) + + @podcast_stage + def build_audio_segments(self, n_jobs: Optional[int] = None) -> None: + """Build audio segments from the transcript.""" + self.audio_segments = self.audio_manager.create_audio_segments(self.transcript) + + @podcast_stage + def stitch_audio_segments(self) -> None: + """Stitch all audio segments together to form the final podcast audio.""" + self.audio = sum([segment.audio for segment in self.audio_segments]) + + def _build_next_stage(self) -> bool: + """Build the next stage of the podcast.""" + if self.state == PodcastState.STITCHED: + return False + + next_method = self._next_stage_methods[self.state] + next_method() + return True + + def finalize(self) -> None: + """Finalize the podcast by building all remaining stages.""" + while self._build_next_stage(): + pass + + def save(self, filepath: str) -> None: + """Save the finalized podcast audio to a file.""" + if self.state != PodcastState.STITCHED: + raise ValueError("Podcast can only be saved after audio is stitched") + + if self.audio: + self.audio.export(filepath, format="mp3") + else: + raise ValueError("No stitched audio to save") + + def save_transcript(self, filepath: str, format: str = "plaintext") -> None: + """Save the podcast transcript to a file.""" + if self.state < PodcastState.TRANSCRIPT_BUILT: + raise 
ValueError("Transcript can only be saved after it is built") + + if self.transcript: + self.transcript.save(filepath, format) + else: + raise ValueError("No transcript to save") + + +# Usage example: Step-by-step podcast creation +if __name__ == "__main__": + from tempfile import NamedTemporaryFile + + + class DummyLLMBackend(LLMBackend): + def generate_text(self, prompt: str, characters: List[Character]) -> List[Tuple[str, str]]: + return [("Host", "Welcome to our podcast!"), ("Guest", "Thanks for having me!")] + + + class DummyTTSBackend(SyncTTSBackend): + def __init__(self, name: str): + self.name = name + + def text_to_speech(self, text: str, character: Character) -> Path: + with NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file: + PydubAudioSegment.silent(duration=1000).export(temp_file.name, format="mp3") + return Path(temp_file.name) + + + # Define TTS backends + openai_tts = DummyTTSBackend("openai") + elevenlabs_tts = DummyTTSBackend("elevenlabs") + + # Define TTS backends + + # Define characters + host = Character( + name="Host", + role="Podcast host", + tts_configs={ + "openai": {"voice": "en-US-Neural2-F", "backend": "openai", "extra_args": {"speaking_rate": 1.0}}, + "elevenlabs": {"voice": "Rachel", "backend": "elevenlabs", "extra_args": {"stability": 0.5}} + }, + default_description_for_llm="{name} is an enthusiastic podcast host. Speaks clearly and engagingly." + ) + guest = Character( + name="Guest", + role="Expert guest", + tts_configs={"openai": {"voice": "en-US-Neural2-D", "backend": "openai", "extra_args": {"pitch": -2.0}}, + "elevenlabs": {"voice": "Antoni", "backend": "elevenlabs", "extra_args": {"stability": 0.8}}}, + default_description_for_llm="{name} is an expert guest. Shares knowledge in a friendly manner." + ) + + # Initialize the podcast + podcast = Podcast( + content=""" + This is a sample content for our podcast. + It includes information from multiple sources that have already been parsed. 
+ """, + llm_backend=DummyLLMBackend(), + tts_backends=[openai_tts, elevenlabs_tts], + characters=[host, guest], + ) + print(f"Initial state: {podcast.state}") + + # Step 1: Build transcript + podcast.build_transcript() + print(f"After building transcript: {podcast.state}") + print(f"Transcript: {podcast.transcript}") + + # Step 2: Build audio segments + podcast.build_audio_segments() + print(f"After building audio segments: {podcast.state}") + print(f"Number of audio segments: {len(podcast.audio_segments)}") + + # Step 3: Stitch audio segments + podcast.stitch_audio_segments() + print(f"After stitching audio: {podcast.state}") + + # Rework example: modify the transcript and rebuild (auto_finalize is True by default) + with podcast.rework(PodcastState.TRANSCRIPT_BUILT): + print(f"Inside rework context, state: {podcast.state}") + podcast.transcript.segments.append( + TranscriptSegment("This is a new segment", "Host", podcast.characters["Host"])) + print("Added new segment to transcript") + + # Rebuild audio segments and stitch + podcast.build_audio_segments() + + print(f"After rework: {podcast.state}") + + # Add a new audio segment (auto_finalize is True by default) + with NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file: + PydubAudioSegment.silent(duration=500).export(temp_file.name, format="mp3") + + with podcast.rework(PodcastState.AUDIO_SEGMENTS_BUILT): + new_segment = AudioSegment(Path(temp_file.name), 500, + TranscriptSegment("New audio segment", "Host", podcast.characters["Host"])) + podcast.audio_segments.insert(0, new_segment) + + # Save the final podcast + podcast.save("./final.mp3") + podcast.save_transcript("./final.txt", format="plaintext") + print("Saved podcast and transcript") + + # Example with pre-existing transcript using from_transcript class method + pre_existing_transcript = [ + ("Host", "Welcome to our podcast created from a pre-existing transcript!"), + ("Guest", "Thank you for having me. 
I'm excited to be here.") + ] + + podcast_from_transcript = Podcast.from_transcript( + transcript=pre_existing_transcript, + tts_backends=[openai_tts, elevenlabs_tts], + characters=[host, guest] + ) + + print(f"Podcast created from transcript initial state: {podcast_from_transcript.state}") + print(f"Transcript: {podcast_from_transcript.transcript}") + + # Finalize the podcast (this will skip transcript generation and move directly to audio generation) + podcast_from_transcript.finalize() + podcast_from_transcript.save("./from_transcript.mp3") + print("Saved podcast created from transcript") diff --git a/podcastfy/tts_backends.py b/podcastfy/tts_backends.py new file mode 100644 index 00000000..08a02e42 --- /dev/null +++ b/podcastfy/tts_backends.py @@ -0,0 +1,62 @@ +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Dict, Any +from podcastfy.character import Character + +class TTSBackend(ABC): + @abstractmethod + def text_to_speech(self, text: str, character: Character) -> Path: + """ + Convert text to speech. + + Args: + text (str): The text to convert to speech. + character (Character): The character for which to generate speech. + + Returns: + Path: Path to the generated audio file. 
+ """ + pass + +class ElevenLabsTTS(TTSBackend): + def __init__(self, api_key: str, config: Dict[str, Any]): + self.api_key = api_key + self.config = config + + def text_to_speech(self, text: str, character: Character) -> Path: + # Placeholder for ElevenLabs TTS implementation + voice = character.get_tts_args('elevenlabs').get('voice', self.config['default_voice']) + + print(f"ElevenLabs TTS: Converting text to speech for character {character.name} with voice {voice}") + + # In a real implementation, this would call the ElevenLabs API and return the path to the generated audio file + return Path(f"/tmp/{character.name}_audio.mp3") + +class OpenAITTS(TTSBackend): + def __init__(self, api_key: str, config: Dict[str, Any]): + self.api_key = api_key + self.config = config + + def text_to_speech(self, text: str, character: Character) -> Path: + # Placeholder for OpenAI TTS implementation + voice = character.get_tts_args('openai').get('voice', self.config['default_voice']) + + print(f"OpenAI TTS: Converting text to speech for character {character.name} with voice {voice}") + + # In a real implementation, this would call the OpenAI API and return the path to the generated audio file + return Path(f"/tmp/{character.name}_audio.mp3") + +# Example usage: +if __name__ == "__main__": + from podcastfy.utils.config import load_config + + config = load_config() + elevenlabs_tts = ElevenLabsTTS(config.ELEVENLABS_API_KEY, config.get('text_to_speech', {}).get('elevenlabs', {})) + openai_tts = OpenAITTS(config.OPENAI_API_KEY, config.get('text_to_speech', {}).get('openai', {})) + + dummy_character = Character("John", "host", { + 'elevenlabs': {'voice': 'en-US-JohnNeural'}, + 'openai': {'voice': 'en-US-Neural2-C'} + }, "A friendly podcast host") + + elevenlabs_tts.text_to_speech("Hello, welcome to the podcast!", dummy_character) From 36bb5e9d49af3759aec0c7b6fc50d9562d00b677 Mon Sep 17 00:00:00 2001 From: bruno Date: Sun, 13 Oct 2024 23:42:10 +0200 Subject: [PATCH 03/49] some progress 
but not yet --- podcastfy/character.py | 15 ++--- podcastfy/tts_backends.py | 135 +++++++++++++++++++++++++++++--------- 2 files changed, 110 insertions(+), 40 deletions(-) diff --git a/podcastfy/character.py b/podcastfy/character.py index 92419bf9..f225ae4e 100644 --- a/podcastfy/character.py +++ b/podcastfy/character.py @@ -2,17 +2,17 @@ from pydantic import BaseModel - -class TTSConfig(BaseModel): +class VoiceConfig(BaseModel): voice: str - backend: str extra_args: Dict[str, Any] +class TTSConfig(VoiceConfig): + backend: str + class Character: """Represents a character in the podcast.""" def __init__(self, name: str, role: str, tts_configs: Dict[str, TTSConfig] = {}, default_description_for_llm: str = ""): - # note: in the future the last two arguments are not optional self.name = name self.role = role self.tts_configs = tts_configs @@ -28,10 +28,7 @@ def to_prompt(self) -> str: """Convert the character information to a prompt for the LLM.""" return f"Character: {self.name}\nRole: {self.role}\n{self.default_description_for_llm.format(name=self.name)}" - def get_tts_args(self, tts_name: Optional[str] = None) -> Dict[str, Any]: + def get_tts_args(self, tts_name: Optional[str] = None) -> TTSConfig: """Get the TTS arguments for this character.""" tts_name = tts_name or self.preferred_tts - tts_config = self.tts_configs[tts_name] - return { - "voice": tts_config["voice"], - **tts_config["extra_args"]} + return self.tts_configs[tts_name] diff --git a/podcastfy/tts_backends.py b/podcastfy/tts_backends.py index 08a02e42..dc53859a 100644 --- a/podcastfy/tts_backends.py +++ b/podcastfy/tts_backends.py @@ -1,62 +1,135 @@ +import os +import uuid from abc import ABC, abstractmethod from pathlib import Path -from typing import Dict, Any -from podcastfy.character import Character +from tempfile import TemporaryFile, TemporaryDirectory +from typing import Dict, Any, List, ClassVar +import asyncio + +import openai + +from podcastfy.character import Character, VoiceConfig +import 
edge_tts +from elevenlabs import client as elevenlabs_client class TTSBackend(ABC): + name: ClassVar[str] = "" + default_voices: ClassVar[List[VoiceConfig]] = [] + + @classmethod + def set_default_voices(cls, voices: List[VoiceConfig]): + """ + Set the default voices for the TTS backend. + """ + cls.default_voices = voices + @abstractmethod - def text_to_speech(self, text: str, character: Character) -> Path: + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: """ Convert text to speech. Args: text (str): The text to convert to speech. character (Character): The character for which to generate speech. + output_path (Path): The path where the audio file should be saved. Returns: - Path: Path to the generated audio file. + Path: Path to the generated audio file (same as output_path). """ pass class ElevenLabsTTS(TTSBackend): - def __init__(self, api_key: str, config: Dict[str, Any]): - self.api_key = api_key - self.config = config - - def text_to_speech(self, text: str, character: Character) -> Path: - # Placeholder for ElevenLabs TTS implementation - voice = character.get_tts_args('elevenlabs').get('voice', self.config['default_voice']) - - print(f"ElevenLabs TTS: Converting text to speech for character {character.name} with voice {voice}") - - # In a real implementation, this would call the ElevenLabs API and return the path to the generated audio file - return Path(f"/tmp/{character.name}_audio.mp3") + name: str = "elevenlabs" + + def __init__(self, api_key: str = None): + self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY") + + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + # TODO, would be nicer to get a filepath directly from the client + config = character.get_tts_args('elevenlabs') + client = elevenlabs_client.ElevenLabs(api_key=self.api_key) # # client could be reused + content = client.generate( + text=text, + voice=config.voice, + model=config.extra_args.get('model', 
'default') + ) + with open(output_path, "wb") as out: + for chunk in content: + if chunk: + out.write(chunk) + return output_path class OpenAITTS(TTSBackend): - def __init__(self, api_key: str, config: Dict[str, Any]): - self.api_key = api_key - self.config = config + name: str = "openai" + def __init__(self, api_key: str): + self.api_key = api_key or os.getenv("OPENAI_API_KEY") + + def ensure_characters_tts_config_is_valid(self, character:Character) -> None: + # TODO: maybe that should be in the ABC class + tts_config = character.tts_configs.get('openai') + if not tts_config: + raise ValueError(f"Character '{character.name}' does not have OpenAI TTS configuration") + # ensure there is a key model in the extra_args + if 'model' not in tts_config.extra_args: + raise ValueError(f"Character '{character.name}' does not have the 'model' key in the OpenAI TTS configuration") - def text_to_speech(self, text: str, character: Character) -> Path: + + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + # TODO, would be nicer to get a filepath directly from the client. If not given takes tempdir from the config ? 
+ self.ensure_characters_tts_config_is_valid(character) # Placeholder for OpenAI TTS implementation - voice = character.get_tts_args('openai').get('voice', self.config['default_voice']) - - print(f"OpenAI TTS: Converting text to speech for character {character.name} with voice {voice}") - - # In a real implementation, this would call the OpenAI API and return the path to the generated audio file - return Path(f"/tmp/{character.name}_audio.mp3") + config = character.get_tts_args('openai') + + print(f"OpenAI TTS: Converting text to speech for character {character.name} with voice {config.voice}") + response = openai.audio.speech.create( + model=config.extra_args["model"], + voice=config.voice, + input=text + ) + with open(output_path, "wb") as file: + file.write(response.content) + return output_path + +class EdgeTTS(TTSBackend): + name: str = "edge-tts" + + + def __init__(self): + pass + + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + config = character.get_tts_args('edge-tts') + + async def edge_tts_conversion(text: str, output_path: str, voice: str): + communicate = edge_tts.Communicate(text, voice) + await communicate.save(output_path) + + asyncio.run(edge_tts_conversion(text, str(output_path), config.voice)) + + return output_path + + + def ensure_characters_tts_config_is_valid(self, character: Character) -> None: + tts_config = character.tts_configs.get('edge-tts') + if not tts_config: + raise ValueError(f"Character '{character.name}' does not have Edge TTS configuration") # Example usage: if __name__ == "__main__": from podcastfy.utils.config import load_config - + config = load_config() elevenlabs_tts = ElevenLabsTTS(config.ELEVENLABS_API_KEY, config.get('text_to_speech', {}).get('elevenlabs', {})) openai_tts = OpenAITTS(config.OPENAI_API_KEY, config.get('text_to_speech', {}).get('openai', {})) - + # edge_tts = EdgeTTS() + dummy_character = Character("John", "host", { 'elevenlabs': {'voice': 'en-US-JohnNeural'}, - 
'openai': {'voice': 'en-US-Neural2-C'} + 'openai': {'voice': 'en-US-Neural2-C'}, + 'edge-tts': {'voice': 'en-US-ChristopherNeural'} }, "A friendly podcast host") - - elevenlabs_tts.text_to_speech("Hello, welcome to the podcast!", dummy_character) + + output_dir = Path("output") + output_dir.mkdir(exist_ok=True) + output_path = output_dir / f"{dummy_character.name}_{uuid.uuid4().hex}.mp3" + elevenlabs_tts.text_to_speech("Hello, welcome to the podcast!", dummy_character, output_path) From 386c9fc21cbf3c427fe06218feefb6d4a88a500f Mon Sep 17 00:00:00 2001 From: bruno Date: Mon, 14 Oct 2024 12:34:20 +0200 Subject: [PATCH 04/49] update --- podcastfy/aiengines/__init__.py | 0 podcastfy/aiengines/llm/base.py | 22 ++ .../llm/legacy_gemini_langchain.py} | 26 +- podcastfy/aiengines/tts/base.py | 115 +++++++ podcastfy/aiengines/tts/tts_backends.py | 94 ++++++ podcastfy/client_v2.py | 239 ++++++++++++++ podcastfy/core/__init__.py | 0 podcastfy/core/audio.py | 91 ++++++ podcastfy/{ => core}/character.py | 16 +- podcastfy/core/podcast.py | 298 ++++++------------ podcastfy/core/transcript.py | 95 ++++++ podcastfy/core/tts_configs.py | 12 + podcastfy/tts_backends.py | 135 -------- 13 files changed, 796 insertions(+), 347 deletions(-) create mode 100644 podcastfy/aiengines/__init__.py create mode 100644 podcastfy/aiengines/llm/base.py rename podcastfy/{content_generator.py => aiengines/llm/legacy_gemini_langchain.py} (87%) create mode 100644 podcastfy/aiengines/tts/base.py create mode 100644 podcastfy/aiengines/tts/tts_backends.py create mode 100644 podcastfy/client_v2.py create mode 100644 podcastfy/core/__init__.py create mode 100644 podcastfy/core/audio.py rename podcastfy/{ => core}/character.py (73%) create mode 100644 podcastfy/core/transcript.py create mode 100644 podcastfy/core/tts_configs.py delete mode 100644 podcastfy/tts_backends.py diff --git a/podcastfy/aiengines/__init__.py b/podcastfy/aiengines/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/podcastfy/aiengines/llm/base.py b/podcastfy/aiengines/llm/base.py new file mode 100644 index 00000000..fe96dcb3 --- /dev/null +++ b/podcastfy/aiengines/llm/base.py @@ -0,0 +1,22 @@ +from abc import ABC, abstractmethod +from typing import List, Tuple + +from podcastfy.core.character import Character + + +class LLMBackend(ABC): + """Abstract base class for Language Model backends.""" + # TODO a nice mixin/helper could be made to load prompt templates from conf file (both podcast settings and character settings) + + @abstractmethod + def generate_transcript(self, prompt: str, characters: List[Character]) -> List[Tuple[Character, str]]: + """ + Generate text based on a given prompt. + + Args: + prompt (str): The input prompt for text generation. + + Returns: + List[Tuple[Character, str]]: A list of tuples containing speaker and text. + """ + pass diff --git a/podcastfy/content_generator.py b/podcastfy/aiengines/llm/legacy_gemini_langchain.py similarity index 87% rename from podcastfy/content_generator.py rename to podcastfy/aiengines/llm/legacy_gemini_langchain.py index e9796c9f..4e08b0af 100644 --- a/podcastfy/content_generator.py +++ b/podcastfy/aiengines/llm/legacy_gemini_langchain.py @@ -7,16 +7,14 @@ """ import os -from typing import Optional, Dict, Any, List +from typing import Optional, Dict, Any, List, Tuple -#from langchain_google_vertexai import ChatVertexAI from langchain_google_genai import ChatGoogleGenerativeAI -from langchain_core.prompts import ChatPromptTemplate from langchain_core.output_parsers import StrOutputParser from langchain import hub -from podcastfy.character import Character -from podcastfy.core.podcast import LLMBackend +from podcastfy.core.character import Character +from podcastfy.aiengines.llm.base import LLMBackend from podcastfy.utils.config_conversation import load_conversation_config from podcastfy.utils.config import load_config import logging @@ -111,8 +109,20 @@ def __init__(self, api_key: str, conversation_config: 
Optional[Dict[str, Any]] = """ self.content_generator = ContentGenerator(api_key, conversation_config) - def generate_text(self, input_text: str, characters: List[Character]) -> str: - return self.content_generator.generate_qa_content(input_text, output_filepath=None, characters=characters) + def generate_transcript(self, prompt: str, characters: List[Character]) -> List[Tuple[Character, str]]: + content = self.content_generator.generate_qa_content(prompt, output_filepath=None, characters=characters) + + # Parse the generated content into the required format + transcript = [] + for line in content.split('\n'): + if ':' in line: + speaker_name, text = line.split(':', 1) + speaker = next((char for char in characters if char.name == speaker_name.strip()), None) + if speaker: + transcript.append((speaker, text.strip())) + + return transcript + def main(seed: int = 42) -> None: @@ -151,7 +161,7 @@ def main(seed: int = 42) -> None: Character(name="Speaker 1", role=config_conv.get('roles_person1')), Character(name="Speaker 2", role=config_conv.get('roles_person2')), ] - response = content_generator.generate_text(input_text, characters) + response = content_generator.generate_transcript(input_text, characters) # Print the generated Q&A content print("Generated Q&A Content:") diff --git a/podcastfy/aiengines/tts/base.py b/podcastfy/aiengines/tts/base.py new file mode 100644 index 00000000..7b88c290 --- /dev/null +++ b/podcastfy/aiengines/tts/base.py @@ -0,0 +1,115 @@ +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Dict, Any, List + +import yaml + +from podcastfy.core.character import Character +from podcastfy.core.tts_configs import TTSConfig + + +class SyncTTSBackend(ABC): + """Protocol for synchronous Text-to-Speech backends.""" + + name: str + + @abstractmethod + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + """ + Convert text to speech synchronously. 
+ + Args: + text (str): The text to convert to speech. + character (Character): The character for which to generate speech. + output_path (Path): The path to save the generated audio file. + + Returns: + Path: The path to the generated audio file. + """ + pass + + +class AsyncTTSBackend(ABC): + """Protocol for asynchronous Text-to-Speech backends.""" + + name: str + + @abstractmethod + async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + """ + Convert text to speech asynchronously. + + Args: + text (str): The text to convert to speech. + character (Character): The character for which to generate speech. + output_path (Path): The path to save the generated audio file. + + Returns: + Path: The path to the generated audio file. + """ + pass +class TTSConfigMixin: + """Mixin class to manage TTS external configurations.""" + + def __init__(self, config_file: str = 'podcastfy/config.yaml', name: str = "") -> None: + # TODO: probably bad config files for final client + self.name = name + self.config_file = config_file + self.default_configs = self._load_default_configs() + self.tts_config_call_count = 0 + self.character_tts_mapping = {} + + def _load_default_configs(self) -> Dict[str, Any]: + with open(self.config_file, 'r') as f: + config = yaml.safe_load(f) + tts_config = config.get('text_to_speech', {}) + return tts_config.get(self.name, {}) + + def get_default_config(self) -> Dict[str, Any]: + return self.default_configs + + def update_default_config(self, new_config: Dict[str, Any]) -> None: + self.default_configs.update(new_config) + + def tts_config_for_character(self, character: Character) -> TTSConfig: + # todo a bit constrained by the fact that the config has just the question and answer fields + if character.name in self.character_tts_mapping: + return self.character_tts_mapping[character.name] + + # Check if the character has a TTS config for this backend + if self.name in character.tts_configs: + tts_config = 
character.tts_configs[self.name] + else: + # If not, use the default config + default_voices = self.default_configs.get('default_voices', {}) + if self.tts_config_call_count == 0: + voice = default_voices['question'] + else: + voice = default_voices['answer'] + model = self.default_configs.get('model') + self.tts_config_call_count += 1 + + tts_config = TTSConfig( + voice=voice, + backend=self.name, + extra_args={"model": model} if model else {} + ) + + # Merge the default config with the character-specific config + merged_config = TTSConfig( + voice=tts_config.voice or self.default_configs.get('default_voices', {}).get('question' if self.tts_config_call_count == 1 else 'answer', ''), + backend=self.name, + extra_args={**self.default_configs.get('extra_args', {}), **tts_config.extra_args} + ) + + self.character_tts_mapping[character.name] = merged_config + return merged_config + + # This line is no longer needed as we always return a merged config + + def preload_character_tts_mapping(self, characters: List[Character]) -> None: + for character in characters: + self.tts_config_for_character(character) + + def get_character_tts_mapping(self) -> Dict[str, TTSConfig]: + return self.character_tts_mapping diff --git a/podcastfy/aiengines/tts/tts_backends.py b/podcastfy/aiengines/tts/tts_backends.py new file mode 100644 index 00000000..0b2d389c --- /dev/null +++ b/podcastfy/aiengines/tts/tts_backends.py @@ -0,0 +1,94 @@ +import os +import uuid +from abc import abstractmethod +from pathlib import Path +from tempfile import TemporaryFile, TemporaryDirectory +from typing import Dict, Any, List, ClassVar +import asyncio + +import openai + +import edge_tts +from elevenlabs import client as elevenlabs_client + +from podcastfy.aiengines.tts.base import SyncTTSBackend, TTSConfigMixin, AsyncTTSBackend +from podcastfy.core.character import Character + + +class ElevenLabsTTS(SyncTTSBackend, TTSConfigMixin): + name: str = "elevenlabs" + + def __init__(self, api_key: str = None, 
config_file: str = 'podcastfy/config.yaml'): + # TODO: not the right path for final client + TTSConfigMixin.__init__(self, config_file) + self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY") + + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + config = self.tts_config_for_character(character) + client = elevenlabs_client.ElevenLabs(api_key=self.api_key) # # client could be reused + content = client.generate( + text=text, + voice=config.voice, + model=config.extra_args.get('model', self.get_default_config().get('model', 'default')) + ) + with open(output_path, "wb") as out: + for chunk in content: + if chunk: + out.write(chunk) + return output_path + + +class OpenAITTS(SyncTTSBackend, TTSConfigMixin): + name: str = "openai" + + def __init__(self, api_key: str = None, config_file: str = 'podcastfy/config.yaml'): + TTSConfigMixin.__init__(self, config_file, name=self.name) + self.api_key = api_key or os.getenv("OPENAI_API_KEY") + + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + config = self.tts_config_for_character(character) + + print(f"OpenAI TTS: Converting text to speech for character {character.name} with voice {config.voice} \n text: {text}") + model = config.extra_args.get('model', self.get_default_config().get('model', 'tts-1')) + response = openai.audio.speech.create( + model=model, + voice=config.voice, + input=text + ) + with open(output_path, "wb") as file: + file.write(response.content) + return output_path + + +class EdgeTTS(AsyncTTSBackend, TTSConfigMixin): + name: str = "edge-tts" + + def __init__(self, config_file: str = 'podcastfy/config.yaml'): + TTSConfigMixin.__init__(self, config_file) + + async def text_to_speech(self, text: str, character: Character, output_path: Path) -> None: + config = self.tts_config_for_character(character) + communicate = edge_tts.Communicate(text, config.voice) + await communicate.save(output_path) + return output_path + + async def 
async_text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + return await self.text_to_speech(text, character, output_path) + + + +# Example usage: +if __name__ == "__main__": + from podcastfy.utils.config import load_config + + config = load_config() + elevenlabs_tts = ElevenLabsTTS(config.ELEVENLABS_API_KEY) + openai_tts = OpenAITTS(config.OPENAI_API_KEY) + edge_tts = EdgeTTS() + + dummy_character1 = Character("character1", "host", {}, "A friendly podcast host") + dummy_character2 = Character("character2", "guest", {}, "An expert guest") + + output_dir = Path("output") + output_dir.mkdir(exist_ok=True) + diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py new file mode 100644 index 00000000..81fdd4fc --- /dev/null +++ b/podcastfy/client_v2.py @@ -0,0 +1,239 @@ +import os +import uuid +import typer +from pathlib import Path +from typing import List, Optional, Dict, Any, Union + +from podcastfy.aiengines.llm.legacy_gemini_langchain import DefaultPodcastifyTranscriptEngine +from podcastfy.aiengines.tts.tts_backends import OpenAITTS, ElevenLabsTTS, EdgeTTS +from podcastfy.core.character import Character +from podcastfy.core.podcast import Podcast, SyncTTSBackend, AsyncTTSBackend +from podcastfy.core.transcript import Transcript +from podcastfy.content_parser.content_extractor import ContentExtractor +from podcastfy.core.tts_configs import TTSConfig +from podcastfy.utils.config import Config, load_config +from podcastfy.utils.logger import setup_logger + +logger = setup_logger(__name__) + +app = typer.Typer() + +def create_characters(config: Dict[str, Any]) -> List[Character]: + host = Character( + name="Host", + role="Podcast host", + tts_configs={ + "openai": TTSConfig(voice=config["text_to_speech"]["openai"]["default_voices"]["question"], backend="openai"), + "elevenlabs": TTSConfig(voice=config["text_to_speech"]["elevenlabs"]["default_voices"]["question"], backend="elevenlabs"), + }, + default_description_for_llm="{name} is an 
enthusiastic podcast host. Speaks clearly and engagingly." + ) + + guest = Character( + name="Guest", + role="Expert guest", + tts_configs={ + "openai": TTSConfig(voice=config["text_to_speech"]["openai"]["default_voices"]["answer"], backend="openai"), + "elevenlabs": TTSConfig(voice=config["text_to_speech"]["elevenlabs"]["default_voices"]["answer"], backend="elevenlabs"), + }, + default_description_for_llm="{name} is an expert guest. Shares knowledge in a friendly manner." + ) + + return [host, guest] + +def create_tts_backends(config: Config) -> List[Union[SyncTTSBackend, AsyncTTSBackend]]: + return [ + OpenAITTS(api_key=config.OPENAI_API_KEY), + ElevenLabsTTS(api_key=config.ELEVENLABS_API_KEY), + EdgeTTS() + ] + +def process_links( + links: List[str], + transcript_file: Optional[str] = None, + tts_model: str = "openai", # could be removed now ? + generate_audio: bool = True, + config: Optional[Config] = None, + conversation_config: Optional[Dict[str, Any]] = None +) -> Podcast: + if config is None: + config = load_config() + characters = create_characters(config.config) + tts_backends = create_tts_backends(config) + if transcript_file: + logger.info(f"Using transcript file: {transcript_file}") + transcript = Transcript.load(transcript_file, {char.name: char for char in characters}) + podcast = Podcast.from_transcript(transcript, tts_backends, characters) + else: + logger.info(f"Processing {len(links)} links") + content_extractor = ContentExtractor(config.JINA_API_KEY) + content_generator = DefaultPodcastifyTranscriptEngine(config.GEMINI_API_KEY, conversation_config) + + contents = [content_extractor.extract_content(link) for link in links] + combined_content = "\n\n".join(contents) + + llm_backend = content_generator # Assuming ContentGenerator implements the LLMBackend interface + + podcast = Podcast( + content=combined_content, + llm_backend=llm_backend, + tts_backends=tts_backends, + characters=characters, + ) + + if generate_audio: + podcast.finalize() + 
else: + podcast.build_transcript() + + return podcast + + +@app.command() +def main( + urls: List[str] = typer.Option(None, "--url", "-u", help="URLs to process"), + file: typer.FileText = typer.Option(None, "--file", "-f", help="File containing URLs, one per line"), + transcript: typer.FileText = typer.Option(None, "--transcript", "-t", help="Path to a transcript file"), + tts_model: str = typer.Option(None, "--tts-model", "-tts", help="TTS model to use (openai or elevenlabs)"), + transcript_only: bool = typer.Option(False, "--transcript-only", help="Generate only a transcript without audio"), + conversation_config: str = typer.Option(None, "--conversation-config", "-cc", help="Path to custom conversation configuration YAML file"), + output_dir: str = typer.Option("./output", "--output-dir", "-o", help="Directory to save output files"), +): + """ + Generate a podcast or transcript from a list of URLs, a file containing URLs, or a transcript file. + """ + try: + config = load_config() + main_config = config.config.get('main', {}) + if tts_model is None: + tts_model = main_config.get('default_tts_model', 'openai') + + urls_list = urls or [] + if file: + urls_list.extend([line.strip() for line in file if line.strip()]) + + if not urls_list and not transcript: + raise typer.BadParameter( + "No URLs or transcript provided. Use --url to specify URLs, --file to specify a file containing URLs, or --transcript for a transcript file." 
+ ) + + podcast = process_links( + urls_list, + transcript_file=transcript.name if transcript else None, + tts_model=tts_model, + generate_audio=not transcript_only, + config=config, + conversation_config=conversation_config + ) + + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + if transcript_only: + transcript_file = output_dir / f"transcript_{uuid.uuid4().hex}.txt" + podcast.export_transcript(str(transcript_file)) + typer.echo(f"Transcript generated successfully: {transcript_file}") + else: + audio_file = output_dir / f"podcast_{uuid.uuid4().hex}.mp3" + podcast.save(str(audio_file)) + transcript_file = output_dir / f"transcript_{uuid.uuid4().hex}.txt" + podcast.export_transcript(str(transcript_file)) + typer.echo(f"Podcast generated successfully using {tts_model} TTS model: {audio_file}") + typer.echo(f"Transcript saved to: {transcript_file}") + + except Exception as e: + typer.echo(f"An error occurred: {str(e)}", err=True) + raise typer.Exit(code=1) + +if __name__ == "__main__": + app() + +def generate_podcast( + urls: Optional[List[str]] = None, + url_file: Optional[str] = None, + transcript_file: Optional[str] = None, + tts_model: Optional[str] = None, + transcript_only: bool = False, + config: Optional[Dict[str, Any]] = None, + conversation_config: Optional[Dict[str, Any]] = None +) -> Podcast: + """ + Generate a podcast or transcript from a list of URLs, a file containing URLs, or a transcript file. + + Args: + urls (Optional[List[str]]): List of URLs to process. + url_file (Optional[str]): Path to a file containing URLs, one per line. + transcript_file (Optional[str]): Path to a transcript file. + tts_model (Optional[str]): TTS model to use ('openai' or 'elevenlabs'). + transcript_only (bool): Generate only a transcript without audio. Defaults to False. + config (Optional[Dict[str, Any]]): User-provided configuration dictionary. 
+ conversation_config (Optional[Dict[str, Any]]): User-provided conversation configuration dictionary. + + Returns: + Podcast: An instance of the Podcast class representing the generated podcast. + + Example: + >>> from podcastfy.client_v2 import generate_podcast + >>> podcast = generate_podcast( + ... urls=['https://example.com/article1', 'https://example.com/article2'], + ... tts_model='elevenlabs', + ... config={ + ... 'main': { + ... 'default_tts_model': 'elevenlabs' + ... }, + ... 'output_directories': { + ... 'audio': '/custom/path/to/audio', + ... 'transcripts': '/custom/path/to/transcripts' + ... } + ... }, + ... conversation_config={ + ... 'word_count': 150, + ... 'conversation_style': ['informal', 'friendly'], + ... 'podcast_name': 'My Custom Podcast' + ... } + ... ) + >>> podcast.save('/path/to/output.mp3') + >>> podcast.export_transcript('/path/to/transcript.txt') + """ + try: + default_config = load_config() + + if config: + if isinstance(config, dict): + updated_config = Config() + updated_config.configure(**config) + default_config = updated_config + elif isinstance(config, Config): + default_config = config + else: + raise ValueError("Config must be either a dictionary or a Config object") + + main_config = default_config.config.get('main', {}) + + if tts_model is None: + tts_model = main_config.get('default_tts_model', 'openai') + + urls_list = urls or [] + if url_file: + with open(url_file, 'r') as file: + urls_list.extend([line.strip() for line in file if line.strip()]) + + if not urls_list and not transcript_file: + raise ValueError( + "No URLs or transcript provided. Please provide either 'urls', 'url_file', or 'transcript_file'." 
+ ) + + podcast = process_links( + urls_list, + transcript_file=transcript_file, + tts_model=tts_model, + generate_audio=not transcript_only, + config=default_config, + conversation_config=conversation_config + ) + + return podcast + + except Exception as e: + logger.error(f"An error occurred: {str(e)}") + raise \ No newline at end of file diff --git a/podcastfy/core/__init__.py b/podcastfy/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/podcastfy/core/audio.py b/podcastfy/core/audio.py new file mode 100644 index 00000000..7d4c383e --- /dev/null +++ b/podcastfy/core/audio.py @@ -0,0 +1,91 @@ +import asyncio +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from typing import Optional, Dict, Union, List, cast + +from pydub import AudioSegment as PydubAudioSegment + +from podcastfy.core.podcast import SyncTTSBackend, AsyncTTSBackend +from podcastfy.core.transcript import TranscriptSegment, Transcript + + +class AudioSegment: + """Represents an audio segment of the podcast.""" + + def __init__(self, filepath: Path, length_ms: int, transcript_segment: Optional[TranscriptSegment] = None) -> None: + self.filepath = filepath + self.length_ms = length_ms + self.transcript_segment = transcript_segment + self._audio: Optional[PydubAudioSegment] = None + + @property + def audio(self) -> PydubAudioSegment: + """Lazy-load the audio segment.""" + if self._audio is None: + self._audio = PydubAudioSegment.from_file(self.filepath) + if len(self._audio) != self.length_ms: + raise ValueError( + f"Audio file length ({len(self._audio)}ms) does not match specified length ({self.length_ms}ms)") + return self._audio + + +class AudioManager: + def __init__(self, tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]], n_jobs: int = 1) -> None: + self.tts_backends = tts_backends + self.n_jobs = n_jobs + self.audio_segments = [] + self.final_audio: Optional[PydubAudioSegment] = None + self.temp_dir: Optional[Union[str, Path]] = 
None + + async def _async_build_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: + async def process_segment(segment: TranscriptSegment): + tts_backend = self.get_tts_backend(segment) + audio_file = await cast(AsyncTTSBackend, tts_backend).async_text_to_speech( + segment.text, + segment.speaker, + Path(self.temp_dir) / f"{segment.speaker.name}_{len(self.audio_segments)}.mp3" + ) + return AudioSegment(audio_file, len(PydubAudioSegment.from_file(str(audio_file))), segment) + + semaphore = asyncio.Semaphore(self.n_jobs) + + async def bounded_process_segment(segment): + async with semaphore: + return await process_segment(segment) + + tasks = [asyncio.create_task(bounded_process_segment(segment)) for segment in transcript.segments] + return list(await asyncio.gather(*tasks)) + + def get_tts_backend(self, segment): + if segment.speaker.preferred_tts is None: + # take the first available TTS backend + tts_backend = next(iter(self.tts_backends.values())) + else: + tts_backend = self.tts_backends[segment.speaker.preferred_tts] + # ensure the preferred TTS backend is available + if tts_backend is None: + raise ValueError(f"Preferred TTS backend '{segment.speaker.preferred_tts}' is not available for character '{segment.speaker.name}'") + return tts_backend + + def _sync_build_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: + def process_segment(segment: TranscriptSegment): + tts_backend = self.get_tts_backend(segment) + audio_file = cast(SyncTTSBackend, tts_backend).text_to_speech( + segment.text, + segment.speaker, + Path(str(self.temp_dir)) / f"{segment.speaker.name}_{len(self.audio_segments)}.mp3" + ) + return AudioSegment(audio_file, len(PydubAudioSegment.from_file(str(audio_file))), segment) + + + with ThreadPoolExecutor(max_workers=self.n_jobs) as executor: + return list(executor.map(process_segment, transcript.segments)) + + def create_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: + if 
all(isinstance(backend, AsyncTTSBackend) for backend in self.tts_backends.values()): + return asyncio.run(self._async_build_audio_segments(transcript)) + else: + return self._sync_build_audio_segments(transcript) + + def stitch_audio_segments(self) -> None: + self.final_audio = sum([segment.audio for segment in self.audio_segments]) diff --git a/podcastfy/character.py b/podcastfy/core/character.py similarity index 73% rename from podcastfy/character.py rename to podcastfy/core/character.py index f225ae4e..ad6cdc22 100644 --- a/podcastfy/character.py +++ b/podcastfy/core/character.py @@ -1,23 +1,18 @@ -from typing import Dict, Any, Optional +from typing import Dict, Optional -from pydantic import BaseModel +from podcastfy.core.tts_configs import TTSConfig -class VoiceConfig(BaseModel): - voice: str - extra_args: Dict[str, Any] - -class TTSConfig(VoiceConfig): - backend: str class Character: """Represents a character in the podcast.""" - def __init__(self, name: str, role: str, tts_configs: Dict[str, TTSConfig] = {}, default_description_for_llm: str = ""): + def __init__(self, name: str, role: str, tts_configs: Dict[str, TTSConfig] = {}, + default_description_for_llm: str = ""): self.name = name self.role = role self.tts_configs = tts_configs self.default_description_for_llm = default_description_for_llm - self.preferred_tts = next(iter(tts_configs.keys())) # Set first TTS as default + self.preferred_tts = next(iter(tts_configs.keys()), None) # Set first TTS as default, can be None def set_preferred_tts(self, tts_name: str): if tts_name not in self.tts_configs: @@ -26,6 +21,7 @@ def set_preferred_tts(self, tts_name: str): def to_prompt(self) -> str: """Convert the character information to a prompt for the LLM.""" + #TODO: could be improved by adding more information than roles return f"Character: {self.name}\nRole: {self.role}\n{self.default_description_for_llm.format(name=self.name)}" def get_tts_args(self, tts_name: Optional[str] = None) -> TTSConfig: diff --git 
a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py index 646fbdbf..06a5e47c 100644 --- a/podcastfy/core/podcast.py +++ b/podcastfy/core/podcast.py @@ -1,15 +1,19 @@ -import logging -from abc import ABC, abstractmethod from enum import Enum from pathlib import Path -from typing import List, Optional, Dict, Any, Callable, Tuple, Union, Sequence, Type, NamedTuple +from typing import List, Optional, Dict, Any, Callable, Tuple, Union, Sequence, cast +from tempfile import TemporaryDirectory +import atexit from pydub import AudioSegment as PydubAudioSegment from functools import wraps -import asyncio -from concurrent.futures import ThreadPoolExecutor, as_completed from contextlib import contextmanager -from podcastfy.character import Character, TTSConfig +from podcastfy.aiengines.llm.base import LLMBackend +from podcastfy.aiengines.tts.base import SyncTTSBackend, AsyncTTSBackend +from podcastfy.core.audio import AudioSegment, AudioManager +from podcastfy.core.character import Character +from podcastfy.core.transcript import TranscriptSegment, Transcript +from podcastfy.core.tts_configs import TTSConfig + class PodcastState(Enum): """Enum representing the different states of a podcast during creation.""" @@ -19,170 +23,26 @@ class PodcastState(Enum): STITCHED = 3 # Final state after all audio segments have been combined -class LLMBackend(ABC): - """Abstract base class for Language Model backends.""" - - @abstractmethod - def generate_text(self, prompt: str, characters: List['Character']) -> List[Tuple[Character, str]]: - """ - Generate text based on a given prompt. - - Args: - prompt (str): The input prompt for text generation. - - Returns: - List[Tuple[str, str]]: A list of tuples containing speaker and text. - """ - pass - - -class SyncTTSBackend(ABC): - """Protocol for synchronous Text-to-Speech backends.""" - - name: str - - @abstractmethod - def text_to_speech(self, text: str, character: 'Character') -> Path: - """ - Convert text to speech synchronously. 
- - Args: - text (str): The text to convert to speech. - character (Character): The character for which to generate speech. - - Returns: - Path: Path to the generated audio file. - """ - pass - - -class AsyncTTSBackend(ABC): - """Protocol for asynchronous Text-to-Speech backends.""" - - name: str - - @abstractmethod - async def async_text_to_speech(self, text: str, character: 'Character') -> Path: - """ - Convert text to speech asynchronously. - - Args: - text (str): The text to convert to speech. - character (Character): The character for which to generate speech. - - Returns: - Path: Path to the generated audio file. - """ - pass - - -class TranscriptSegment: - """Represents a segment of the podcast transcript.""" - - def __init__(self, text: str, speaker: Character, tts_args: Optional[Dict[str, Any]] = None): - self.text = text - self.speaker = speaker - self.tts_args = tts_args or {} - - -class Transcript: - """Represents the full transcript of a podcast.""" - - def __init__(self, segments: List[TranscriptSegment], metadata: Dict[str, Any]): - self.segments = segments - self.metadata = metadata - - def save(self, filepath: str, format: str = "plaintext"): - """Save the transcript to a file.""" - with open(filepath, 'w') as f: - f.write(str(self)) - - def __str__(self) -> str: - """Convert the transcript to a string representation.""" - lines = [] - for segment in self.segments: - lines.append(f"{segment.speaker.name}: {segment.text}") - - metadata_str = "\n".join([f"{key}: {value}" for key, value in self.metadata.items()]) - - return f"Metadata:\n{metadata_str}\n\nTranscript:\n" + "\n".join(lines) - - -class AudioSegment: - """Represents an audio segment of the podcast.""" - - def __init__(self, filepath: Path, length_ms: int, transcript_segment: Optional[TranscriptSegment] = None): - self.filepath = filepath - self.length_ms = length_ms - self.transcript_segment = transcript_segment - self._audio: Optional[PydubAudioSegment] = None - - @property - def 
audio(self) -> PydubAudioSegment: - """Lazy-load the audio segment.""" - if self._audio is None: - self._audio = PydubAudioSegment.from_file(self.filepath) - if len(self._audio) != self.length_ms: - raise ValueError( - f"Audio file length ({len(self._audio)}ms) does not match specified length ({self.length_ms}ms)") - return self._audio - - -class AudioManager: - def __init__(self, tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]], n_jobs: int = 1): - self.tts_backends = tts_backends - self.n_jobs = n_jobs - self.audio_segments = [] - self.final_audio = None - - async def _async_build_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: - async def process_segment(segment: TranscriptSegment): - tts_backend = self.tts_backends[segment.speaker.preferred_tts] - audio_file = await tts_backend.async_text_to_speech(segment.text, segment.speaker) - return AudioSegment(audio_file, len(PydubAudioSegment.from_file(audio_file)), segment) - - semaphore = asyncio.Semaphore(self.n_jobs) - - async def bounded_process_segment(segment): - async with semaphore: - return await process_segment(segment) - - tasks = [asyncio.create_task(bounded_process_segment(segment)) for segment in transcript.segments] - return await asyncio.gather(*tasks) - - def _sync_build_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: - def process_segment(segment: TranscriptSegment): - tts_backend = self.tts_backends[segment.speaker.preferred_tts] - audio_file = tts_backend.text_to_speech(segment.text, segment.speaker) - return AudioSegment(audio_file, len(PydubAudioSegment.from_file(audio_file)), segment) - - with ThreadPoolExecutor(max_workers=self.n_jobs) as executor: - return list(executor.map(process_segment, transcript.segments)) - - def create_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: - if any(isinstance(backend, AsyncTTSBackend) for backend in self.tts_backends.values()): - return 
asyncio.run(self._async_build_audio_segments(transcript)) - else: - return self._sync_build_audio_segments(transcript) - - def stitch_audio_segments(self): - self.final_audio = sum([segment.audio for segment in self.audio_segments]) - - def podcast_stage(func): """Decorator to manage podcast stage transitions.""" + @wraps(func) + def probably_same_func(method, func): + return method.__func__.__name__ == func.__name__ + @wraps(func) def wrapper(self, *args, **kwargs): current_method = self._next_stage_methods[self.state] - if current_method != func and not self._reworking: + print(f"Executing {func.__name__} in state {self.state.name}") + if not probably_same_func(current_method, func) and not self._reworking: print(f"Cannot execute {func.__name__} in current state {self.state.name}. Skipping.") - return + raise Exception(f"Cannot execute {func.__name__} in current state {self.state.name}") try: result = func(self, *args, **kwargs) - next_state = next((state for state, method in self._next_stage_methods.items() if method == func), None) + next_state = PodcastState(self.state.value + 1) self.state = next_state or self.state + print(f"Transitioned to state {self.state.name}") return result except Exception as e: print(f"Error in {func.__name__}: {str(e)}") @@ -195,15 +55,18 @@ class Podcast: """Main class for podcast creation and management.""" def __init__(self, content: str, llm_backend: LLMBackend, - tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], - characters: List[Character], default_tts_n_jobs: int = 1): + tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], audio_temp_dir: Optional[Union[str, Path]] = None, + characters: Optional[List[Character]] = None, + default_tts_n_jobs: int = 1) -> None: """ Initialize a new Podcast instance. Args: content (str): The raw content to be processed into a podcast. llm_backend (LLMBackend): The language model backend for generating the transcript. 
- tts_backends (Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]]): Dictionary of available TTS backends. + tts_backends (List[Union[SyncTTSBackend, AsyncTTSBackend]]): List of available TTS backends. + audio_temp_dir (Optional[str]): Path to a temporary directory for audio files. If None, a temporary + directory will be created. characters (List[Character]): List of characters participating in the podcast. default_tts_n_jobs (int, optional): The default number of concurrent jobs for TTS processing. Defaults to 1. @@ -213,17 +76,25 @@ def __init__(self, content: str, llm_backend: LLMBackend, """ self.content = content self.llm_backend = llm_backend - self.tts_backends = {backend.name: backend for backend in tts_backends} - self.characters = {char.name: char for char in characters} + self.tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]] = {backend.name: backend for backend in tts_backends} + self.characters: Dict[str, Character] = {char.name: char for char in (characters or [Character("Host", "Podcast host", {}), Character("Guest", "Expert guest", {})])} self.default_tts_n_jobs = default_tts_n_jobs self.state = PodcastState.INITIALIZED self._reworking = False + + if audio_temp_dir: + self.temp_dir = Path(audio_temp_dir) + else: + self._temp_dir = TemporaryDirectory() + self.temp_dir = Path(self._temp_dir.name) + atexit.register(self._temp_dir.cleanup) self.audio_manager = AudioManager(self.tts_backends, self.default_tts_n_jobs) + self.audio_manager.temp_dir = self.temp_dir # Initialize attributes with null values - self.transcript = None - self.audio_segments = [] - self.audio = None + self.transcript: Optional[Transcript] = None + self.audio_segments: List[AudioSegment] = [] + self.audio: Optional[PydubAudioSegment] = None # Define the sequence of methods to be called for each stage self._next_stage_methods: Dict[PodcastState, Callable[[], None]] = { @@ -232,10 +103,14 @@ def __init__(self, content: str, llm_backend: LLMBackend, 
PodcastState.AUDIO_SEGMENTS_BUILT: self.stitch_audio_segments, } + def __del__(self) -> None: + if hasattr(self, '_temp_dir'): + self._temp_dir.cleanup() + @classmethod def from_transcript(cls, transcript: Union[Sequence[Tuple[str, str]], Transcript], - tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], characters: List[Character], - default_tts_n_jobs: int = 1) -> 'Podcast': + tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], + characters: List[Character], default_tts_n_jobs: int = 1) -> 'Podcast': """ Create a Podcast instance from a pre-existing transcript. @@ -249,16 +124,16 @@ def from_transcript(cls, transcript: Union[Sequence[Tuple[str, str]], Transcript Returns: Podcast: A new Podcast instance with the transcript built and ready for audio generation. """ - podcast = cls("", None, list(tts_backends.values()), characters, default_tts_n_jobs=default_tts_n_jobs) if isinstance(transcript, Transcript): + podcast = cls("", cast(LLMBackend, None), tts_backends, characters=characters, default_tts_n_jobs=default_tts_n_jobs) podcast.transcript = transcript else: raise ValueError("Transcript must be a Transcript instance") # unimplemented podcast.state = PodcastState.TRANSCRIPT_BUILT return podcast - def reset_to_state(self, state: PodcastState): - """Reset the podcast to a specific state.""" + def reset_to_state(self, state: PodcastState) -> None: + """Reset the podcast to a specific state. 
""" self.state = state self.transcript = None if state.value < PodcastState.TRANSCRIPT_BUILT.value else self.transcript self.audio_segments = [] if state.value < PodcastState.AUDIO_SEGMENTS_BUILT.value else self.audio_segments @@ -270,6 +145,12 @@ def rework(self, target_state: PodcastState, auto_finalize: bool = True): original_state = self.state self._reworking = True + if target_state == PodcastState.INITIALIZED and self.llm_backend is None: + raise ValueError("Cannot rewind to INITIALIZED state without an LLM backend.") + + if target_state.value < PodcastState.TRANSCRIPT_BUILT.value and self.llm_backend is None: + raise ValueError("Cannot rewind past TRANSCRIPT_BUILT state without an LLM backend.") + if target_state.value < self.state.value: print(f"Rewinding from {self.state.name} to {target_state.name}") self.reset_to_state(target_state) @@ -289,17 +170,27 @@ def build_transcript(self) -> None: """Build the podcast transcript using the LLM backend.""" character_prompts = "\n\n".join([char.to_prompt() for char in self.characters.values()]) full_prompt = f"{self.content}\n\nCharacters:\n{character_prompts}" - generated_segments = self.llm_backend.generate_text(full_prompt, list(self.characters.values())) + generated_segments = self.llm_backend.generate_transcript(full_prompt, list(self.characters.values())) - segments = [TranscriptSegment(text, speaker, self.characters[speaker]) - for speaker, text in generated_segments if speaker in self.characters] + segments = [] + for segment in generated_segments: + if isinstance(segment, tuple) and len(segment) == 2: + speaker, text = segment + if speaker.name in self.characters: + tts_config = cast(Dict[str, Any], self.characters[speaker.name].tts_configs.get(self.characters[speaker.name].preferred_tts, {})) + segments.append(TranscriptSegment(text, self.characters[speaker.name], tts_config)) + # If the segment doesn't match the expected format, we'll skip it self.transcript = Transcript(segments, {"source": "Generated 
content"}) @podcast_stage - def build_audio_segments(self, n_jobs: Optional[int] = None) -> None: + def build_audio_segments(self) -> None: """Build audio segments from the transcript.""" - self.audio_segments = self.audio_manager.create_audio_segments(self.transcript) + if self.transcript is not None: + self.audio_segments = self.audio_manager.create_audio_segments(self.transcript) + else: + print("Error: Transcript is None") + raise ValueError("Transcript must be built before creating audio segments") @podcast_stage def stitch_audio_segments(self) -> None: @@ -330,16 +221,34 @@ def save(self, filepath: str) -> None: else: raise ValueError("No stitched audio to save") - def save_transcript(self, filepath: str, format: str = "plaintext") -> None: + def export_transcript(self, filepath: str, format_: str = "plaintext") -> None: """Save the podcast transcript to a file.""" if self.state < PodcastState.TRANSCRIPT_BUILT: raise ValueError("Transcript can only be saved after it is built") if self.transcript: - self.transcript.save(filepath, format) + self.transcript.export(filepath, format_) else: raise ValueError("No transcript to save") + def dump_transcript(self, filepath: str) -> None: + """Dump the podcast transcript to a JSON file.""" + if self.state < PodcastState.TRANSCRIPT_BUILT: + raise ValueError("Transcript can only be dumped after it is built") + + if self.transcript: + self.transcript.dump(filepath) + else: + raise ValueError("No transcript to dump") + + @classmethod + def load_transcript(cls, filepath: str, tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], + characters: List[Character]) -> 'Podcast': + """Load a podcast from a transcript JSON file.""" + character_dict = {char.name: char for char in characters} + transcript = Transcript.load(filepath, character_dict) + return cls.from_transcript(transcript, tts_backends, characters) + # Usage example: Step-by-step podcast creation if __name__ == "__main__": @@ -347,18 +256,18 @@ def 
save_transcript(self, filepath: str, format: str = "plaintext") -> None: class DummyLLMBackend(LLMBackend): - def generate_text(self, prompt: str, characters: List[Character]) -> List[Tuple[str, str]]: - return [("Host", "Welcome to our podcast!"), ("Guest", "Thanks for having me!")] + def generate_text(self, prompt: str, characters: List[Character]) -> List[Tuple[Character, str]]: + return [(characters[0], "Welcome to our podcast!"), (characters[1], "Thanks for having me!")] class DummyTTSBackend(SyncTTSBackend): def __init__(self, name: str): self.name = name - def text_to_speech(self, text: str, character: Character) -> Path: - with NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file: - PydubAudioSegment.silent(duration=1000).export(temp_file.name, format="mp3") - return Path(temp_file.name) + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + audio = PydubAudioSegment.silent(duration=1000) + audio.export(str(output_path), format="mp3") + return output_path # Define TTS backends @@ -366,22 +275,23 @@ def text_to_speech(self, text: str, character: Character) -> Path: elevenlabs_tts = DummyTTSBackend("elevenlabs") # Define TTS backends - - # Define characters host = Character( name="Host", role="Podcast host", tts_configs={ - "openai": {"voice": "en-US-Neural2-F", "backend": "openai", "extra_args": {"speaking_rate": 1.0}}, - "elevenlabs": {"voice": "Rachel", "backend": "elevenlabs", "extra_args": {"stability": 0.5}} + "openai": TTSConfig(voice="en-US-Neural2-F", backend="openai", extra_args={"speaking_rate": 1.0}), + "elevenlabs": TTSConfig(voice="Rachel", backend="elevenlabs", extra_args={"stability": 0.5}) }, default_description_for_llm="{name} is an enthusiastic podcast host. Speaks clearly and engagingly." 
) + guest = Character( name="Guest", role="Expert guest", - tts_configs={"openai": {"voice": "en-US-Neural2-D", "backend": "openai", "extra_args": {"pitch": -2.0}}, - "elevenlabs": {"voice": "Antoni", "backend": "elevenlabs", "extra_args": {"stability": 0.8}}}, + tts_configs={ + "openai": TTSConfig(voice="en-US-Neural2-D", backend="openai", extra_args={"pitch": -2.0}), + "elevenlabs": TTSConfig(voice="Antoni", backend="elevenlabs", extra_args={"stability": 0.8}) + }, default_description_for_llm="{name} is an expert guest. Shares knowledge in a friendly manner." ) @@ -415,7 +325,7 @@ def text_to_speech(self, text: str, character: Character) -> Path: with podcast.rework(PodcastState.TRANSCRIPT_BUILT): print(f"Inside rework context, state: {podcast.state}") podcast.transcript.segments.append( - TranscriptSegment("This is a new segment", "Host", podcast.characters["Host"])) + TranscriptSegment("This is a new segment", podcast.characters["Host"])) print("Added new segment to transcript") # Rebuild audio segments and stitch @@ -429,12 +339,12 @@ def text_to_speech(self, text: str, character: Character) -> Path: with podcast.rework(PodcastState.AUDIO_SEGMENTS_BUILT): new_segment = AudioSegment(Path(temp_file.name), 500, - TranscriptSegment("New audio segment", "Host", podcast.characters["Host"])) + TranscriptSegment("New audio segment", podcast.characters["Host"])) podcast.audio_segments.insert(0, new_segment) # Save the final podcast podcast.save("./final.mp3") - podcast.save_transcript("./final.txt", format="plaintext") + podcast.export_transcript("./final.txt", format_="plaintext") print("Saved podcast and transcript") # Example with pre-existing transcript using from_transcript class method diff --git a/podcastfy/core/transcript.py b/podcastfy/core/transcript.py new file mode 100644 index 00000000..952fa2be --- /dev/null +++ b/podcastfy/core/transcript.py @@ -0,0 +1,95 @@ +import json +import re +from typing import Optional, Dict, Any, List, Tuple + +from 
podcastfy.core.character import Character + + +class TranscriptSegment: + def __init__(self, text: str, speaker: Character, tts_args: Optional[Dict[str, Any]] = None) -> None: + self.text = text + self.speaker = speaker + self.tts_args = tts_args or {} + + def to_dict(self) -> Dict[str, Any]: + return { + "text": self.text, + "speaker": self.speaker.name, + "tts_args": self.tts_args + } + + @classmethod + def from_dict(cls, data: Dict[str, Any], characters: Dict[str, Character]) -> 'TranscriptSegment': + return cls( + text=data['text'], + speaker=characters[data['speaker']], + tts_args=data.get('tts_args', {}) + ) + + +class Transcript: + def __init__(self, segments: List[TranscriptSegment], metadata: Dict[str, Any]) -> None: + self.segments = segments + self.metadata = metadata + + def export(self, filepath: str, format_: str = "plaintext") -> None: + """Export the transcript to a file.""" + with open(filepath, 'w') as f: + if format_ == "plaintext": + f.write(str(self)) + elif format_ == "json": + json.dump(self.to_dict(), f, indent=2) + else: + raise ValueError(f"Unsupported format: {format_}") + + def dump(self, filepath: str) -> None: + """Dump the transcript to a JSON file.""" + with open(filepath, 'w') as f: + json.dump(self.to_dict(), f, indent=2) + + @staticmethod + def _parse_legacy_transcript(content: str) -> List[Tuple[str, str]]: + pattern = r'\s*(.*?)\s*' + matches = re.findall(pattern, content, re.DOTALL) + return [('Person' + person_num, text) for person_num, text in matches] + + @classmethod + def load(cls, filepath: str, characters: Dict[str, Character]) -> 'Transcript': + """Load a transcript from a JSON file.""" + with open(filepath, 'r') as f: + content = f.read() + + try: + data = json.loads(content) + segments = [TranscriptSegment.from_dict(seg, characters) for seg in data['segments']] + except json.JSONDecodeError: + # If JSON parsing fails, assume it's a legacy transcript + parsed_content = cls._parse_legacy_transcript(content) + segments = 
[] + for speaker, text in parsed_content: + if speaker in characters: + character = characters[speaker] + else: + # Create a new character if it doesn't exist + character = Character(speaker, f"Character {speaker}", {}) + characters[speaker] = character + segments.append(TranscriptSegment(text, character)) + + data = {'segments': segments, 'metadata': {}} + return cls(segments, data['metadata']) + + def to_dict(self) -> Dict[str, Any]: + return { + "segments": [segment.to_dict() for segment in self.segments], + "metadata": self.metadata + } + + def __str__(self) -> str: + """Convert the transcript to a string representation.""" + lines = [] + for segment in self.segments: + lines.append(f"{segment.speaker.name}: {segment.text}") + + metadata_str = "\n".join([f"{key}: {value}" for key, value in self.metadata.items()]) + + return f"Metadata:\n{metadata_str}\n\nTranscript:\n" + "\n".join(lines) diff --git a/podcastfy/core/tts_configs.py b/podcastfy/core/tts_configs.py new file mode 100644 index 00000000..c46ed25c --- /dev/null +++ b/podcastfy/core/tts_configs.py @@ -0,0 +1,12 @@ +from typing import Dict, Any + +from pydantic import BaseModel + + +class VoiceConfig(BaseModel): + voice: str + extra_args: Dict[str, Any] = {} + + +class TTSConfig(VoiceConfig): + backend: str diff --git a/podcastfy/tts_backends.py b/podcastfy/tts_backends.py deleted file mode 100644 index dc53859a..00000000 --- a/podcastfy/tts_backends.py +++ /dev/null @@ -1,135 +0,0 @@ -import os -import uuid -from abc import ABC, abstractmethod -from pathlib import Path -from tempfile import TemporaryFile, TemporaryDirectory -from typing import Dict, Any, List, ClassVar -import asyncio - -import openai - -from podcastfy.character import Character, VoiceConfig -import edge_tts -from elevenlabs import client as elevenlabs_client - -class TTSBackend(ABC): - name: ClassVar[str] = "" - default_voices: ClassVar[List[VoiceConfig]] = [] - - @classmethod - def set_default_voices(cls, voices: List[VoiceConfig]): - 
""" - Set the default voices for the TTS backend. - """ - cls.default_voices = voices - - @abstractmethod - def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: - """ - Convert text to speech. - - Args: - text (str): The text to convert to speech. - character (Character): The character for which to generate speech. - output_path (Path): The path where the audio file should be saved. - - Returns: - Path: Path to the generated audio file (same as output_path). - """ - pass - -class ElevenLabsTTS(TTSBackend): - name: str = "elevenlabs" - - def __init__(self, api_key: str = None): - self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY") - - def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: - # TODO, would be nicer to get a filepath directly from the client - config = character.get_tts_args('elevenlabs') - client = elevenlabs_client.ElevenLabs(api_key=self.api_key) # # client could be reused - content = client.generate( - text=text, - voice=config.voice, - model=config.extra_args.get('model', 'default') - ) - with open(output_path, "wb") as out: - for chunk in content: - if chunk: - out.write(chunk) - return output_path - -class OpenAITTS(TTSBackend): - name: str = "openai" - def __init__(self, api_key: str): - self.api_key = api_key or os.getenv("OPENAI_API_KEY") - - def ensure_characters_tts_config_is_valid(self, character:Character) -> None: - # TODO: maybe that should be in the ABC class - tts_config = character.tts_configs.get('openai') - if not tts_config: - raise ValueError(f"Character '{character.name}' does not have OpenAI TTS configuration") - # ensure there is a key model in the extra_args - if 'model' not in tts_config.extra_args: - raise ValueError(f"Character '{character.name}' does not have the 'model' key in the OpenAI TTS configuration") - - - def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: - # TODO, would be nicer to get a filepath directly 
from the client. If not given takes tempdir from the config ? - self.ensure_characters_tts_config_is_valid(character) - # Placeholder for OpenAI TTS implementation - config = character.get_tts_args('openai') - - print(f"OpenAI TTS: Converting text to speech for character {character.name} with voice {config.voice}") - response = openai.audio.speech.create( - model=config.extra_args["model"], - voice=config.voice, - input=text - ) - with open(output_path, "wb") as file: - file.write(response.content) - return output_path - -class EdgeTTS(TTSBackend): - name: str = "edge-tts" - - - def __init__(self): - pass - - def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: - config = character.get_tts_args('edge-tts') - - async def edge_tts_conversion(text: str, output_path: str, voice: str): - communicate = edge_tts.Communicate(text, voice) - await communicate.save(output_path) - - asyncio.run(edge_tts_conversion(text, str(output_path), config.voice)) - - return output_path - - - def ensure_characters_tts_config_is_valid(self, character: Character) -> None: - tts_config = character.tts_configs.get('edge-tts') - if not tts_config: - raise ValueError(f"Character '{character.name}' does not have Edge TTS configuration") - -# Example usage: -if __name__ == "__main__": - from podcastfy.utils.config import load_config - - config = load_config() - elevenlabs_tts = ElevenLabsTTS(config.ELEVENLABS_API_KEY, config.get('text_to_speech', {}).get('elevenlabs', {})) - openai_tts = OpenAITTS(config.OPENAI_API_KEY, config.get('text_to_speech', {}).get('openai', {})) - # edge_tts = EdgeTTS() - - dummy_character = Character("John", "host", { - 'elevenlabs': {'voice': 'en-US-JohnNeural'}, - 'openai': {'voice': 'en-US-Neural2-C'}, - 'edge-tts': {'voice': 'en-US-ChristopherNeural'} - }, "A friendly podcast host") - - output_dir = Path("output") - output_dir.mkdir(exist_ok=True) - output_path = output_dir / f"{dummy_character.name}_{uuid.uuid4().hex}.mp3" - 
elevenlabs_tts.text_to_speech("Hello, welcome to the podcast!", dummy_character, output_path) From 7b625c5614e12e2499f2521d70c6178fc00fa27c Mon Sep 17 00:00:00 2001 From: bruno Date: Tue, 15 Oct 2024 16:09:54 +0200 Subject: [PATCH 05/49] black and one renaming --- ...emini_langchain.py => gemini_langchain.py} | 0 podcastfy/client.py | 246 ++++---- podcastfy/client_v2.py | 100 +++- podcastfy/core/audio.py | 2 +- podcastfy/text_to_speech.py | 523 +++++++++--------- tests/test_content_parser.py | 2 +- tests/test_genai_podcast.py | 26 +- tests/test_generate_podcast.py | 173 +++--- 8 files changed, 586 insertions(+), 486 deletions(-) rename podcastfy/aiengines/llm/{legacy_gemini_langchain.py => gemini_langchain.py} (100%) diff --git a/podcastfy/aiengines/llm/legacy_gemini_langchain.py b/podcastfy/aiengines/llm/gemini_langchain.py similarity index 100% rename from podcastfy/aiengines/llm/legacy_gemini_langchain.py rename to podcastfy/aiengines/llm/gemini_langchain.py diff --git a/podcastfy/client.py b/podcastfy/client.py index b5b297cc..a1b6c727 100644 --- a/podcastfy/client.py +++ b/podcastfy/client.py @@ -12,7 +12,10 @@ from podcastfy.content_generator import ContentGenerator from podcastfy.text_to_speech import TextToSpeech from podcastfy.utils.config import Config, load_config -from podcastfy.utils.config_conversation import ConversationConfig, load_conversation_config +from podcastfy.utils.config_conversation import ( + ConversationConfig, + load_conversation_config, +) from podcastfy.utils.logger import setup_logger from typing import List, Optional, Dict, Any import copy @@ -23,8 +26,14 @@ app = typer.Typer() -def process_links(links, transcript_file=None, tts_model="openai", generate_audio=True, config=None, - conversation_config: Optional[Dict[str, Any]] = None): +def process_links( + links, + transcript_file=None, + tts_model="openai", + generate_audio=True, + config=None, + conversation_config: Optional[Dict[str, Any]] = None, +): """ Process a list of links 
or a transcript file to generate a podcast or transcript. @@ -49,8 +58,9 @@ def process_links(links, transcript_file=None, tts_model="openai", generate_audi else: logger.info(f"Processing {len(links)} links") content_extractor = ContentExtractor(config.JINA_API_KEY) - content_generator = ContentGenerator(api_key=config.GEMINI_API_KEY, - conversation_config=conversation_config) + content_generator = ContentGenerator( + api_key=config.GEMINI_API_KEY, conversation_config=conversation_config + ) # Extract content from links contents = [content_extractor.extract_content(link) for link in links] @@ -60,7 +70,9 @@ def process_links(links, transcript_file=None, tts_model="openai", generate_audi # Generate Q&A content random_filename = f"transcript_{uuid.uuid4().hex}.txt" - transcript_filepath = os.path.join(config.get('output_directories')['transcripts'], random_filename) + transcript_filepath = os.path.join( + config.get("output_directories")["transcripts"], random_filename + ) qa_content = content_generator.generate_qa_content( combined_content, output_filepath=transcript_filepath ) @@ -71,7 +83,9 @@ def process_links(links, transcript_file=None, tts_model="openai", generate_audi ) # Convert text to speech using the specified model random_filename = f"podcast_{uuid.uuid4().hex}.mp3" - audio_file = os.path.join(config.get('output_directories')['audio'], random_filename) + audio_file = os.path.join( + config.get("output_directories")["audio"], random_filename + ) text_to_speech.convert_to_speech(qa_content, audio_file) logger.info(f"Podcast generated successfully using {tts_model} TTS model") return audio_file @@ -100,8 +114,10 @@ def main( False, "--transcript-only", help="Generate only a transcript without audio" ), conversation_config: str = typer.Option( - None, "--conversation-config", "-cc", - help="Path to custom conversation configuration YAML file" + None, + "--conversation-config", + "-cc", + help="Path to custom conversation configuration YAML file", ), ): """ 
@@ -110,11 +126,10 @@ def main( try: config = load_config() - main_config = config.get('main', {}) + main_config = config.get("main", {}) # Use default TTS model from config if not specified if tts_model is None: - tts_model = main_config.get('default_tts_model', 'openai') - + tts_model = main_config.get("default_tts_model", "openai") if transcript: final_output = process_links( @@ -123,7 +138,7 @@ def main( tts_model=tts_model, generate_audio=not transcript_only, conversation_config=conversation_config, - config=config + config=config, ) else: urls_list = urls or [] @@ -136,11 +151,11 @@ def main( ) final_output = process_links( - urls_list, - tts_model=tts_model, + urls_list, + tts_model=tts_model, generate_audio=not transcript_only, config=config, - conversation_config=conversation_config + conversation_config=conversation_config, ) if transcript_only: @@ -157,106 +172,107 @@ def main( if __name__ == "__main__": app() - def generate_podcast( - urls: Optional[List[str]] = None, - url_file: Optional[str] = None, - transcript_file: Optional[str] = None, - tts_model: Optional[str] = None, - transcript_only: bool = False, - config: Optional[Dict[str, Any]] = None, - conversation_config: Optional[Dict[str, Any]] = None + urls: Optional[List[str]] = None, + url_file: Optional[str] = None, + transcript_file: Optional[str] = None, + tts_model: Optional[str] = None, + transcript_only: bool = False, + config: Optional[Dict[str, Any]] = None, + conversation_config: Optional[Dict[str, Any]] = None, ) -> Optional[str]: - """ - Generate a podcast or transcript from a list of URLs, a file containing URLs, or a transcript file. - - Args: - urls (Optional[List[str]]): List of URLs to process. - url_file (Optional[str]): Path to a file containing URLs, one per line. - transcript_file (Optional[str]): Path to a transcript file. - tts_model (Optional[str]): TTS model to use ('openai' or 'elevenlabs'). - transcript_only (bool): Generate only a transcript without audio. 
Defaults to False. - config (Optional[Dict[str, Any]]): User-provided configuration dictionary. - conversation_config (Optional[Dict[str, Any]]): User-provided conversation configuration dictionary. - - Returns: - Optional[str]: Path to the final podcast audio file, or None if only generating a transcript. - - Example: - >>> from podcastfy.client import generate_podcast - >>> result = generate_podcast( - ... urls=['https://example.com/article1', 'https://example.com/article2'], - ... tts_model='elevenlabs', - ... config={ - ... 'main': { - ... 'default_tts_model': 'elevenlabs' - ... }, - ... 'output_directories': { - ... 'audio': '/custom/path/to/audio', - ... 'transcripts': '/custom/path/to/transcripts' - ... } - ... }, - ... conversation_config={ - ... 'word_count': 150, - ... 'conversation_style': ['informal', 'friendly'], - ... 'podcast_name': 'My Custom Podcast' - ... } - ... ) - """ - try: - # Load default config - default_config = load_config() - - # Update config if provided - if config: - if isinstance(config, dict): - # Create a deep copy of the default config - updated_config = copy.deepcopy(default_config) - # Update the copy with user-provided values - updated_config.configure(**config) - default_config = updated_config - elif isinstance(config, Config): - # If it's already a Config object, use it directly - default_config = config - else: - raise ValueError("Config must be either a dictionary or a Config object") - - main_config = default_config.config.get('main', {}) - - # Use provided tts_model if specified, otherwise use the one from config - if tts_model is None: - tts_model = main_config.get('default_tts_model', 'openai') - - if transcript_file: - return process_links( - [], - transcript_file=transcript_file, - tts_model=tts_model, - generate_audio=not transcript_only, - config=default_config, - conversation_config=conversation_config - ) - else: - urls_list = urls or [] - if url_file: - with open(url_file, 'r') as file: - 
urls_list.extend([line.strip() for line in file if line.strip()]) - - if not urls_list: - raise ValueError( - "No URLs provided. Please provide either 'urls', 'url_file', or 'transcript_file'." - ) - - return process_links( - urls_list, - tts_model=tts_model, - generate_audio=not transcript_only, - config=default_config, - conversation_config=conversation_config - ) - - except Exception as e: - logger.error(f"An error occurred: {str(e)}") - raise \ No newline at end of file + """ + Generate a podcast or transcript from a list of URLs, a file containing URLs, or a transcript file. + + Args: + urls (Optional[List[str]]): List of URLs to process. + url_file (Optional[str]): Path to a file containing URLs, one per line. + transcript_file (Optional[str]): Path to a transcript file. + tts_model (Optional[str]): TTS model to use ('openai' or 'elevenlabs'). + transcript_only (bool): Generate only a transcript without audio. Defaults to False. + config (Optional[Dict[str, Any]]): User-provided configuration dictionary. + conversation_config (Optional[Dict[str, Any]]): User-provided conversation configuration dictionary. + + Returns: + Optional[str]: Path to the final podcast audio file, or None if only generating a transcript. + + Example: + >>> from podcastfy.client import generate_podcast + >>> result = generate_podcast( + ... urls=['https://example.com/article1', 'https://example.com/article2'], + ... tts_model='elevenlabs', + ... config={ + ... 'main': { + ... 'default_tts_model': 'elevenlabs' + ... }, + ... 'output_directories': { + ... 'audio': '/custom/path/to/audio', + ... 'transcripts': '/custom/path/to/transcripts' + ... } + ... }, + ... conversation_config={ + ... 'word_count': 150, + ... 'conversation_style': ['informal', 'friendly'], + ... 'podcast_name': 'My Custom Podcast' + ... } + ... 
) + """ + try: + # Load default config + default_config = load_config() + + # Update config if provided + if config: + if isinstance(config, dict): + # Create a deep copy of the default config + updated_config = copy.deepcopy(default_config) + # Update the copy with user-provided values + updated_config.configure(**config) + default_config = updated_config + elif isinstance(config, Config): + # If it's already a Config object, use it directly + default_config = config + else: + raise ValueError( + "Config must be either a dictionary or a Config object" + ) + + main_config = default_config.config.get("main", {}) + + # Use provided tts_model if specified, otherwise use the one from config + if tts_model is None: + tts_model = main_config.get("default_tts_model", "openai") + + if transcript_file: + return process_links( + [], + transcript_file=transcript_file, + tts_model=tts_model, + generate_audio=not transcript_only, + config=default_config, + conversation_config=conversation_config, + ) + else: + urls_list = urls or [] + if url_file: + with open(url_file, "r") as file: + urls_list.extend([line.strip() for line in file if line.strip()]) + + if not urls_list: + raise ValueError( + "No URLs provided. Please provide either 'urls', 'url_file', or 'transcript_file'." 
+ ) + + return process_links( + urls_list, + tts_model=tts_model, + generate_audio=not transcript_only, + config=default_config, + conversation_config=conversation_config, + ) + + except Exception as e: + logger.error(f"An error occurred: {str(e)}") + raise diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index 81fdd4fc..ea502d6d 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -4,7 +4,7 @@ from pathlib import Path from typing import List, Optional, Dict, Any, Union -from podcastfy.aiengines.llm.legacy_gemini_langchain import DefaultPodcastifyTranscriptEngine +from podcastfy.aiengines.llm.gemini_langchain import DefaultPodcastifyTranscriptEngine from podcastfy.aiengines.tts.tts_backends import OpenAITTS, ElevenLabsTTS, EdgeTTS from podcastfy.core.character import Character from podcastfy.core.podcast import Podcast, SyncTTSBackend, AsyncTTSBackend @@ -18,43 +18,62 @@ app = typer.Typer() + def create_characters(config: Dict[str, Any]) -> List[Character]: host = Character( name="Host", role="Podcast host", tts_configs={ - "openai": TTSConfig(voice=config["text_to_speech"]["openai"]["default_voices"]["question"], backend="openai"), - "elevenlabs": TTSConfig(voice=config["text_to_speech"]["elevenlabs"]["default_voices"]["question"], backend="elevenlabs"), + "openai": TTSConfig( + voice=config["text_to_speech"]["openai"]["default_voices"]["question"], + backend="openai", + ), + "elevenlabs": TTSConfig( + voice=config["text_to_speech"]["elevenlabs"]["default_voices"][ + "question" + ], + backend="elevenlabs", + ), }, - default_description_for_llm="{name} is an enthusiastic podcast host. Speaks clearly and engagingly." + default_description_for_llm="{name} is an enthusiastic podcast host. 
Speaks clearly and engagingly.", ) guest = Character( name="Guest", role="Expert guest", tts_configs={ - "openai": TTSConfig(voice=config["text_to_speech"]["openai"]["default_voices"]["answer"], backend="openai"), - "elevenlabs": TTSConfig(voice=config["text_to_speech"]["elevenlabs"]["default_voices"]["answer"], backend="elevenlabs"), + "openai": TTSConfig( + voice=config["text_to_speech"]["openai"]["default_voices"]["answer"], + backend="openai", + ), + "elevenlabs": TTSConfig( + voice=config["text_to_speech"]["elevenlabs"]["default_voices"][ + "answer" + ], + backend="elevenlabs", + ), }, - default_description_for_llm="{name} is an expert guest. Shares knowledge in a friendly manner." + default_description_for_llm="{name} is an expert guest. Shares knowledge in a friendly manner.", ) return [host, guest] + def create_tts_backends(config: Config) -> List[Union[SyncTTSBackend, AsyncTTSBackend]]: return [ OpenAITTS(api_key=config.OPENAI_API_KEY), ElevenLabsTTS(api_key=config.ELEVENLABS_API_KEY), - EdgeTTS() + EdgeTTS(), ] + def process_links( links: List[str], transcript_file: Optional[str] = None, tts_model: str = "openai", # could be removed now ? 
generate_audio: bool = True, config: Optional[Config] = None, - conversation_config: Optional[Dict[str, Any]] = None + conversation_config: Optional[Dict[str, Any]] = None, ) -> Podcast: if config is None: config = load_config() @@ -62,12 +81,16 @@ def process_links( tts_backends = create_tts_backends(config) if transcript_file: logger.info(f"Using transcript file: {transcript_file}") - transcript = Transcript.load(transcript_file, {char.name: char for char in characters}) + transcript = Transcript.load( + transcript_file, {char.name: char for char in characters} + ) podcast = Podcast.from_transcript(transcript, tts_backends, characters) else: logger.info(f"Processing {len(links)} links") content_extractor = ContentExtractor(config.JINA_API_KEY) - content_generator = DefaultPodcastifyTranscriptEngine(config.GEMINI_API_KEY, conversation_config) + content_generator = DefaultPodcastifyTranscriptEngine( + config.GEMINI_API_KEY, conversation_config + ) contents = [content_extractor.extract_content(link) for link in links] combined_content = "\n\n".join(contents) @@ -92,21 +115,36 @@ def process_links( @app.command() def main( urls: List[str] = typer.Option(None, "--url", "-u", help="URLs to process"), - file: typer.FileText = typer.Option(None, "--file", "-f", help="File containing URLs, one per line"), - transcript: typer.FileText = typer.Option(None, "--transcript", "-t", help="Path to a transcript file"), - tts_model: str = typer.Option(None, "--tts-model", "-tts", help="TTS model to use (openai or elevenlabs)"), - transcript_only: bool = typer.Option(False, "--transcript-only", help="Generate only a transcript without audio"), - conversation_config: str = typer.Option(None, "--conversation-config", "-cc", help="Path to custom conversation configuration YAML file"), - output_dir: str = typer.Option("./output", "--output-dir", "-o", help="Directory to save output files"), + file: typer.FileText = typer.Option( + None, "--file", "-f", help="File containing URLs, one 
per line" + ), + transcript: typer.FileText = typer.Option( + None, "--transcript", "-t", help="Path to a transcript file" + ), + tts_model: str = typer.Option( + None, "--tts-model", "-tts", help="TTS model to use (openai or elevenlabs)" + ), + transcript_only: bool = typer.Option( + False, "--transcript-only", help="Generate only a transcript without audio" + ), + conversation_config: str = typer.Option( + None, + "--conversation-config", + "-cc", + help="Path to custom conversation configuration YAML file", + ), + output_dir: str = typer.Option( + "./output", "--output-dir", "-o", help="Directory to save output files" + ), ): """ Generate a podcast or transcript from a list of URLs, a file containing URLs, or a transcript file. """ try: config = load_config() - main_config = config.config.get('main', {}) + main_config = config.config.get("main", {}) if tts_model is None: - tts_model = main_config.get('default_tts_model', 'openai') + tts_model = main_config.get("default_tts_model", "openai") urls_list = urls or [] if file: @@ -123,7 +161,7 @@ def main( tts_model=tts_model, generate_audio=not transcript_only, config=config, - conversation_config=conversation_config + conversation_config=conversation_config, ) output_dir = Path(output_dir) @@ -138,16 +176,20 @@ def main( podcast.save(str(audio_file)) transcript_file = output_dir / f"transcript_{uuid.uuid4().hex}.txt" podcast.export_transcript(str(transcript_file)) - typer.echo(f"Podcast generated successfully using {tts_model} TTS model: {audio_file}") + typer.echo( + f"Podcast generated successfully using {tts_model} TTS model: {audio_file}" + ) typer.echo(f"Transcript saved to: {transcript_file}") except Exception as e: typer.echo(f"An error occurred: {str(e)}", err=True) raise typer.Exit(code=1) + if __name__ == "__main__": app() + def generate_podcast( urls: Optional[List[str]] = None, url_file: Optional[str] = None, @@ -155,7 +197,7 @@ def generate_podcast( tts_model: Optional[str] = None, transcript_only: 
bool = False, config: Optional[Dict[str, Any]] = None, - conversation_config: Optional[Dict[str, Any]] = None + conversation_config: Optional[Dict[str, Any]] = None, ) -> Podcast: """ Generate a podcast or transcript from a list of URLs, a file containing URLs, or a transcript file. @@ -206,16 +248,18 @@ def generate_podcast( elif isinstance(config, Config): default_config = config else: - raise ValueError("Config must be either a dictionary or a Config object") + raise ValueError( + "Config must be either a dictionary or a Config object" + ) - main_config = default_config.config.get('main', {}) + main_config = default_config.config.get("main", {}) if tts_model is None: - tts_model = main_config.get('default_tts_model', 'openai') + tts_model = main_config.get("default_tts_model", "openai") urls_list = urls or [] if url_file: - with open(url_file, 'r') as file: + with open(url_file, "r") as file: urls_list.extend([line.strip() for line in file if line.strip()]) if not urls_list and not transcript_file: @@ -229,11 +273,11 @@ def generate_podcast( tts_model=tts_model, generate_audio=not transcript_only, config=default_config, - conversation_config=conversation_config + conversation_config=conversation_config, ) return podcast except Exception as e: logger.error(f"An error occurred: {str(e)}") - raise \ No newline at end of file + raise diff --git a/podcastfy/core/audio.py b/podcastfy/core/audio.py index 7d4c383e..9b422faf 100644 --- a/podcastfy/core/audio.py +++ b/podcastfy/core/audio.py @@ -30,7 +30,7 @@ def audio(self) -> PydubAudioSegment: class AudioManager: - def __init__(self, tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]], n_jobs: int = 1) -> None: + def __init__(self, tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]], n_jobs: int = 4) -> None: self.tts_backends = tts_backends self.n_jobs = n_jobs self.audio_segments = [] diff --git a/podcastfy/text_to_speech.py b/podcastfy/text_to_speech.py index 6e109b44..774e80b6 100644 --- 
a/podcastfy/text_to_speech.py +++ b/podcastfy/text_to_speech.py @@ -17,259 +17,280 @@ logger = logging.getLogger(__name__) + class TextToSpeech: - def __init__(self, model: str = 'openai', api_key: Optional[str] = None): - """ - Initialize the TextToSpeech class. - - Args: - model (str): The model to use for text-to-speech conversion. - Options are 'elevenlabs' or 'openai'. Defaults to 'openai'. - api_key (Optional[str]): API key for the selected text-to-speech service. - If not provided, it will be loaded from the config. - """ - self.model = model.lower() - self.config = load_config() - self.tts_config = self.config.get('text_to_speech') - - if self.model == 'elevenlabs': - self.api_key = api_key or self.config.ELEVENLABS_API_KEY - self.client = elevenlabs_client.ElevenLabs(api_key=self.api_key) - elif self.model == 'openai': - self.api_key = api_key or self.config.OPENAI_API_KEY - openai.api_key = self.api_key - else: - raise ValueError("Invalid model. Choose 'elevenlabs' or 'openai'.") - - self.audio_format = self.tts_config['audio_format'] - self.temp_audio_dir = self.tts_config['temp_audio_dir'] - self.ending_message = self.tts_config['ending_message'] - - # Create temp_audio_dir if it doesn't exist - if not os.path.exists(self.temp_audio_dir): - os.makedirs(self.temp_audio_dir) - - def __merge_audio_files(self, input_dir: str, output_file: str) -> None: - """ - Merge all audio files in the input directory sequentially and save the result. - - Args: - input_dir (str): Path to the directory containing audio files. - output_file (str): Path to save the merged audio file. 
- """ - try: - # Function to sort filenames naturally - def natural_sort_key(filename: str) -> List[Union[int, str]]: - return [int(text) if text.isdigit() else text for text in re.split(r'(\d+)', filename)] - - combined = AudioSegment.empty() - audio_files = sorted( - [f for f in os.listdir(input_dir) if f.endswith(f".{self.audio_format}")], - key=natural_sort_key - ) - for file in audio_files: - if file.endswith(f".{self.audio_format}"): - file_path = os.path.join(input_dir, file) - combined += AudioSegment.from_file(file_path, format=self.audio_format) - - combined.export(output_file, format=self.audio_format) - logger.info(f"Merged audio saved to {output_file}") - except Exception as e: - logger.error(f"Error merging audio files: {str(e)}") - raise - - def convert_to_speech(self, text: str, output_file: str) -> None: - """ - Convert input text to speech and save as an audio file. - - Args: - text (str): Input text to convert to speech. - output_file (str): Path to save the output audio file. - - Raises: - Exception: If there's an error in converting text to speech. 
- """ - # Clean TSS markup tags from the input text - cleaned_text = self.clean_tss_markup(text) - - if self.model == 'elevenlabs': - self.__convert_to_speech_elevenlabs(cleaned_text, output_file) - elif self.model == 'openai': - self.__convert_to_speech_openai(cleaned_text, output_file) - - def __convert_to_speech_elevenlabs(self, text: str, output_file: str) -> None: - try: - qa_pairs = self.split_qa(text) - audio_files = [] - counter = 0 - for question, answer in qa_pairs: - question_audio = self.client.generate( - text=question, - voice=self.tts_config['elevenlabs']['default_voices']['question'], - model=self.tts_config['elevenlabs']['model'] - ) - answer_audio = self.client.generate( - text=answer, - voice=self.tts_config['elevenlabs']['default_voices']['answer'], - model=self.tts_config['elevenlabs']['model'] - ) - - # Save question and answer audio chunks - for audio in [question_audio, answer_audio]: - counter += 1 - file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}" - with open(file_name, "wb") as out: - for chunk in audio: - if chunk: - out.write(chunk) - audio_files.append(file_name) - - # Merge all audio files and save the result - self.__merge_audio_files(self.temp_audio_dir, output_file) - - # Clean up individual audio files - for file in audio_files: - os.remove(file) - - logger.info(f"Audio saved to {output_file}") - - except Exception as e: - logger.error(f"Error converting text to speech with ElevenLabs: {str(e)}") - raise - - def __convert_to_speech_openai(self, text: str, output_file: str) -> None: - try: - qa_pairs = self.split_qa(text) - print(qa_pairs) - audio_files = [] - counter = 0 - for question, answer in qa_pairs: - for speaker, content in [ - (self.tts_config['openai']['default_voices']['question'], question), - (self.tts_config['openai']['default_voices']['answer'], answer) - ]: - counter += 1 - file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}" - response = openai.audio.speech.create( - 
model=self.tts_config['openai']['model'], - voice=speaker, - input=content - ) - with open(file_name, "wb") as file: - file.write(response.content) - - audio_files.append(file_name) - - # Merge all audio files and save the result - self.__merge_audio_files(self.temp_audio_dir, output_file) - - # Clean up individual audio files - for file in audio_files: - os.remove(file) - - logger.info(f"Audio saved to {output_file}") - - except Exception as e: - logger.error(f"Error converting text to speech with OpenAI: {str(e)}") - raise - - def split_qa(self, input_text: str) -> List[Tuple[str, str]]: - """ - Split the input text into question-answer pairs. - - Args: - input_text (str): The input text containing Person1 and Person2 dialogues. - - Returns: - List[Tuple[str, str]]: A list of tuples containing (Person1, Person2) dialogues. - """ - # Add ending message to the end of input_text - input_text += f"{self.ending_message}" - - # Regular expression pattern to match Person1 and Person2 dialogues - pattern = r'(.*?)\s*(.*?)' - - # Find all matches in the input text - matches = re.findall(pattern, input_text, re.DOTALL) - - # Process the matches to remove extra whitespace and newlines - processed_matches = [ - ( - ' '.join(person1.split()).strip(), - ' '.join(person2.split()).strip() - ) - for person1, person2 in matches - ] - return processed_matches - - def clean_tss_markup(self, input_text: str, additional_tags: List[str] = ["Person1", "Person2"]) -> str: - """ - Remove unsupported TSS markup tags from the input text while preserving supported SSML tags. - - Args: - input_text (str): The input text containing TSS markup tags. - additional_tags (List[str]): Optional list of additional tags to preserve. Defaults to ["Person1", "Person2"]. - - Returns: - str: Cleaned text with unsupported TSS markup tags removed. 
- """ - # List of SSML tags supported by both OpenAI and ElevenLabs - supported_tags = [ - 'speak', 'break', 'lang', 'p', 'phoneme', - 's', 'say-as', 'sub' - ] - - # Append additional tags to the supported tags list - supported_tags.extend(additional_tags) - - # Create a pattern that matches any tag not in the supported list - pattern = r']+>' - - # Remove unsupported tags - cleaned_text = re.sub(pattern, '', input_text) - - # Remove any leftover empty lines - cleaned_text = re.sub(r'\n\s*\n', '\n', cleaned_text) - - # Ensure closing tags for additional tags are preserved - for tag in additional_tags: - cleaned_text = re.sub(f'<{tag}>(.*?)(?=<(?:{"|".join(additional_tags)})>|$)', - f'<{tag}>\\1', - cleaned_text, - flags=re.DOTALL) - - return cleaned_text.strip() + def __init__(self, model: str = "openai", api_key: Optional[str] = None): + """ + Initialize the TextToSpeech class. + + Args: + model (str): The model to use for text-to-speech conversion. + Options are 'elevenlabs' or 'openai'. Defaults to 'openai'. + api_key (Optional[str]): API key for the selected text-to-speech service. + If not provided, it will be loaded from the config. + """ + self.model = model.lower() + self.config = load_config() + self.tts_config = self.config.get("text_to_speech") + + if self.model == "elevenlabs": + self.api_key = api_key or self.config.ELEVENLABS_API_KEY + self.client = elevenlabs_client.ElevenLabs(api_key=self.api_key) + elif self.model == "openai": + self.api_key = api_key or self.config.OPENAI_API_KEY + openai.api_key = self.api_key + else: + raise ValueError("Invalid model. 
Choose 'elevenlabs' or 'openai'.") + + self.audio_format = self.tts_config["audio_format"] + self.temp_audio_dir = self.tts_config["temp_audio_dir"] + self.ending_message = self.tts_config["ending_message"] + + # Create temp_audio_dir if it doesn't exist + if not os.path.exists(self.temp_audio_dir): + os.makedirs(self.temp_audio_dir) + + def __merge_audio_files(self, input_dir: str, output_file: str) -> None: + """ + Merge all audio files in the input directory sequentially and save the result. + + Args: + input_dir (str): Path to the directory containing audio files. + output_file (str): Path to save the merged audio file. + """ + try: + # Function to sort filenames naturally + def natural_sort_key(filename: str) -> List[Union[int, str]]: + return [ + int(text) if text.isdigit() else text + for text in re.split(r"(\d+)", filename) + ] + + combined = AudioSegment.empty() + audio_files = sorted( + [ + f + for f in os.listdir(input_dir) + if f.endswith(f".{self.audio_format}") + ], + key=natural_sort_key, + ) + for file in audio_files: + if file.endswith(f".{self.audio_format}"): + file_path = os.path.join(input_dir, file) + combined += AudioSegment.from_file( + file_path, format=self.audio_format + ) + + combined.export(output_file, format=self.audio_format) + logger.info(f"Merged audio saved to {output_file}") + except Exception as e: + logger.error(f"Error merging audio files: {str(e)}") + raise + + def convert_to_speech(self, text: str, output_file: str) -> None: + """ + Convert input text to speech and save as an audio file. + + Args: + text (str): Input text to convert to speech. + output_file (str): Path to save the output audio file. + + Raises: + Exception: If there's an error in converting text to speech. 
+ """ + # Clean TSS markup tags from the input text + cleaned_text = self.clean_tss_markup(text) + + if self.model == "elevenlabs": + self.__convert_to_speech_elevenlabs(cleaned_text, output_file) + elif self.model == "openai": + self.__convert_to_speech_openai(cleaned_text, output_file) + + def __convert_to_speech_elevenlabs(self, text: str, output_file: str) -> None: + try: + qa_pairs = self.split_qa(text) + audio_files = [] + counter = 0 + for question, answer in qa_pairs: + question_audio = self.client.generate( + text=question, + voice=self.tts_config["elevenlabs"]["default_voices"]["question"], + model=self.tts_config["elevenlabs"]["model"], + ) + answer_audio = self.client.generate( + text=answer, + voice=self.tts_config["elevenlabs"]["default_voices"]["answer"], + model=self.tts_config["elevenlabs"]["model"], + ) + + # Save question and answer audio chunks + for audio in [question_audio, answer_audio]: + counter += 1 + file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}" + with open(file_name, "wb") as out: + for chunk in audio: + if chunk: + out.write(chunk) + audio_files.append(file_name) + + # Merge all audio files and save the result + self.__merge_audio_files(self.temp_audio_dir, output_file) + + # Clean up individual audio files + for file in audio_files: + os.remove(file) + + logger.info(f"Audio saved to {output_file}") + + except Exception as e: + logger.error(f"Error converting text to speech with ElevenLabs: {str(e)}") + raise + + def __convert_to_speech_openai(self, text: str, output_file: str) -> None: + try: + qa_pairs = self.split_qa(text) + print(qa_pairs) + audio_files = [] + counter = 0 + for question, answer in qa_pairs: + for speaker, content in [ + (self.tts_config["openai"]["default_voices"]["question"], question), + (self.tts_config["openai"]["default_voices"]["answer"], answer), + ]: + counter += 1 + file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}" + response = openai.audio.speech.create( + 
model=self.tts_config["openai"]["model"], + voice=speaker, + input=content, + ) + with open(file_name, "wb") as file: + file.write(response.content) + + audio_files.append(file_name) + + # Merge all audio files and save the result + self.__merge_audio_files(self.temp_audio_dir, output_file) + + # Clean up individual audio files + for file in audio_files: + os.remove(file) + + logger.info(f"Audio saved to {output_file}") + + except Exception as e: + logger.error(f"Error converting text to speech with OpenAI: {str(e)}") + raise + + def split_qa(self, input_text: str) -> List[Tuple[str, str]]: + """ + Split the input text into question-answer pairs. + + Args: + input_text (str): The input text containing Person1 and Person2 dialogues. + + Returns: + List[Tuple[str, str]]: A list of tuples containing (Person1, Person2) dialogues. + """ + # Add ending message to the end of input_text + input_text += f"{self.ending_message}" + + # Regular expression pattern to match Person1 and Person2 dialogues + pattern = r"(.*?)\s*(.*?)" + + # Find all matches in the input text + matches = re.findall(pattern, input_text, re.DOTALL) + + # Process the matches to remove extra whitespace and newlines + processed_matches = [ + (" ".join(person1.split()).strip(), " ".join(person2.split()).strip()) + for person1, person2 in matches + ] + return processed_matches + + def clean_tss_markup( + self, input_text: str, additional_tags: List[str] = ["Person1", "Person2"] + ) -> str: + """ + Remove unsupported TSS markup tags from the input text while preserving supported SSML tags. + + Args: + input_text (str): The input text containing TSS markup tags. + additional_tags (List[str]): Optional list of additional tags to preserve. Defaults to ["Person1", "Person2"]. + + Returns: + str: Cleaned text with unsupported TSS markup tags removed. 
+ """ + # List of SSML tags supported by both OpenAI and ElevenLabs + supported_tags = [ + "speak", + "break", + "lang", + "p", + "phoneme", + "s", + "say-as", + "sub", + ] + + # Append additional tags to the supported tags list + supported_tags.extend(additional_tags) + + # Create a pattern that matches any tag not in the supported list + pattern = r"]+>" + + # Remove unsupported tags + cleaned_text = re.sub(pattern, "", input_text) + + # Remove any leftover empty lines + cleaned_text = re.sub(r"\n\s*\n", "\n", cleaned_text) + + # Ensure closing tags for additional tags are preserved + for tag in additional_tags: + cleaned_text = re.sub( + f'<{tag}>(.*?)(?=<(?:{"|".join(additional_tags)})>|$)', + f"<{tag}>\\1", + cleaned_text, + flags=re.DOTALL, + ) + + return cleaned_text.strip() + def main(seed: int = 42) -> None: - """ - Main function to test the TextToSpeech class. - - Args: - seed (int): Random seed for reproducibility. Defaults to 42. - """ - try: - # Load configuration - config = load_config() - - # Read input text from file - with open('tests/data/response.txt', 'r') as file: - input_text = file.read() - - # Test ElevenLabs - tts_elevenlabs = TextToSpeech(model='elevenlabs') - elevenlabs_output_file = 'tests/data/response_elevenlabs.mp3' - tts_elevenlabs.convert_to_speech(input_text, elevenlabs_output_file) - logger.info(f"ElevenLabs TTS completed. Output saved to {elevenlabs_output_file}") - - # Test OpenAI - tts_openai = TextToSpeech(model='openai') - openai_output_file = 'tests/data/response_openai.mp3' - tts_openai.convert_to_speech(input_text, openai_output_file) - logger.info(f"OpenAI TTS completed. Output saved to {openai_output_file}") - - except Exception as e: - logger.error(f"An error occurred during text-to-speech conversion: {str(e)}") - raise + """ + Main function to test the TextToSpeech class. + + Args: + seed (int): Random seed for reproducibility. Defaults to 42. 
+ """ + try: + # Load configuration + config = load_config() + + # Read input text from file + with open("tests/data/response.txt", "r") as file: + input_text = file.read() + + # Test ElevenLabs + tts_elevenlabs = TextToSpeech(model="elevenlabs") + elevenlabs_output_file = "tests/data/response_elevenlabs.mp3" + tts_elevenlabs.convert_to_speech(input_text, elevenlabs_output_file) + logger.info( + f"ElevenLabs TTS completed. Output saved to {elevenlabs_output_file}" + ) + + # Test OpenAI + tts_openai = TextToSpeech(model="openai") + openai_output_file = "tests/data/response_openai.mp3" + tts_openai.convert_to_speech(input_text, openai_output_file) + logger.info(f"OpenAI TTS completed. Output saved to {openai_output_file}") + + except Exception as e: + logger.error(f"An error occurred during text-to-speech conversion: {str(e)}") + raise + if __name__ == "__main__": - main(seed=42) \ No newline at end of file + main(seed=42) diff --git a/tests/test_content_parser.py b/tests/test_content_parser.py index 19a7ebce..e31bf7a6 100644 --- a/tests/test_content_parser.py +++ b/tests/test_content_parser.py @@ -81,4 +81,4 @@ def test_pdf_extractor(self): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/tests/test_genai_podcast.py b/tests/test_genai_podcast.py index a3ab67b3..deba76c0 100644 --- a/tests/test_genai_podcast.py +++ b/tests/test_genai_podcast.py @@ -7,7 +7,7 @@ from podcastfy.utils.config import Config from podcastfy.utils.config_conversation import ConversationConfig -#TODO: Should be a fixture +# TODO: Should be a fixture def sample_conversation_config(): conversation_config = { "word_count": 2000, @@ -19,10 +19,11 @@ def sample_conversation_config(): "podcast_tagline": "Learning Through Conversation", "output_language": "English", "engagement_techniques": ["examples", "questions", "case studies"], - "creativity": 0 + "creativity": 0, } return conversation_config + class TestGenAIPodcast(unittest.TestCase): def 
setUp(self): """ @@ -31,9 +32,6 @@ def setUp(self): config = Config() self.api_key = config.GEMINI_API_KEY - - - def test_generate_qa_content(self): """ Test the generate_qa_content method of ContentGenerator. @@ -56,21 +54,29 @@ def test_custom_conversation_config(self): conversation_config = sample_conversation_config() content_generator = ContentGenerator(self.api_key, conversation_config) input_text = "Artificial Intelligence in Education" - + result = content_generator.generate_qa_content(input_text) self.assertIsNotNone(result) self.assertNotEqual(result, "") self.assertIsInstance(result, str) - + # Check for elements from the custom config self.assertIn(conversation_config["podcast_name"], result) self.assertIn(conversation_config["podcast_tagline"], result) - self.assertTrue(any(role in result.lower() for role in [conversation_config["roles_person1"], - conversation_config["roles_person2"]])) - + self.assertTrue( + any( + role in result.lower() + for role in [ + conversation_config["roles_person1"], + conversation_config["roles_person2"], + ] + ) + ) + # Check word count (allow some flexibility) word_count = len(result.split()) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_generate_podcast.py b/tests/test_generate_podcast.py index 9f65c749..aa9d152a 100644 --- a/tests/test_generate_podcast.py +++ b/tests/test_generate_podcast.py @@ -5,99 +5,112 @@ from podcastfy.utils.config import load_config from podcastfy.utils.config_conversation import load_conversation_config + @pytest.fixture def sample_config(): - config = load_config() - config.configure( - output_directories={ - 'audio': 'tests/data/audio', - 'transcripts': 'tests/data/transcripts' - } - ) - return config + config = load_config() + config.configure( + output_directories={ + "audio": "tests/data/audio", + "transcripts": "tests/data/transcripts", + } + ) + return config + @pytest.fixture def sample_conversation_config(): - conversation_config = { - "word_count": 300, - 
"conversation_style": ["formal", "educational"], - "roles_person1": "professor", - "roles_person2": "student", - "dialogue_structure": ["Introduction", "Main Points", "Case Studies", "Quiz", "Conclusion"], - "podcast_name": "Teachfy", - "podcast_tagline": "Learning Through Conversation", - "output_language": "English", - "engagement_techniques": ["examples", "questions"], - "creativity": 0 - } - return conversation_config + conversation_config = { + "word_count": 300, + "conversation_style": ["formal", "educational"], + "roles_person1": "professor", + "roles_person2": "student", + "dialogue_structure": [ + "Introduction", + "Main Points", + "Case Studies", + "Quiz", + "Conclusion", + ], + "podcast_name": "Teachfy", + "podcast_tagline": "Learning Through Conversation", + "output_language": "English", + "engagement_techniques": ["examples", "questions"], + "creativity": 0, + } + return conversation_config + def test_generate_podcast_from_urls(sample_config): - """Test generating a podcast from a list of URLs.""" - urls = [ - "https://en.wikipedia.org/wiki/Podcast", - "https://en.wikipedia.org/wiki/Text-to-speech" - ] - - audio_file = generate_podcast( - urls=urls, - config=sample_config - ) - - assert audio_file is not None - assert os.path.exists(audio_file) - assert audio_file.endswith('.mp3') - assert os.path.dirname(audio_file) == sample_config.get('output_directories', {}).get('audio') + """Test generating a podcast from a list of URLs.""" + urls = [ + "https://en.wikipedia.org/wiki/Podcast", + "https://en.wikipedia.org/wiki/Text-to-speech", + ] + + audio_file = generate_podcast(urls=urls, config=sample_config) + + assert audio_file is not None + assert os.path.exists(audio_file) + assert audio_file.endswith(".mp3") + assert os.path.dirname(audio_file) == sample_config.get( + "output_directories", {} + ).get("audio") + def test_generate_transcript_only(sample_config): - """Test generating only a transcript without audio.""" - urls = 
["https://en.wikipedia.org/wiki/Natural_language_processing"] - - result = generate_podcast( - urls=urls, - transcript_only=True, - config=sample_config - ) - - assert result is None + """Test generating only a transcript without audio.""" + urls = ["https://en.wikipedia.org/wiki/Natural_language_processing"] + + result = generate_podcast(urls=urls, transcript_only=True, config=sample_config) + + assert result is None + def test_generate_podcast_from_transcript_file(sample_config): - """Test generating a podcast from an existing transcript file.""" - # First, generate a transcript - transcript_file = os.path.join(sample_config.get('output_directories', {}).get('transcripts'), 'test_transcript.txt') - with open(transcript_file, 'w') as f: - f.write("Joe Biden and the US PoliticsJoe Biden is the current president of the United States of America") - - # Now use this transcript to generate a podcast - audio_file = generate_podcast( - transcript_file=transcript_file, - config=sample_config - ) - - assert audio_file is not None - assert os.path.exists(audio_file) - assert audio_file.endswith('.mp3') - assert os.path.dirname(audio_file) == sample_config.get('output_directories', {}).get('audio') + """Test generating a podcast from an existing transcript file.""" + # First, generate a transcript + transcript_file = os.path.join( + sample_config.get("output_directories", {}).get("transcripts"), + "test_transcript.txt", + ) + with open(transcript_file, "w") as f: + f.write( + "Joe Biden and the US PoliticsJoe Biden is the current president of the United States of America" + ) + + # Now use this transcript to generate a podcast + audio_file = generate_podcast(transcript_file=transcript_file, config=sample_config) + + assert audio_file is not None + assert os.path.exists(audio_file) + assert audio_file.endswith(".mp3") + assert os.path.dirname(audio_file) == sample_config.get( + "output_directories", {} + ).get("audio") + def 
test_generate_podcast_with_custom_config(sample_config, sample_conversation_config): - """Test generating a podcast with a custom conversation config.""" - urls = ["https://en.wikipedia.org/wiki/Artificial_intelligence"] - - audio_file = generate_podcast( - urls=urls, - config=sample_config, - conversation_config=sample_conversation_config - ) - - assert audio_file is not None - assert os.path.exists(audio_file) - assert audio_file.endswith('.mp3') - assert os.path.dirname(audio_file) == sample_config.get('output_directories', {}).get('audio') - + """Test generating a podcast with a custom conversation config.""" + urls = ["https://en.wikipedia.org/wiki/Artificial_intelligence"] + + audio_file = generate_podcast( + urls=urls, config=sample_config, conversation_config=sample_conversation_config + ) + + assert audio_file is not None + assert os.path.exists(audio_file) + assert audio_file.endswith(".mp3") + assert os.path.dirname(audio_file) == sample_config.get( + "output_directories", {} + ).get("audio") + + def test_generate_podcast_no_urls_or_transcript(): - """Test that an error is raised when no URLs or transcript file is provided.""" - with pytest.raises(ValueError): - generate_podcast() + """Test that an error is raised when no URLs or transcript file is provided.""" + with pytest.raises(ValueError): + generate_podcast() + if __name__ == "__main__": - pytest.main() \ No newline at end of file + pytest.main() From c1adb9b5eb6ae884fefbda69c1819ad281c4f95f Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 14:53:44 +0200 Subject: [PATCH 06/49] fix transcript parsing --- podcastfy/aiengines/llm/gemini_langchain.py | 65 +++++++++++++++++---- 1 file changed, 54 insertions(+), 11 deletions(-) diff --git a/podcastfy/aiengines/llm/gemini_langchain.py b/podcastfy/aiengines/llm/gemini_langchain.py index 4e08b0af..9380a4f6 100644 --- a/podcastfy/aiengines/llm/gemini_langchain.py +++ b/podcastfy/aiengines/llm/gemini_langchain.py @@ -7,6 +7,7 @@ """ import os 
+import re from typing import Optional, Dict, Any, List, Tuple from langchain_google_genai import ChatGoogleGenerativeAI @@ -41,12 +42,12 @@ def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = self.llm = ChatGoogleGenerativeAI( model=self.content_generator_config.get('gemini_model', 'gemini-1.5-pro-latest'), temperature=self.config_conversation.get('creativity', 0), - max_output_tokens=self.content_generator_config.get('max_output_tokens', 8192) + max_output_tokens=self.content_generator_config.get('max_output_tokens', 8192), ) #pick podcastfy prompt from langchain hub self.prompt_template = hub.pull(self.config.get('content_generator', {}).get('prompt_template', 'souzatharsis/podcastfy_')) - self.prompt_template + self.ending_message = self.config.get('text_to_speech')['ending_message'] self.parser = StrOutputParser() @@ -109,20 +110,62 @@ def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = """ self.content_generator = ContentGenerator(api_key, conversation_config) + def split_qa(self, input_text: str) -> List[Tuple[str, str]]: + """ + Split the input text into question-answer pairs. + + Args: + input_text (str): The input text containing Person1 and Person2 dialogues. + + Returns: + List[Tuple[str, str]]: A list of tuples containing (Person1, Person2) dialogues. 
+ """ + # Add ending message to the end of input_text + input_text += f"{self.content_generator.ending_message}" + + # Regular expression pattern to match Person1 and Person2 dialogues + pattern = r'(.*?)\s*(.*?)' + + # Find all matches in the input text + matches = re.findall(pattern, input_text, re.DOTALL) + + # Process the matches to remove extra whitespace and newlines + processed_matches = [ + ( + ' '.join(person1.split()).strip(), + ' '.join(person2.split()).strip() + ) + for person1, person2 in matches + ] + return processed_matches + def generate_transcript(self, prompt: str, characters: List[Character]) -> List[Tuple[Character, str]]: content = self.content_generator.generate_qa_content(prompt, output_filepath=None, characters=characters) - - # Parse the generated content into the required format + + q_a_pairs = self.split_qa(content) transcript = [] - for line in content.split('\n'): - if ':' in line: - speaker_name, text = line.split(':', 1) - speaker = next((char for char in characters if char.name == speaker_name.strip()), None) - if speaker: - transcript.append((speaker, text.strip())) - + for q_a_pair in q_a_pairs: + # Assign the speakers based on the order of the characters + speaker1, speaker2 = characters + speaker_1_text, speaker_2_text = q_a_pair + transcript.append((speaker1, speaker_1_text)) + transcript.append((speaker2, speaker_2_text)) return transcript + # def generate_transcript(self, prompt: str, characters: List[Character]) -> List[Tuple[Character, str]]: + # content = self.content_generator.generate_qa_content(prompt, output_filepath=None, characters=characters) + # + # # Parse the generated content into the required format + # transcript = [] + # for line in content.split('\n'): + # if ':' in line: + # speaker_name, text = line.split(':', 1) + # speaker = next((char for char in characters if char.name == speaker_name.strip()), None) + # if speaker: + # transcript.append((speaker, text.strip())) + # + # return transcript + def 
main(seed: int = 42) -> None: From d06b93c3be3adc0c73a53d3e8196400f1355cdb9 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 14:54:22 +0200 Subject: [PATCH 07/49] fix eleven labs issues --- podcastfy/aiengines/tts/tts_backends.py | 30 ++++++++++++++++--------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/podcastfy/aiengines/tts/tts_backends.py b/podcastfy/aiengines/tts/tts_backends.py index 0b2d389c..1e4d4125 100644 --- a/podcastfy/aiengines/tts/tts_backends.py +++ b/podcastfy/aiengines/tts/tts_backends.py @@ -15,12 +15,12 @@ from podcastfy.core.character import Character -class ElevenLabsTTS(SyncTTSBackend, TTSConfigMixin): +class ElevenLabsTTS(SyncTTSBackend, AsyncTTSBackend, TTSConfigMixin): name: str = "elevenlabs" def __init__(self, api_key: str = None, config_file: str = 'podcastfy/config.yaml'): # TODO: not the right path for final client - TTSConfigMixin.__init__(self, config_file) + TTSConfigMixin.__init__(self, config_file, name=self.name) self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY") def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: @@ -37,6 +37,19 @@ def text_to_speech(self, text: str, character: Character, output_path: Path) -> out.write(chunk) return output_path + async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + config = self.tts_config_for_character(character) + client = elevenlabs_client.AsyncElevenLabs(api_key=self.api_key) + content = await client.generate( + text=text, + voice=config.voice, + model=config.extra_args.get('model', self.get_default_config().get('model', 'default')) + ) + with open(output_path, "wb") as out: + for chunk in content: + if chunk: + out.write(chunk) + class OpenAITTS(SyncTTSBackend, TTSConfigMixin): name: str = "openai" @@ -45,7 +58,7 @@ def __init__(self, api_key: str = None, config_file: str = 'podcastfy/config.yam TTSConfigMixin.__init__(self, config_file, name=self.name) self.api_key = 
api_key or os.getenv("OPENAI_API_KEY") - def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + def text_to_speech(self, text: str, character: Character, output_path: Path) -> None: config = self.tts_config_for_character(character) print(f"OpenAI TTS: Converting text to speech for character {character.name} with voice {config.voice} \n text: {text}") @@ -57,23 +70,20 @@ def text_to_speech(self, text: str, character: Character, output_path: Path) -> ) with open(output_path, "wb") as file: file.write(response.content) - return output_path + class EdgeTTS(AsyncTTSBackend, TTSConfigMixin): - name: str = "edge-tts" + name: str = "edge" def __init__(self, config_file: str = 'podcastfy/config.yaml'): - TTSConfigMixin.__init__(self, config_file) + TTSConfigMixin.__init__(self, config_file, name=self.name) - async def text_to_speech(self, text: str, character: Character, output_path: Path) -> None: + async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> None: config = self.tts_config_for_character(character) communicate = edge_tts.Communicate(text, config.voice) await communicate.save(output_path) - return output_path - async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: - return await self.text_to_speech(text, character, output_path) From 1e158513f057990e66f398d7df4a39fa711ce136 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 14:54:47 +0200 Subject: [PATCH 08/49] fix person names --- podcastfy/client_v2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index ea502d6d..adf2f1b5 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -21,7 +21,7 @@ def create_characters(config: Dict[str, Any]) -> List[Character]: host = Character( - name="Host", + name="Person1", role="Podcast host", tts_configs={ "openai": TTSConfig( @@ -39,7 +39,7 @@ def create_characters(config: 
Dict[str, Any]) -> List[Character]: ) guest = Character( - name="Guest", + name="Person2", role="Expert guest", tts_configs={ "openai": TTSConfig( From 114172419463647a69dae0c78bc00088572ca401 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 14:55:05 +0200 Subject: [PATCH 09/49] add edge default values --- podcastfy/config.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/podcastfy/config.yaml b/podcastfy/config.yaml index 624c8955..00bff294 100644 --- a/podcastfy/config.yaml +++ b/podcastfy/config.yaml @@ -13,6 +13,10 @@ text_to_speech: question: "echo" answer: "shimmer" model: "tts-1-hd" + edge: + default_voices: + question: "en-US-JennyNeural" + answer: "en-US-EricNeural" audio_format: "mp3" temp_audio_dir: "data/audio/tmp/" ending_message: "Bye Bye!" From c44139b2a4df6368010ce2dde112006689218d77 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 14:56:07 +0200 Subject: [PATCH 10/49] fix multiple issues with audio --- podcastfy/core/audio.py | 82 ++++++++++++++++++++------------------- podcastfy/core/podcast.py | 16 +++++--- 2 files changed, 53 insertions(+), 45 deletions(-) diff --git a/podcastfy/core/audio.py b/podcastfy/core/audio.py index 9b422faf..2e57f394 100644 --- a/podcastfy/core/audio.py +++ b/podcastfy/core/audio.py @@ -1,91 +1,95 @@ import asyncio from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from typing import Optional, Dict, Union, List, cast +from typing import Optional, Dict, Union, List, cast, Tuple -from pydub import AudioSegment as PydubAudioSegment +from pydub import AudioSegment from podcastfy.core.podcast import SyncTTSBackend, AsyncTTSBackend from podcastfy.core.transcript import TranscriptSegment, Transcript -class AudioSegment: +class PodcastsAudioSegment: """Represents an audio segment of the podcast.""" - def __init__(self, filepath: Path, length_ms: int, transcript_segment: Optional[TranscriptSegment] = None) -> None: + def __init__(self, filepath: Path, transcript_segment: 
Optional[TranscriptSegment] = None) -> None: self.filepath = filepath - self.length_ms = length_ms self.transcript_segment = transcript_segment - self._audio: Optional[PydubAudioSegment] = None + self._audio: Optional[AudioSegment] = None @property - def audio(self) -> PydubAudioSegment: + def audio(self) -> AudioSegment: """Lazy-load the audio segment.""" if self._audio is None: - self._audio = PydubAudioSegment.from_file(self.filepath) - if len(self._audio) != self.length_ms: - raise ValueError( - f"Audio file length ({len(self._audio)}ms) does not match specified length ({self.length_ms}ms)") + self._audio = AudioSegment.from_file(self.filepath) return self._audio class AudioManager: - def __init__(self, tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]], n_jobs: int = 4) -> None: + def __init__(self, tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]], n_jobs: int = 4, file_prefix: str = "") -> None: self.tts_backends = tts_backends self.n_jobs = n_jobs + self.has_async_backend = any(isinstance(backend, AsyncTTSBackend) for backend in self.tts_backends.values()) + self.file_prefix = file_prefix self.audio_segments = [] - self.final_audio: Optional[PydubAudioSegment] = None + self.final_audio: Optional[AudioSegment] = None self.temp_dir: Optional[Union[str, Path]] = None - async def _async_build_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: - async def process_segment(segment: TranscriptSegment): + async def _async_build_audio_segments(self, transcript: Transcript) -> List[PodcastsAudioSegment]: + async def process_segment(segment_tuple: Tuple[TranscriptSegment, int]): + segment, index = segment_tuple tts_backend = self.get_tts_backend(segment) - audio_file = await cast(AsyncTTSBackend, tts_backend).async_text_to_speech( - segment.text, - segment.speaker, - Path(self.temp_dir) / f"{segment.speaker.name}_{len(self.audio_segments)}.mp3" - ) - return AudioSegment(audio_file, 
len(PydubAudioSegment.from_file(str(audio_file))), segment) + audio_path = Path(self.temp_dir) / f"{self.file_prefix}{index:04d}.mp3" + if isinstance(tts_backend, AsyncTTSBackend): + await tts_backend.async_text_to_speech( + segment.text, + segment.speaker, + audio_path + ) + else: + tts_backend.text_to_speech( + segment.text, + segment.speaker, + audio_path + ) + return PodcastsAudioSegment(audio_path, segment) semaphore = asyncio.Semaphore(self.n_jobs) - async def bounded_process_segment(segment): + async def bounded_process_segment(segment_tuple): async with semaphore: - return await process_segment(segment) + return await process_segment(segment_tuple) - tasks = [asyncio.create_task(bounded_process_segment(segment)) for segment in transcript.segments] + tasks = [asyncio.create_task(bounded_process_segment((segment, i))) for i, segment in enumerate(transcript.segments)] return list(await asyncio.gather(*tasks)) def get_tts_backend(self, segment): - if segment.speaker.preferred_tts is None: - # take the first available TTS backend + tts_backend = self.tts_backends.get(segment.speaker.preferred_tts) + if tts_backend is None: + # Take the first available TTS backend tts_backend = next(iter(self.tts_backends.values())) - else: - tts_backend = self.tts_backends[segment.speaker.preferred_tts] - # ensure the preferred TTS backend is available - if tts_backend is None: - raise ValueError(f"Preferred TTS backend '{segment.speaker.preferred_tts}' is not available for character '{segment.speaker.name}'") return tts_backend - def _sync_build_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: - def process_segment(segment: TranscriptSegment): + def _sync_build_audio_segments(self, transcript: Transcript) -> List[PodcastsAudioSegment]: + def process_segment(segment_tuple: Tuple[TranscriptSegment, int]): + segment, index = segment_tuple tts_backend = self.get_tts_backend(segment) audio_file = cast(SyncTTSBackend, tts_backend).text_to_speech( segment.text, 
segment.speaker, - Path(str(self.temp_dir)) / f"{segment.speaker.name}_{len(self.audio_segments)}.mp3" + Path(str(self.temp_dir)) / f"{self.file_prefix}{index:04d}.mp3" ) - return AudioSegment(audio_file, len(PydubAudioSegment.from_file(str(audio_file))), segment) + return PodcastsAudioSegment(audio_file, segment) with ThreadPoolExecutor(max_workers=self.n_jobs) as executor: return list(executor.map(process_segment, transcript.segments)) - def create_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: - if all(isinstance(backend, AsyncTTSBackend) for backend in self.tts_backends.values()): + def create_audio_segments(self, transcript: Transcript) -> List[PodcastsAudioSegment]: + if self.has_async_backend: return asyncio.run(self._async_build_audio_segments(transcript)) else: return self._sync_build_audio_segments(transcript) - def stitch_audio_segments(self) -> None: - self.final_audio = sum([segment.audio for segment in self.audio_segments]) + # def stitch_audio_segments(self) -> None: + # self.final_audio = sum((segment.audio for segment in self.audio_segments), AudioSegment.empty()) diff --git a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py index 06a5e47c..e9797b06 100644 --- a/podcastfy/core/podcast.py +++ b/podcastfy/core/podcast.py @@ -3,13 +3,13 @@ from typing import List, Optional, Dict, Any, Callable, Tuple, Union, Sequence, cast from tempfile import TemporaryDirectory import atexit -from pydub import AudioSegment as PydubAudioSegment +from pydub import AudioSegment from functools import wraps from contextlib import contextmanager from podcastfy.aiengines.llm.base import LLMBackend from podcastfy.aiengines.tts.base import SyncTTSBackend, AsyncTTSBackend -from podcastfy.core.audio import AudioSegment, AudioManager +from podcastfy.core.audio import PodcastsAudioSegment, AudioManager from podcastfy.core.character import Character from podcastfy.core.transcript import TranscriptSegment, Transcript from podcastfy.core.tts_configs import 
TTSConfig @@ -93,7 +93,7 @@ def __init__(self, content: str, llm_backend: LLMBackend, # Initialize attributes with null values self.transcript: Optional[Transcript] = None - self.audio_segments: List[AudioSegment] = [] + self.audio_segments: List[PodcastsAudioSegment] = [] self.audio: Optional[PydubAudioSegment] = None # Define the sequence of methods to be called for each stage @@ -195,10 +195,14 @@ def build_audio_segments(self) -> None: @podcast_stage def stitch_audio_segments(self) -> None: """Stitch all audio segments together to form the final podcast audio.""" - self.audio = sum([segment.audio for segment in self.audio_segments]) + # order segments by filename + segments_to_stitch = sorted(self.audio_segments, key=lambda segment: segment.filepath) + + self.audio = sum((segment.audio for segment in segments_to_stitch), AudioSegment.empty()) def _build_next_stage(self) -> bool: """Build the next stage of the podcast.""" + print("state: ", self.state) if self.state == PodcastState.STITCHED: return False @@ -338,8 +342,8 @@ def text_to_speech(self, text: str, character: Character, output_path: Path) -> PydubAudioSegment.silent(duration=500).export(temp_file.name, format="mp3") with podcast.rework(PodcastState.AUDIO_SEGMENTS_BUILT): - new_segment = AudioSegment(Path(temp_file.name), 500, - TranscriptSegment("New audio segment", podcast.characters["Host"])) + new_segment = PodcastsAudioSegment(Path(temp_file.name), 500, + TranscriptSegment("New audio segment", podcast.characters["Host"])) podcast.audio_segments.insert(0, new_segment) # Save the final podcast From 8d689306266afae8f676a65e8dfda6be112714aa Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 17:10:19 +0200 Subject: [PATCH 11/49] commit before merge --- podcastfy/aiengines/llm/gemini_langchain.py | 199 +++++++++++++++++++- 1 file changed, 189 insertions(+), 10 deletions(-) diff --git a/podcastfy/aiengines/llm/gemini_langchain.py b/podcastfy/aiengines/llm/gemini_langchain.py index 
9380a4f6..23de41fb 100644 --- a/podcastfy/aiengines/llm/gemini_langchain.py +++ b/podcastfy/aiengines/llm/gemini_langchain.py @@ -10,6 +10,8 @@ import re from typing import Optional, Dict, Any, List, Tuple +from langchain_community.llms.llamafile import Llamafile +from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate from langchain_google_genai import ChatGoogleGenerativeAI from langchain_core.output_parsers import StrOutputParser from langchain import hub @@ -22,7 +24,10 @@ logger = logging.getLogger(__name__) -class ContentGenerator: + + +class OldContentGenerator: + # note: to be deleted but stays around few days for reference and troubleshooting def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None): """ Initialize the ContentGenerator. @@ -34,7 +39,7 @@ def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = os.environ["GOOGLE_API_KEY"] = api_key self.config = load_config() self.content_generator_config = self.config.get('content_generator', {}) - + # Load default conversation config and update with custom config if provided self.config_conversation = load_conversation_config(conversation_config) @@ -44,13 +49,13 @@ def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = temperature=self.config_conversation.get('creativity', 0), max_output_tokens=self.content_generator_config.get('max_output_tokens', 8192), ) - + #pick podcastfy prompt from langchain hub self.prompt_template = hub.pull(self.config.get('content_generator', {}).get('prompt_template', 'souzatharsis/podcastfy_')) self.ending_message = self.config.get('text_to_speech')['ending_message'] self.parser = StrOutputParser() - + self.chain = (self.prompt_template | self.llm | self.parser) def generate_qa_content(self, input_texts: str, output_filepath: Optional[str] = None, characters: List[Character] = None) -> str: @@ -69,8 +74,8 @@ def generate_qa_content(self, input_texts: str, 
output_filepath: Optional[str] = """ assert len(characters) == 2, "The number of characters should be 2 for this implementation" try: - - + + prompt_params = { "input_text": input_texts, "word_count": self.config_conversation.get('word_count'), @@ -85,19 +90,192 @@ def generate_qa_content(self, input_texts: str, output_filepath: Optional[str] = } self.response = self.chain.invoke(prompt_params) - + logger.info(f"Content generated successfully") - + if output_filepath: with open(output_filepath, 'w') as file: file.write(self.response) logger.info(f"Response content saved to {output_filepath}") - + return self.response except Exception as e: logger.error(f"Error generating content: {str(e)}") raise +class LLMBackend: + def __init__( + self, + is_local: bool, + temperature: float, + max_output_tokens: int, + model_name: str, + ): + """ + Initialize the LLMBackend. + + Args: + is_local (bool): Whether to use a local LLM or not. + temperature (float): The temperature for text generation. + max_output_tokens (int): The maximum number of output tokens. + model_name (str): The name of the model to use. + """ + self.is_local = is_local + self.temperature = temperature + self.max_output_tokens = max_output_tokens + self.model_name = model_name + self.is_multimodal = not is_local # Does not assume local LLM is multimodal + + if is_local: + self.llm = Llamafile() + else: + self.llm = ChatGoogleGenerativeAI( + model=model_name, + temperature=temperature, + max_output_tokens=max_output_tokens, + ) + + +class ContentGenerator: + def __init__( + self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None + ): + """ + Initialize the ContentGenerator. + + Args: + api_key (str): API key for Google's Generative AI. + conversation_config (Optional[Dict[str, Any]]): Custom conversation configuration. 
+ """ + os.environ["GOOGLE_API_KEY"] = api_key + self.config = load_config() + self.content_generator_config = self.config.get("content_generator", {}) + + self.config_conversation = load_conversation_config(conversation_config) + + def __compose_prompt(self, num_images: int): + """ + Compose the prompt for the LLM based on the content list. + """ + prompt_template = hub.pull( + self.config.get("content_generator", {}).get( + "prompt_template", "souzatharsis/podcastfy_multimodal" + ) + ) + + image_path_keys = [] + messages = [] + text_content = {"type": "text", "text": "{input_text}"} + messages.append(text_content) + for i in range(num_images): + key = f"image_path_{i}" + image_content = { + "image_url": {"path": f"{{{key}}}", "detail": "high"}, + "type": "image_url", + } + image_path_keys.append(key) + messages.append(image_content) + + user_prompt_template = ChatPromptTemplate.from_messages( + messages=[HumanMessagePromptTemplate.from_template(messages)] + ) + + # Compose messages from podcastfy_prompt_template and user_prompt_template + combined_messages = prompt_template.messages + user_prompt_template.messages + + # Create a new ChatPromptTemplate object with the combined messages + composed_prompt_template = ChatPromptTemplate.from_messages(combined_messages) + + return composed_prompt_template, image_path_keys + + def __compose_prompt_params( + self, image_file_paths: List[str], image_path_keys: List[str], input_texts: str + ): + prompt_params = { + "input_text": input_texts, + "word_count": self.config_conversation.get("word_count"), + "conversation_style": ", ".join( + self.config_conversation.get("conversation_style", []) + ), + "roles_person1": self.config_conversation.get("roles_person1"), + "roles_person2": self.config_conversation.get("roles_person2"), + "dialogue_structure": ", ".join( + self.config_conversation.get("dialogue_structure", []) + ), + "podcast_name": self.config_conversation.get("podcast_name"), + "podcast_tagline": 
self.config_conversation.get("podcast_tagline"), + "output_language": self.config_conversation.get("output_language"), + "engagement_techniques": ", ".join( + self.config_conversation.get("engagement_techniques", []) + ), + } + + # for each image_path_key, add the corresponding image_file_path to the prompt_params + for key, path in zip(image_path_keys, image_file_paths): + prompt_params[key] = path + + return prompt_params + + def generate_qa_content( + self, + input_texts: str = "", + image_file_paths: List[str] = [], + output_filepath: Optional[str] = None, + is_local: bool = False, + ) -> str: + """ + Generate Q&A content based on input texts. + + Args: + input_texts (str): Input texts to generate content from. + image_file_paths (List[str]): List of image file paths. + output_filepath (Optional[str]): Filepath to save the response content. Defaults to None. + is_local (bool): Whether to use a local LLM or not. Defaults to False. + + Returns: + str: Formatted Q&A content. + + Raises: + Exception: If there's an error in generating content. 
+ """ + try: + llmbackend = LLMBackend( + is_local=is_local, + temperature=self.config_conversation.get("creativity", 0), + max_output_tokens=self.content_generator_config.get( + "max_output_tokens", 8192 + ), + model_name=( + self.content_generator_config.get( + "gemini_model", "gemini-1.5-pro-latest" + ) + if not is_local + else "User provided model" + ), + ) + + num_images = 0 if is_local else len(image_file_paths) + self.prompt_template, image_path_keys = self.__compose_prompt(num_images) + self.parser = StrOutputParser() + self.chain = self.prompt_template | llmbackend.llm | self.parser + + prompt_params = self.__compose_prompt_params( + image_file_paths, image_path_keys, input_texts + ) + + self.response = self.chain.invoke(prompt_params) + + logger.info(f"Content generated successfully") + + if output_filepath: + with open(output_filepath, "w") as file: + file.write(self.response) + logger.info(f"Response content saved to {output_filepath}") + + return self.response + except Exception as e: + logger.error(f"Error generating content: {str(e)}") + raise class DefaultPodcastifyTranscriptEngine(LLMBackend): def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None): @@ -140,7 +318,8 @@ def split_qa(self, input_text: str) -> List[Tuple[str, str]]: return processed_matches def generate_transcript(self, prompt: str, characters: List[Character]) -> List[Tuple[Character, str]]: - content = self.content_generator.generate_qa_content(prompt, output_filepath=None, characters=characters) + content = self.content_generator.generate_qa_content(prompt, output_filepath=None) + # content = self.content_generator.generate_qa_content(prompt, output_filepath=None, characters=characters) # ideally in the future. 
q_a_pairs = self.split_qa(content) transcript = [] From fa83fc11967eb6b0a1b8aa6ced50e9205a07a5fa Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 18:06:10 +0200 Subject: [PATCH 12/49] support for local and ad other compat elements --- podcastfy/aiengines/llm/gemini_langchain.py | 5 +- podcastfy/client_v2.py | 253 ++++---------------- 2 files changed, 56 insertions(+), 202 deletions(-) diff --git a/podcastfy/aiengines/llm/gemini_langchain.py b/podcastfy/aiengines/llm/gemini_langchain.py index dd0d20ad..bb92bd22 100644 --- a/podcastfy/aiengines/llm/gemini_langchain.py +++ b/podcastfy/aiengines/llm/gemini_langchain.py @@ -140,7 +140,7 @@ def __init__( class DefaultPodcastifyTranscriptEngine(LLMBackend): - def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None): + def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None, is_local: bool = False): """ Initialize the DefaultPodcastifyTranscriptEngine. @@ -149,6 +149,7 @@ def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = conversation_config (Optional[Dict[str, Any]]): Custom conversation configuration. 
""" self.content_generator = ContentGenerator(api_key, conversation_config) + self.is_local = is_local def split_qa(self, input_text: str) -> List[Tuple[str, str]]: """ @@ -182,7 +183,7 @@ def split_qa(self, input_text: str) -> List[Tuple[str, str]]: def generate_transcript(self, content: List[LLMContent], characters: List[Character]) -> List[Tuple[Character, str]]: image_file_paths = [c.value for c in content if c.type == 'image_path'] text_content = "\n\n".join(c.value for c in content if c.type == 'text') - content = self.content_generator.generate_qa_content(text_content, image_file_paths) # ideally in the future we pass characters here + content = self.content_generator.generate_qa_content(text_content, image_file_paths, is_local=self.is_local) # ideally in the future we pass characters here q_a_pairs = self.split_qa(content) transcript = [] diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index 0c79c5da..d8021439 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -13,6 +13,7 @@ from podcastfy.content_parser.content_extractor import ContentExtractor from podcastfy.core.tts_configs import TTSConfig from podcastfy.utils.config import Config, load_config +from podcastfy.utils.config_conversation import load_conversation_config from podcastfy.utils.logger import setup_logger logger = setup_logger(__name__) @@ -21,6 +22,7 @@ def create_characters(config: Dict[str, Any]) -> List[Character]: + # in the future, we should load this from the config file host = Character( name="Person1", role="Podcast host", @@ -72,225 +74,76 @@ def create_tts_backends(config: Config) -> List[Union[SyncTTSBackend, AsyncTTSBa def process_content_v2( urls: Optional[List[str]] = None, transcript_file: Optional[str] = None, - tts_model: str = "openai", + tts_model: str = "openai", # to be fixed, in case of characters, it should be a list of models generate_audio: bool = True, config: Optional[Config] = None, conversation_config: Optional[Dict[str, Any]] = None, 
image_paths: Optional[List[str]] = None, is_local: bool = False, ) -> Tuple[Optional[str], Podcast]: - if config is None: - config = load_config() - if urls is None: - urls = [] - characters = create_characters(config.config) - tts_backends = create_tts_backends(config) - if transcript_file: - logger.info(f"Using transcript file: {transcript_file}") - transcript = Transcript.load( - transcript_file, {char.name: char for char in characters} - ) - podcast = Podcast.from_transcript(transcript, tts_backends, characters) - else: - logger.info(f"Processing {len(urls)} links") - content_extractor = ContentExtractor(config.JINA_API_KEY) - content_generator = DefaultPodcastifyTranscriptEngine( - config.GEMINI_API_KEY, conversation_config, is_local=is_local - ) - - contents = [content_extractor.extract_content(url) for url in urls] - llm_contents = [] - if contents: - llm_contents.append(LLMContent(value="\n\n".join(contents), type="text")) - if image_paths: - llm_contents.extend( - [LLMContent(value=image_path, type="image_path") for image_path in image_paths] - ) - - - - podcast = Podcast( - content=llm_contents, - llm_backend=content_generator, - tts_backends=tts_backends, - characters=characters, - ) - - - if generate_audio: - podcast.finalize() - else: - podcast.build_transcript() - - return podcast - - -@app.command() -def main( - urls: List[str] = typer.Option(None, "--url", "-u", help="URLs to process"), - file: typer.FileText = typer.Option( - None, "--file", "-f", help="File containing URLs, one per line" - ), - transcript: typer.FileText = typer.Option( - None, "--transcript", "-t", help="Path to a transcript file" - ), - tts_model: str = typer.Option( - None, "--tts-model", "-tts", help="TTS model to use (openai or elevenlabs)" - ), - transcript_only: bool = typer.Option( - False, "--transcript-only", help="Generate only a transcript without audio" - ), - conversation_config: str = typer.Option( - None, - "--conversation-config", - "-cc", - help="Path to custom 
conversation configuration YAML file", - ), - output_dir: str = typer.Option( - "./output", "--output-dir", "-o", help="Directory to save output files" - ), -): - """ - Generate a podcast or transcript from a list of URLs, a file containing URLs, or a transcript file. - """ try: - config = load_config() - main_config = config.config.get("main", {}) - if tts_model is None: - tts_model = main_config.get("default_tts_model", "openai") - - urls_list = urls or [] - if file: - urls_list.extend([line.strip() for line in file if line.strip()]) - - if not urls_list and not transcript: - raise typer.BadParameter( - "No URLs or transcript provided. Use --url to specify URLs, --file to specify a file containing URLs, or --transcript for a transcript file." + if config is None: + config = load_config() + if urls is None: + urls = [] + if config is None: + config = load_config() + # Load default conversation config + conv_config = load_conversation_config() + + # Update with provided config if any + if conversation_config: + conv_config.configure(conversation_config) + characters = create_characters(conv_config.config_conversation) + tts_backends = create_tts_backends(config) + # filter out the tts backends that are not in the tts_model, temporary solution + tts_backends = [tts for tts in tts_backends if tts.name != tts_model] + if transcript_file: + logger.info(f"Using transcript file: {transcript_file}") + transcript = Transcript.load( + transcript_file, {char.name: char for char in characters} ) - - podcast = process_links( - urls_list, - transcript_file=transcript.name if transcript else None, - tts_model=tts_model, - generate_audio=not transcript_only, - config=config, - conversation_config=conversation_config, - ) - - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - if transcript_only: - transcript_file = output_dir / f"transcript_{uuid.uuid4().hex}.txt" - podcast.export_transcript(str(transcript_file)) - typer.echo(f"Transcript generated 
successfully: {transcript_file}") + podcast = Podcast.from_transcript(transcript, tts_backends, characters) else: - audio_file = output_dir / f"podcast_{uuid.uuid4().hex}.mp3" - podcast.save(str(audio_file)) - transcript_file = output_dir / f"transcript_{uuid.uuid4().hex}.txt" - podcast.export_transcript(str(transcript_file)) - typer.echo( - f"Podcast generated successfully using {tts_model} TTS model: {audio_file}" + logger.info(f"Processing {len(urls)} links") + content_extractor = ContentExtractor() + content_generator = DefaultPodcastifyTranscriptEngine( + config.GEMINI_API_KEY, conversation_config, is_local=is_local ) - typer.echo(f"Transcript saved to: {transcript_file}") - - except Exception as e: - typer.echo(f"An error occurred: {str(e)}", err=True) - raise typer.Exit(code=1) + contents = [content_extractor.extract_content(url) for url in urls] + llm_contents = [] + if contents: + llm_contents.append(LLMContent(value="\n\n".join(contents), type="text")) + if image_paths: + llm_contents.extend( + [LLMContent(value=image_path, type="image_path") for image_path in image_paths] + ) -if __name__ == "__main__": - app() - - -def generate_podcast( - urls: Optional[List[str]] = None, - url_file: Optional[str] = None, - transcript_file: Optional[str] = None, - tts_model: Optional[str] = None, - transcript_only: bool = False, - config: Optional[Dict[str, Any]] = None, - conversation_config: Optional[Dict[str, Any]] = None, -) -> Podcast: - """ - Generate a podcast or transcript from a list of URLs, a file containing URLs, or a transcript file. - - Args: - urls (Optional[List[str]]): List of URLs to process. - url_file (Optional[str]): Path to a file containing URLs, one per line. - transcript_file (Optional[str]): Path to a transcript file. - tts_model (Optional[str]): TTS model to use ('openai' or 'elevenlabs'). - transcript_only (bool): Generate only a transcript without audio. Defaults to False. 
- config (Optional[Dict[str, Any]]): User-provided configuration dictionary. - conversation_config (Optional[Dict[str, Any]]): User-provided conversation configuration dictionary. - - Returns: - Podcast: An instance of the Podcast class representing the generated podcast. - - Example: - >>> from podcastfy.client_v2 import generate_podcast - >>> podcast = generate_podcast( - ... urls=['https://example.com/article1', 'https://example.com/article2'], - ... tts_model='elevenlabs', - ... config={ - ... 'main': { - ... 'default_tts_model': 'elevenlabs' - ... }, - ... 'output_directories': { - ... 'audio': '/custom/path/to/audio', - ... 'transcripts': '/custom/path/to/transcripts' - ... } - ... }, - ... conversation_config={ - ... 'word_count': 150, - ... 'conversation_style': ['informal', 'friendly'], - ... 'podcast_name': 'My Custom Podcast' - ... } - ... ) - >>> podcast.save('/path/to/output.mp3') - >>> podcast.export_transcript('/path/to/transcript.txt') - """ - try: - default_config = load_config() - if config: - if isinstance(config, dict): - updated_config = Config() - updated_config.configure(**config) - default_config = updated_config - elif isinstance(config, Config): - default_config = config - else: - raise ValueError( - "Config must be either a dictionary or a Config object" - ) - main_config = default_config.config.get("main", {}) + podcast = Podcast( + content=llm_contents, + llm_backend=content_generator, + tts_backends=tts_backends, + characters=characters, + ) - if tts_model is None: - tts_model = main_config.get("default_tts_model", "openai") - urls_list = urls or [] - if url_file: - with open(url_file, "r") as file: - urls_list.extend([line.strip() for line in file if line.strip()]) + if generate_audio: + podcast.finalize() - if not urls_list and not transcript_file: - raise ValueError( - "No URLs or transcript provided. Please provide either 'urls', 'url_file', or 'transcript_file'." 
+ # for the sake of the tests currently in place, but in the future, we should remove this and return the podcast object + random_filename = f"podcast_{uuid.uuid4().hex}.mp3" + audio_file = os.path.join( + config.get("output_directories")["audio"], random_filename ) - - podcast = process_links( - urls_list, - transcript_file=transcript_file, - tts_model=tts_model, - generate_audio=not transcript_only, - config=default_config, - conversation_config=conversation_config, - ) + podcast.save(filepath=audio_file) + return audio_file + else: + podcast.build_transcript() return podcast - except Exception as e: - logger.error(f"An error occurred: {str(e)}") + logger.error(f"An error occurred in the process_content function: {str(e)}") raise From 08cccc1647b6e34274f5c479c5143422a1357bea Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 18:08:38 +0200 Subject: [PATCH 13/49] ending message --- podcastfy/content_generator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/podcastfy/content_generator.py b/podcastfy/content_generator.py index 01502aa2..9ff9f0af 100644 --- a/podcastfy/content_generator.py +++ b/podcastfy/content_generator.py @@ -71,6 +71,7 @@ def __init__( self.content_generator_config = self.config.get("content_generator", {}) self.config_conversation = load_conversation_config(conversation_config) + self.ending_message = self.config_conversation.get('text_to_speech')['ending_message'] def __compose_prompt(self, num_images: int): """ From 0eed1d4f440c17cf09816e2ef8a998ff8fd66a8a Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 18:17:00 +0200 Subject: [PATCH 14/49] two fixes --- podcastfy/aiengines/tts/tts_backends.py | 2 +- podcastfy/client_v2.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/podcastfy/aiengines/tts/tts_backends.py b/podcastfy/aiengines/tts/tts_backends.py index 1e4d4125..930b7224 100644 --- a/podcastfy/aiengines/tts/tts_backends.py +++ b/podcastfy/aiengines/tts/tts_backends.py @@ -46,7 +46,7 @@ async 
def async_text_to_speech(self, text: str, character: Character, output_pat model=config.extra_args.get('model', self.get_default_config().get('model', 'default')) ) with open(output_path, "wb") as out: - for chunk in content: + async for chunk in content: if chunk: out.write(chunk) diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index d8021439..a9c58733 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -97,7 +97,7 @@ def process_content_v2( characters = create_characters(conv_config.config_conversation) tts_backends = create_tts_backends(config) # filter out the tts backends that are not in the tts_model, temporary solution - tts_backends = [tts for tts in tts_backends if tts.name != tts_model] + tts_backends = [tts for tts in tts_backends if tts.name == tts_model] if transcript_file: logger.info(f"Using transcript file: {transcript_file}") transcript = Transcript.load( From cd1141c565526c4da82f0f97286693159436ba62 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 18:19:57 +0200 Subject: [PATCH 15/49] fix threads --- podcastfy/core/audio.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/podcastfy/core/audio.py b/podcastfy/core/audio.py index 2e57f394..f619bc61 100644 --- a/podcastfy/core/audio.py +++ b/podcastfy/core/audio.py @@ -83,7 +83,8 @@ def process_segment(segment_tuple: Tuple[TranscriptSegment, int]): with ThreadPoolExecutor(max_workers=self.n_jobs) as executor: - return list(executor.map(process_segment, transcript.segments)) + return list(executor.map(process_segment, + ((segment, i) for i, segment in enumerate(transcript.segments)))) def create_audio_segments(self, transcript: Transcript) -> List[PodcastsAudioSegment]: if self.has_async_backend: From 38db311e104135a7646be9d3423aa27172b74647 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 18:59:07 +0200 Subject: [PATCH 16/49] fix incorrect default path for configs --- podcastfy/aiengines/tts/base.py | 2 +- 
podcastfy/aiengines/tts/tts_backends.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/podcastfy/aiengines/tts/base.py b/podcastfy/aiengines/tts/base.py index 7b88c290..8a8ded3c 100644 --- a/podcastfy/aiengines/tts/base.py +++ b/podcastfy/aiengines/tts/base.py @@ -51,7 +51,7 @@ async def async_text_to_speech(self, text: str, character: Character, output_pat class TTSConfigMixin: """Mixin class to manage TTS external configurations.""" - def __init__(self, config_file: str = 'podcastfy/config.yaml', name: str = "") -> None: + def __init__(self, config_file: str = 'podcastfy/conversation_config.yaml', name: str = "") -> None: # TODO: probably bad config files for final client self.name = name self.config_file = config_file diff --git a/podcastfy/aiengines/tts/tts_backends.py b/podcastfy/aiengines/tts/tts_backends.py index 930b7224..1d0f7700 100644 --- a/podcastfy/aiengines/tts/tts_backends.py +++ b/podcastfy/aiengines/tts/tts_backends.py @@ -18,7 +18,7 @@ class ElevenLabsTTS(SyncTTSBackend, AsyncTTSBackend, TTSConfigMixin): name: str = "elevenlabs" - def __init__(self, api_key: str = None, config_file: str = 'podcastfy/config.yaml'): + def __init__(self, api_key: str = None, config_file: str = 'podcastfy/conversation_config.yaml'): # TODO: not the right path for final client TTSConfigMixin.__init__(self, config_file, name=self.name) self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY") @@ -76,7 +76,7 @@ def text_to_speech(self, text: str, character: Character, output_path: Path) -> class EdgeTTS(AsyncTTSBackend, TTSConfigMixin): name: str = "edge" - def __init__(self, config_file: str = 'podcastfy/config.yaml'): + def __init__(self, config_file: str = 'podcastfy/conversation_config.yaml'): TTSConfigMixin.__init__(self, config_file, name=self.name) async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> None: From 54e046b2062fb2fe64c3d0980f7004a107394f8c Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 
Oct 2024 22:14:52 +0200 Subject: [PATCH 17/49] better naming and fix an import --- podcastfy/aiengines/llm/base.py | 4 ++-- podcastfy/aiengines/llm/gemini_langchain.py | 4 ++-- podcastfy/aiengines/tts/base.py | 4 +++- podcastfy/client_v2.py | 21 ++++++++++++------- podcastfy/core/{llm_content.py => content.py} | 2 +- podcastfy/core/podcast.py | 18 ++++++++-------- 6 files changed, 31 insertions(+), 22 deletions(-) rename podcastfy/core/{llm_content.py => content.py} (83%) diff --git a/podcastfy/aiengines/llm/base.py b/podcastfy/aiengines/llm/base.py index 7223bb2d..071f79fe 100644 --- a/podcastfy/aiengines/llm/base.py +++ b/podcastfy/aiengines/llm/base.py @@ -2,7 +2,7 @@ from typing import List, Tuple from podcastfy.core.character import Character -from podcastfy.core.llm_content import LLMContent +from podcastfy.core.content import Content class LLMBackend(ABC): @@ -10,7 +10,7 @@ class LLMBackend(ABC): # TODO a nice mixin/helper could be made to load prompt templates from conf file (both podcast settings and character settings) @abstractmethod - def generate_transcript(self, content: List[LLMContent], characters: List[Character]) -> List[Tuple[Character, str]]: + def generate_transcript(self, content: List[Content], characters: List[Character]) -> List[Tuple[Character, str]]: """ Generate text based on a given prompt. 
diff --git a/podcastfy/aiengines/llm/gemini_langchain.py b/podcastfy/aiengines/llm/gemini_langchain.py index bb92bd22..ebd09e1f 100644 --- a/podcastfy/aiengines/llm/gemini_langchain.py +++ b/podcastfy/aiengines/llm/gemini_langchain.py @@ -19,7 +19,7 @@ from podcastfy.content_generator import ContentGenerator from podcastfy.core.character import Character from podcastfy.aiengines.llm.base import LLMBackend -from podcastfy.core.llm_content import LLMContent +from podcastfy.core.content import Content from podcastfy.utils.config_conversation import load_conversation_config from podcastfy.utils.config import load_config import logging @@ -180,7 +180,7 @@ def split_qa(self, input_text: str) -> List[Tuple[str, str]]: ] return processed_matches - def generate_transcript(self, content: List[LLMContent], characters: List[Character]) -> List[Tuple[Character, str]]: + def generate_transcript(self, content: List[Content], characters: List[Character]) -> List[Tuple[Character, str]]: image_file_paths = [c.value for c in content if c.type == 'image_path'] text_content = "\n\n".join(c.value for c in content if c.type == 'text') content = self.content_generator.generate_qa_content(text_content, image_file_paths, is_local=self.is_local) # ideally in the future we pass characters here diff --git a/podcastfy/aiengines/tts/base.py b/podcastfy/aiengines/tts/base.py index 8a8ded3c..bcda17a3 100644 --- a/podcastfy/aiengines/tts/base.py +++ b/podcastfy/aiengines/tts/base.py @@ -1,12 +1,14 @@ from abc import ABC, abstractmethod from pathlib import Path -from typing import Dict, Any, List +from typing import Dict, Any, List, Union import yaml from podcastfy.core.character import Character from podcastfy.core.tts_configs import TTSConfig +TTSBackend = Union["SyncTTSBackend", "AsyncTTSBackend"] + class SyncTTSBackend(ABC): """Protocol for synchronous Text-to-Speech backends.""" diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index a9c58733..c5cb62d6 100644 --- 
a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -5,9 +5,10 @@ from typing import List, Optional, Dict, Any, Union, Tuple from podcastfy.aiengines.llm.gemini_langchain import DefaultPodcastifyTranscriptEngine +from podcastfy.aiengines.tts.base import TTSBackend from podcastfy.aiengines.tts.tts_backends import OpenAITTS, ElevenLabsTTS, EdgeTTS from podcastfy.core.character import Character -from podcastfy.core.llm_content import LLMContent +from podcastfy.core.content import Content from podcastfy.core.podcast import Podcast, SyncTTSBackend, AsyncTTSBackend from podcastfy.core.transcript import Transcript from podcastfy.content_parser.content_extractor import ContentExtractor @@ -62,7 +63,7 @@ def create_characters(config: Dict[str, Any]) -> List[Character]: return [host, guest] -def create_tts_backends(config: Config) -> List[Union[SyncTTSBackend, AsyncTTSBackend]]: +def create_tts_backends(config: Config) -> List[TTSBackend]: return [ OpenAITTS(api_key=config.OPENAI_API_KEY), ElevenLabsTTS(api_key=config.ELEVENLABS_API_KEY), @@ -95,9 +96,7 @@ def process_content_v2( if conversation_config: conv_config.configure(conversation_config) characters = create_characters(conv_config.config_conversation) - tts_backends = create_tts_backends(config) - # filter out the tts backends that are not in the tts_model, temporary solution - tts_backends = [tts for tts in tts_backends if tts.name == tts_model] + tts_backends = obtain_tts_backend(config, tts_model) if transcript_file: logger.info(f"Using transcript file: {transcript_file}") transcript = Transcript.load( @@ -114,10 +113,10 @@ def process_content_v2( contents = [content_extractor.extract_content(url) for url in urls] llm_contents = [] if contents: - llm_contents.append(LLMContent(value="\n\n".join(contents), type="text")) + llm_contents.append(Content(value="\n\n".join(contents), type="text")) if image_paths: llm_contents.extend( - [LLMContent(value=image_path, type="image_path") for image_path in image_paths] + 
[Content(value=image_path, type="image_path") for image_path in image_paths] ) @@ -147,3 +146,11 @@ def process_content_v2( except Exception as e: logger.error(f"An error occurred in the process_content function: {str(e)}") raise + + +def obtain_tts_backend(config, tts_model): + # temporary solution + tts_backends = create_tts_backends(config) + # filter out the tts backends that are not in the tts_model, temporary solution + tts_backends = [tts for tts in tts_backends if tts.name == tts_model] + return tts_backends diff --git a/podcastfy/core/llm_content.py b/podcastfy/core/content.py similarity index 83% rename from podcastfy/core/llm_content.py rename to podcastfy/core/content.py index d9ecfe54..3fc6d704 100644 --- a/podcastfy/core/llm_content.py +++ b/podcastfy/core/content.py @@ -4,6 +4,6 @@ # we can do much better here, but for now, let's keep it simple -class LLMContent(BaseModel): +class Content(BaseModel): value: Any type: str \ No newline at end of file diff --git a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py index f6300421..7a60e8fb 100644 --- a/podcastfy/core/podcast.py +++ b/podcastfy/core/podcast.py @@ -8,10 +8,10 @@ from contextlib import contextmanager from podcastfy.aiengines.llm.base import LLMBackend -from podcastfy.aiengines.tts.base import SyncTTSBackend, AsyncTTSBackend +from podcastfy.aiengines.tts.base import SyncTTSBackend, AsyncTTSBackend, TTSBackend from podcastfy.core.audio import PodcastsAudioSegment, AudioManager from podcastfy.core.character import Character -from podcastfy.core.llm_content import LLMContent +from podcastfy.core.content import Content from podcastfy.core.transcript import TranscriptSegment, Transcript from podcastfy.core.tts_configs import TTSConfig @@ -55,8 +55,8 @@ def wrapper(self, *args, **kwargs): class Podcast: """Main class for podcast creation and management.""" - def __init__(self, content: List[LLMContent], llm_backend: LLMBackend, - tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], 
audio_temp_dir: Optional[Union[str, Path]] = None, + def __init__(self, content: List[Content], llm_backend: LLMBackend, + tts_backends: List[TTSBackend], audio_temp_dir: Optional[Union[str, Path]] = None, characters: Optional[List[Character]] = None, default_tts_n_jobs: int = 1) -> None: """ @@ -65,7 +65,7 @@ def __init__(self, content: List[LLMContent], llm_backend: LLMBackend, Args: content (str): The raw content to be processed into a podcast. llm_backend (LLMBackend): The language model backend for generating the transcript. - tts_backends (List[Union[SyncTTSBackend, AsyncTTSBackend]]): List of available TTS backends. + tts_backends (List[TTSBackend]): List of available TTS backends. audio_temp_dir (Optional[str]): Path to a temporary directory for audio files. If None, a temporary directory will be created. characters (List[Character]): List of characters participating in the podcast. @@ -77,7 +77,7 @@ def __init__(self, content: List[LLMContent], llm_backend: LLMBackend, """ self.content = content self.llm_backend = llm_backend - self.tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]] = {backend.name: backend for backend in tts_backends} + self.tts_backends: Dict[str, TTSBackend] = {backend.name: backend for backend in tts_backends} self.characters: Dict[str, Character] = {char.name: char for char in (characters or [Character("Host", "Podcast host", {}), Character("Guest", "Expert guest", {})])} self.default_tts_n_jobs = default_tts_n_jobs self.state = PodcastState.INITIALIZED @@ -95,7 +95,7 @@ def __init__(self, content: List[LLMContent], llm_backend: LLMBackend, # Initialize attributes with null values self.transcript: Optional[Transcript] = None self.audio_segments: List[PodcastsAudioSegment] = [] - self.audio: Optional[PydubAudioSegment] = None + self.audio: Optional[AudioSegment] = None # Define the sequence of methods to be called for each stage self._next_stage_methods: Dict[PodcastState, Callable[[], None]] = { @@ -268,7 +268,7 @@ def 
__init__(self, name: str): self.name = name def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: - audio = PydubAudioSegment.silent(duration=1000) + audio = AudioSegment.silent(duration=1000) audio.export(str(output_path), format="mp3") return output_path @@ -338,7 +338,7 @@ def text_to_speech(self, text: str, character: Character, output_path: Path) -> # Add a new audio segment (auto_finalize is True by default) with NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file: - PydubAudioSegment.silent(duration=500).export(temp_file.name, format="mp3") + AudioSegment.silent(duration=500).export(temp_file.name, format="mp3") with podcast.rework(PodcastState.AUDIO_SEGMENTS_BUILT): new_segment = PodcastsAudioSegment(Path(temp_file.name), 500, From a33e2f879c0aaa53d7f6b8afdc75df0a88387c88 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 22:15:09 +0200 Subject: [PATCH 18/49] fix argument type --- podcastfy/aiengines/tts/tts_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/podcastfy/aiengines/tts/tts_backends.py b/podcastfy/aiengines/tts/tts_backends.py index 1d0f7700..37cbf65c 100644 --- a/podcastfy/aiengines/tts/tts_backends.py +++ b/podcastfy/aiengines/tts/tts_backends.py @@ -82,7 +82,7 @@ def __init__(self, config_file: str = 'podcastfy/conversation_config.yaml'): async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> None: config = self.tts_config_for_character(character) communicate = edge_tts.Communicate(text, config.voice) - await communicate.save(output_path) + await communicate.save(str(output_path)) From afbe769e8198d619e2cdef52708526a45ab1be53 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 22:15:30 +0200 Subject: [PATCH 19/49] more compat --- podcastfy/client_v2.py | 4 ++-- podcastfy/content_generator.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index 
c5cb62d6..088b8d20 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -138,11 +138,11 @@ def process_content_v2( config.get("output_directories")["audio"], random_filename ) podcast.save(filepath=audio_file) - return audio_file + return audio_file # note: should return the podcast object instead, but for the sake of the tests, we return the audio file else: podcast.build_transcript() - return podcast + return None # note: should return the podcast object instead, but for the sake of the tests, we return None except Exception as e: logger.error(f"An error occurred in the process_content function: {str(e)}") raise diff --git a/podcastfy/content_generator.py b/podcastfy/content_generator.py index 9ff9f0af..5f3c190f 100644 --- a/podcastfy/content_generator.py +++ b/podcastfy/content_generator.py @@ -71,7 +71,7 @@ def __init__( self.content_generator_config = self.config.get("content_generator", {}) self.config_conversation = load_conversation_config(conversation_config) - self.ending_message = self.config_conversation.get('text_to_speech')['ending_message'] + self.ending_message = self.config_conversation.get('text_to_speech').get('ending_message','') def __compose_prompt(self, num_images: int): """ From 267a3590f81f5a26cf6835c57d258f3d15b31bb1 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 22:15:43 +0200 Subject: [PATCH 20/49] add interogation --- podcastfy/core/transcript.py | 1 + 1 file changed, 1 insertion(+) diff --git a/podcastfy/core/transcript.py b/podcastfy/core/transcript.py index 952fa2be..e2baec2d 100644 --- a/podcastfy/core/transcript.py +++ b/podcastfy/core/transcript.py @@ -56,6 +56,7 @@ def _parse_legacy_transcript(content: str) -> List[Tuple[str, str]]: @classmethod def load(cls, filepath: str, characters: Dict[str, Character]) -> 'Transcript': """Load a transcript from a JSON file.""" + # There are a loss of characters informations when loading a transcript, is it acceptable? 
with open(filepath, 'r') as f: content = f.read() From 6084e41f6e6b3672567c36128824722be41eeddf Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 22:15:54 +0200 Subject: [PATCH 21/49] fix test --- tests/test_generate_podcast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_generate_podcast.py b/tests/test_generate_podcast.py index b9699e6f..811d848f 100644 --- a/tests/test_generate_podcast.py +++ b/tests/test_generate_podcast.py @@ -77,7 +77,7 @@ def test_generate_podcast_from_transcript_file(sample_config): # First, generate a transcript transcript_file = os.path.join(sample_config.get('output_directories', {}).get('transcripts'), 'test_transcript.txt') with open(transcript_file, 'w') as f: - f.write("Joe Biden and the US PoliticsJoe Biden is the current president of the United States of America") + f.write("Joe Biden and the US PoliticsJoe Biden is the current president of the United States of America") # Now use this transcript to generate a podcast audio_file = generate_podcast( From 91b726b92c96d02a437e8851dcc22b6772d088bd Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 23:14:34 +0200 Subject: [PATCH 22/49] add todo temp --- must_do_before_merge.txt | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 must_do_before_merge.txt diff --git a/must_do_before_merge.txt b/must_do_before_merge.txt new file mode 100644 index 00000000..b7f12b1f --- /dev/null +++ b/must_do_before_merge.txt @@ -0,0 +1,5 @@ +- one test or two on the Podcast Class +- delete client_v2 and merge it with client +- check that all config options are taken +- remove the excessive prints +- ... ? 
From 5e633aa9a2cd1893bf924e45c902ff823e48aec5 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 23:15:16 +0200 Subject: [PATCH 23/49] add todo temp --- must_do_before_merge.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/must_do_before_merge.txt b/must_do_before_merge.txt index b7f12b1f..f845252a 100644 --- a/must_do_before_merge.txt +++ b/must_do_before_merge.txt @@ -2,4 +2,5 @@ - delete client_v2 and merge it with client - check that all config options are taken - remove the excessive prints +- check that all tts work - ... ? From 96e7db4d58b2afea7f9eb1e64141485d0cac57e3 Mon Sep 17 00:00:00 2001 From: Tharsis Souza Date: Wed, 16 Oct 2024 18:30:24 -0300 Subject: [PATCH 24/49] Update must_do_before_merge.txt --- must_do_before_merge.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/must_do_before_merge.txt b/must_do_before_merge.txt index f845252a..24614f0b 100644 --- a/must_do_before_merge.txt +++ b/must_do_before_merge.txt @@ -4,3 +4,5 @@ - remove the excessive prints - check that all tts work - ... ? +- 100% of current pytest unit tests pass +- 100% of of CLI case scenarios from usage/cli.md From 9703997f0c18ade69fd095f6bbed472b12446940 Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 07:35:17 +0200 Subject: [PATCH 25/49] tests the podcast class --- must_do_before_merge.txt | 2 +- podcastfy/core/podcast.py | 4 +- podcastfy/core/transcript.py | 2 +- tests/test_core_api.py | 168 +++++++++++++++++++++++++++++++++++ 4 files changed, 172 insertions(+), 4 deletions(-) create mode 100644 tests/test_core_api.py diff --git a/must_do_before_merge.txt b/must_do_before_merge.txt index 24614f0b..38e14f3f 100644 --- a/must_do_before_merge.txt +++ b/must_do_before_merge.txt @@ -4,5 +4,5 @@ - remove the excessive prints - check that all tts work - ... ? 
-- 100% of current pytest unit tests pass +- 100% of current pytest unit tests pass [x] (except for test_generate_podcast_with_custom_config, exhausted credits) - 100% of of CLI case scenarios from usage/cli.md diff --git a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py index 7a60e8fb..6946e138 100644 --- a/podcastfy/core/podcast.py +++ b/podcastfy/core/podcast.py @@ -226,7 +226,7 @@ def save(self, filepath: str) -> None: def export_transcript(self, filepath: str, format_: str = "plaintext") -> None: """Save the podcast transcript to a file.""" - if self.state < PodcastState.TRANSCRIPT_BUILT: + if self.state.value < PodcastState.TRANSCRIPT_BUILT.value: raise ValueError("Transcript can only be saved after it is built") if self.transcript: @@ -236,7 +236,7 @@ def export_transcript(self, filepath: str, format_: str = "plaintext") -> None: def dump_transcript(self, filepath: str) -> None: """Dump the podcast transcript to a JSON file.""" - if self.state < PodcastState.TRANSCRIPT_BUILT: + if self.state.value < PodcastState.TRANSCRIPT_BUILT.value: raise ValueError("Transcript can only be dumped after it is built") if self.transcript: diff --git a/podcastfy/core/transcript.py b/podcastfy/core/transcript.py index e2baec2d..7cbac381 100644 --- a/podcastfy/core/transcript.py +++ b/podcastfy/core/transcript.py @@ -28,7 +28,7 @@ def from_dict(cls, data: Dict[str, Any], characters: Dict[str, Character]) -> 'T class Transcript: - def __init__(self, segments: List[TranscriptSegment], metadata: Dict[str, Any]) -> None: + def __init__(self, segments: List[TranscriptSegment], metadata: Dict[str, Any] = {}) -> None: self.segments = segments self.metadata = metadata diff --git a/tests/test_core_api.py b/tests/test_core_api.py new file mode 100644 index 00000000..fba450d1 --- /dev/null +++ b/tests/test_core_api.py @@ -0,0 +1,168 @@ +"""Tests for the core API of the podcastfy package. 
Not e2e tests as DummyTTSBackend is used to simulate the TTS backend and DummyLLMBackend is used to simulate the LLM backend.""" +import pytest +from pathlib import Path +from pydub import AudioSegment + +from podcastfy.core.content import Content +from podcastfy.core.podcast import Podcast, PodcastState +from podcastfy.aiengines.llm.base import LLMBackend +from podcastfy.aiengines.tts.base import SyncTTSBackend +from podcastfy.core.character import Character +from podcastfy.core.tts_configs import TTSConfig +from podcastfy.core.transcript import TranscriptSegment, Transcript + + +class DummyLLMBackend(LLMBackend): + def generate_transcript(self, content, characters): + return [ + (characters[0], "Welcome to our podcast!"), + (characters[1], "Thanks for having me!") + ] + + +class DummyTTSBackend(SyncTTSBackend): + def __init__(self, name: str): + self.name = name + + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + audio = AudioSegment.silent(duration=1000) + audio.export(str(output_path), format="mp3") + return output_path + + +@pytest.fixture +def tts_backends(): + return [DummyTTSBackend("openai"), DummyTTSBackend("elevenlabs")] + + +@pytest.fixture +def characters(): + host = Character( + name="Host", + role="Podcast host", + tts_configs={ + "openai": TTSConfig(voice="en-US-Neural2-F", backend="openai", extra_args={"speaking_rate": 1.0}), + "elevenlabs": TTSConfig(voice="Rachel", backend="elevenlabs", extra_args={"stability": 0.5}) + }, + default_description_for_llm="{name} is an enthusiastic podcast host. Speaks clearly and engagingly." + ) + + guest = Character( + name="Guest", + role="Expert guest", + tts_configs={ + "openai": TTSConfig(voice="en-US-Neural2-D", backend="openai", extra_args={"pitch": -2.0}), + "elevenlabs": TTSConfig(voice="Antoni", backend="elevenlabs", extra_args={"stability": 0.8}) + }, + default_description_for_llm="{name} is an expert guest. Shares knowledge in a friendly manner." 
+ ) + + return [host, guest] + + +@pytest.fixture +def podcast(tts_backends, characters): + return Podcast( + content=[Content(value="This is a sample content for our podcast.", type="text")], + llm_backend=DummyLLMBackend(), + tts_backends=tts_backends, + characters=characters, + ) + + +def test_podcast_initialization(podcast): + assert podcast.state == PodcastState.INITIALIZED + assert podcast.transcript is None + assert podcast.audio_segments == [] + assert podcast.audio is None + + +def test_build_transcript(podcast): + podcast.build_transcript() + assert podcast.state == PodcastState.TRANSCRIPT_BUILT + assert isinstance(podcast.transcript, Transcript) + assert len(podcast.transcript.segments) == 2 + + +def test_build_audio_segments(podcast): + podcast.build_transcript() + podcast.build_audio_segments() + assert podcast.state == PodcastState.AUDIO_SEGMENTS_BUILT + assert len(podcast.audio_segments) == 2 + + +def test_stitch_audio_segments(podcast): + podcast.build_transcript() + podcast.build_audio_segments() + podcast.stitch_audio_segments() + assert podcast.state == PodcastState.STITCHED + assert isinstance(podcast.audio, AudioSegment) + + +def test_finalize(podcast): + podcast.finalize() + assert podcast.state == PodcastState.STITCHED + assert isinstance(podcast.transcript, Transcript) + assert len(podcast.audio_segments) > 0 + assert isinstance(podcast.audio, AudioSegment) + + +def test_save(podcast, tmp_path): + podcast.finalize() + output_file = tmp_path / "test_podcast.mp3" + podcast.save(str(output_file)) + assert output_file.exists() + + +def test_export_transcript(podcast, tmp_path): + podcast.finalize() + output_file = tmp_path / "test_transcript.txt" + podcast.export_transcript(str(output_file), format_="plaintext") + assert output_file.exists() + + +def test_rework(podcast): + podcast.finalize() + + with podcast.rework(PodcastState.TRANSCRIPT_BUILT): + assert podcast.state == PodcastState.TRANSCRIPT_BUILT + podcast.transcript.segments.append( + 
TranscriptSegment("This is a new segment", podcast.characters["Host"])) + + assert podcast.state == PodcastState.STITCHED + assert len(podcast.transcript.segments) == 3 + + +def test_from_transcript(tts_backends, characters): + pre_existing_transcript = [ + ("Host", "Welcome to our podcast created from a pre-existing transcript!"), + ("Guest", "Thank you for having me. I'm excited to be here.") + ] + + podcast = Podcast.from_transcript( + transcript=Transcript([ + TranscriptSegment(text, characters[0] if speaker == "Host" else characters[1]) + for speaker, text in pre_existing_transcript + ]), + tts_backends=tts_backends, + characters=characters + ) + + assert podcast.state == PodcastState.TRANSCRIPT_BUILT + assert len(podcast.transcript.segments) == 2 + + podcast.finalize() + assert podcast.state == PodcastState.STITCHED + + +def test_load_transcript(tts_backends, characters, tmp_path): + # Create a dummy transcript file + transcript_file = tmp_path / "test_transcript.json" + Transcript([ + TranscriptSegment("Welcome to our podcast!", characters[0]), + TranscriptSegment("Thank you for having me!", characters[1]) + ]).dump(str(transcript_file)) + + podcast = Podcast.load_transcript(str(transcript_file), tts_backends, characters) + assert podcast.state == PodcastState.TRANSCRIPT_BUILT + assert len(podcast.transcript.segments) == 2 \ No newline at end of file From 317c7311edbe967cefba7a9092c47e8b6fe2b8aa Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 16:29:39 +0200 Subject: [PATCH 26/49] add compat with transcript saving --- podcastfy/client_v2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index 088b8d20..d6c28fae 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -134,9 +134,11 @@ def process_content_v2( # for the sake of the tests currently in place, but in the future, we should remove this and return the podcast object random_filename = 
f"podcast_{uuid.uuid4().hex}.mp3" + directories = config.get("output_directories") audio_file = os.path.join( - config.get("output_directories")["audio"], random_filename + directories["audio"], random_filename ) + podcast.transcript.export(directories["transcripts"]) podcast.save(filepath=audio_file) return audio_file # note: should return the podcast object instead, but for the sake of the tests, we return the audio file else: From 8fb7aa3e1f4673a9729be49c7dfddfdf20d54498 Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 19:15:20 +0200 Subject: [PATCH 27/49] fix bug and signature of TTS --- podcastfy/aiengines/tts/base.py | 4 ++-- podcastfy/core/audio.py | 7 ++++--- tests/test_transcript.py | 0 3 files changed, 6 insertions(+), 5 deletions(-) create mode 100644 tests/test_transcript.py diff --git a/podcastfy/aiengines/tts/base.py b/podcastfy/aiengines/tts/base.py index bcda17a3..f251a8b2 100644 --- a/podcastfy/aiengines/tts/base.py +++ b/podcastfy/aiengines/tts/base.py @@ -16,7 +16,7 @@ class SyncTTSBackend(ABC): name: str @abstractmethod - def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + def text_to_speech(self, text: str, character: Character, output_path: Path) -> None: """ Convert text to speech synchronously. @@ -37,7 +37,7 @@ class AsyncTTSBackend(ABC): name: str @abstractmethod - async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> None: """ Convert text to speech asynchronously. 
diff --git a/podcastfy/core/audio.py b/podcastfy/core/audio.py index f619bc61..663f9d59 100644 --- a/podcastfy/core/audio.py +++ b/podcastfy/core/audio.py @@ -74,12 +74,13 @@ def _sync_build_audio_segments(self, transcript: Transcript) -> List[PodcastsAud def process_segment(segment_tuple: Tuple[TranscriptSegment, int]): segment, index = segment_tuple tts_backend = self.get_tts_backend(segment) - audio_file = cast(SyncTTSBackend, tts_backend).text_to_speech( + filepath = Path(str(self.temp_dir)) / f"{self.file_prefix}{index:04d}.mp3" + cast(SyncTTSBackend, tts_backend).text_to_speech( segment.text, segment.speaker, - Path(str(self.temp_dir)) / f"{self.file_prefix}{index:04d}.mp3" + filepath ) - return PodcastsAudioSegment(audio_file, segment) + return PodcastsAudioSegment(filepath, segment) with ThreadPoolExecutor(max_workers=self.n_jobs) as executor: diff --git a/tests/test_transcript.py b/tests/test_transcript.py new file mode 100644 index 00000000..e69de29b From 9dcfeda565185ae350933e4370458a170f713b3f Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 19:15:57 +0200 Subject: [PATCH 28/49] clean markup at TranscriptSegment place --- podcastfy/core/transcript.py | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/podcastfy/core/transcript.py b/podcastfy/core/transcript.py index 7cbac381..eec29bf6 100644 --- a/podcastfy/core/transcript.py +++ b/podcastfy/core/transcript.py @@ -5,9 +5,39 @@ from podcastfy.core.character import Character +def clean_markups(input_text: str) -> str: + """ + Remove unsupported TSS markup tags from the input text while preserving supported SSML tags. + + Args: + input_text (str): The input text containing TSS markup tags. + + Returns: + str: Cleaned text with unsupported TSS markup tags removed. 
+ """ + # List of SSML tags supported by both OpenAI and ElevenLabs + supported_tags = [ + 'speak', 'lang', 'p', 'phoneme', + 's', 'say-as', 'sub' + ] + # Append additional tags to the supported tags list + # Create a pattern that matches any tag not in the supported list + pattern = r']+>' + + # Remove unsupported tags + cleaned_text = re.sub(pattern, '', input_text) + + # Remove any leftover empty lines + cleaned_text = re.sub(r'\n\s*\n', '\n', cleaned_text) + cleaned_text = cleaned_text.replace('(scratchpad)', '') + return cleaned_text + + class TranscriptSegment: - def __init__(self, text: str, speaker: Character, tts_args: Optional[Dict[str, Any]] = None) -> None: - self.text = text + def __init__(self, text: str, speaker: Character, + tts_args: Optional[Dict[str, Any]] = None, + auto_clean_markup=True) -> None: + self.text = clean_markups(text) if auto_clean_markup else text self.speaker = speaker self.tts_args = tts_args or {} From 5573adcbd5882aed282d1a14f7f674657780aad2 Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 19:17:13 +0200 Subject: [PATCH 29/49] save transcript automatically for compat sake --- podcastfy/client_v2.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index d6c28fae..7f598ebc 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -133,12 +133,14 @@ def process_content_v2( podcast.finalize() # for the sake of the tests currently in place, but in the future, we should remove this and return the podcast object - random_filename = f"podcast_{uuid.uuid4().hex}.mp3" + random_filename_no_suffix = f"podcast_{uuid.uuid4().hex}" + random_filename_mp3 = f"{random_filename_no_suffix}.mp3" + random_filename_transcript = f"{random_filename_no_suffix}.txt" directories = config.get("output_directories") audio_file = os.path.join( - directories["audio"], random_filename + directories["audio"], random_filename_mp3 ) - 
podcast.transcript.export(directories["transcripts"]) + podcast.transcript.export(os.path.join(directories["transcripts"], random_filename_transcript)) podcast.save(filepath=audio_file) return audio_file # note: should return the podcast object instead, but for the sake of the tests, we return the audio file else: From 7454ea33465e34f0a9090a3a5691a255f1339905 Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 19:17:36 +0200 Subject: [PATCH 30/49] better print --- podcastfy/core/podcast.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py index 6946e138..2e0c40ff 100644 --- a/podcastfy/core/podcast.py +++ b/podcastfy/core/podcast.py @@ -34,7 +34,8 @@ def probably_same_func(method, func): @wraps(func) def wrapper(self, *args, **kwargs): current_method = self._next_stage_methods[self.state] - print(f"Executing {func.__name__} in state {self.state.name}") + print(f"Current state: {self.state.name}") + print(f"Executing: {func.__name__}") if not probably_same_func(current_method, func) and not self._reworking: print(f"Cannot execute {func.__name__} in current state {self.state.name}. Skipping.") raise Exception(f"Cannot execute {func.__name__} in current state {self.state.name}") @@ -43,7 +44,7 @@ def wrapper(self, *args, **kwargs): result = func(self, *args, **kwargs) next_state = PodcastState(self.state.value + 1) self.state = next_state or self.state - print(f"Transitioned to state {self.state.name}") + print(f"Done! 
Current State: {self.state.name}") return result except Exception as e: print(f"Error in {func.__name__}: {str(e)}") @@ -178,6 +179,9 @@ def build_transcript(self) -> None: if speaker.name in self.characters: tts_config = cast(Dict[str, Any], self.characters[speaker.name].tts_configs.get(self.characters[speaker.name].preferred_tts, {})) segments.append(TranscriptSegment(text, self.characters[speaker.name], tts_config)) + else: + print(f"Invalid segment: {segment}") + continue # If the segment doesn't match the expected format, we'll skip it self.transcript = Transcript(segments, {"source": "Generated content"}) From 034b19343fe7b246ceaf8d8da9d2b6c63f04fdff Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 19:38:46 +0200 Subject: [PATCH 31/49] tests, but one fails --- tests/test_transcript.py | 87 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/tests/test_transcript.py b/tests/test_transcript.py index e69de29b..cb01e395 100644 --- a/tests/test_transcript.py +++ b/tests/test_transcript.py @@ -0,0 +1,87 @@ +import pytest +from podcastfy.core.transcript import clean_markups, TranscriptSegment, Transcript, Character +from unittest.mock import patch, mock_open + +@pytest.fixture +def characters(): + character1 = Character("Person1", "John Doe", {}) + character2 = Character("Person2", "Jane Smith", {}) + return {"Person1": character1, "Person2": character2} + +def test_clean_markups(): + input_text = "Hello World. This is a test" + expected_output = "Hello World. 
This is a test" + assert clean_markups(input_text) == expected_output + +def test_clean_markups_with_scratchpad(): + input_text = "Hello (scratchpad)World" + expected_output = "Hello World" + assert clean_markups(input_text) == expected_output + +def test_transcript_segment_init(characters): + segment = TranscriptSegment("Hello World Test", characters["Person1"]) + assert segment.text == "Hello World Test" + assert segment.speaker == characters["Person1"] + +def test_transcript_segment_to_dict(characters): + segment = TranscriptSegment("Hello World", characters["Person1"], {"voice_id": "test_voice"}) + expected_dict = { + "text": "Hello World", + "speaker": "Person1", + "tts_args": {"voice_id": "test_voice"} + } + assert segment.to_dict() == expected_dict + +def test_transcript_segment_from_dict(characters): + data = { + "text": "Hello World", + "speaker": "Person1", + "tts_args": {"voice_id": "test_voice"} + } + segment = TranscriptSegment.from_dict(data, characters) + assert segment.text == "Hello World" + assert segment.speaker == characters["Person1"] + assert segment.tts_args == {"voice_id": "test_voice"} + +def test_transcript_init(characters): + segments = [ + TranscriptSegment("Hello", characters["Person1"]), + TranscriptSegment("Hi there", characters["Person2"]) + ] + transcript = Transcript(segments, {"title": "Test Transcript"}) + assert len(transcript.segments) == 2 + assert transcript.metadata == {"title": "Test Transcript"} + +def test_transcript_to_dict(characters): + segments = [ + TranscriptSegment("Hello", characters["Person1"]), + TranscriptSegment("Hi there", characters["Person2"]) + ] + transcript = Transcript(segments, {"title": "Test Transcript"}) + expected_dict = { + "segments": [ + {"text": "Hello", "speaker": "Person1", "tts_args": {}}, + {"text": "Hi there", "speaker": "Person2", "tts_args": {}} + ], + "metadata": {"title": "Test Transcript"} + } + assert transcript.to_dict() == expected_dict + 
+@pytest.mark.parametrize("file_content,expected_segments", [ + ('{"segments": [{"text": "Hello", "speaker": "Person1", "tts_args": {}}], "metadata": {}}', 1), + ('Hello\nHi there', 2) +]) +def test_transcript_load(file_content, expected_segments, characters): + with patch('builtins.open', new_callable=mock_open, read_data=file_content): + transcript = Transcript.load("fake_path.json", characters) + assert len(transcript.segments) == expected_segments + assert transcript.segments[0].speaker == characters["Person1"] + +def test_transcript_str(characters): + segments = [ + TranscriptSegment("Hello", characters["Person1"]), + TranscriptSegment("Hi there", characters["Person2"]) + ] + transcript = Transcript(segments, {"title": "Test Transcript"}) + expected_str = "Metadata:\ntitle: Test Transcript\n\nTranscript:\nPerson1: Hello\nPerson2: Hi there" + assert str(transcript) == expected_str \ No newline at end of file From 0aa7070ec27d9b1d13d39b91907de8b7105a1c06 Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 20:42:50 +0200 Subject: [PATCH 32/49] fix regex ? --- podcastfy/core/transcript.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/podcastfy/core/transcript.py b/podcastfy/core/transcript.py index eec29bf6..cf1c4c66 100644 --- a/podcastfy/core/transcript.py +++ b/podcastfy/core/transcript.py @@ -22,7 +22,7 @@ def clean_markups(input_text: str) -> str: ] # Append additional tags to the supported tags list # Create a pattern that matches any tag not in the supported list - pattern = r']+>' + pattern = r'<(?!(?:/?' 
+ '|'.join(supported_tags) + r')\b)[^>]+>' # Remove unsupported tags cleaned_text = re.sub(pattern, '', input_text) From b7fe017c622931d6ee98ce328de737c1d8cac34d Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 20:51:22 +0200 Subject: [PATCH 33/49] private static method --- podcastfy/core/transcript.py | 57 ++++++++++++++++++------------------ tests/test_transcript.py | 6 ++-- 2 files changed, 32 insertions(+), 31 deletions(-) diff --git a/podcastfy/core/transcript.py b/podcastfy/core/transcript.py index cf1c4c66..b8860a22 100644 --- a/podcastfy/core/transcript.py +++ b/podcastfy/core/transcript.py @@ -5,42 +5,43 @@ from podcastfy.core.character import Character -def clean_markups(input_text: str) -> str: - """ - Remove unsupported TSS markup tags from the input text while preserving supported SSML tags. - - Args: - input_text (str): The input text containing TSS markup tags. - - Returns: - str: Cleaned text with unsupported TSS markup tags removed. - """ - # List of SSML tags supported by both OpenAI and ElevenLabs - supported_tags = [ - 'speak', 'lang', 'p', 'phoneme', - 's', 'say-as', 'sub' - ] - # Append additional tags to the supported tags list - # Create a pattern that matches any tag not in the supported list - pattern = r'<(?!(?:/?' 
+ '|'.join(supported_tags) + r')\b)[^>]+>' - - # Remove unsupported tags - cleaned_text = re.sub(pattern, '', input_text) - - # Remove any leftover empty lines - cleaned_text = re.sub(r'\n\s*\n', '\n', cleaned_text) - cleaned_text = cleaned_text.replace('(scratchpad)', '') - return cleaned_text - class TranscriptSegment: def __init__(self, text: str, speaker: Character, tts_args: Optional[Dict[str, Any]] = None, auto_clean_markup=True) -> None: - self.text = clean_markups(text) if auto_clean_markup else text + self.text = self._clean_markups(text) if auto_clean_markup else text self.speaker = speaker self.tts_args = tts_args or {} + @staticmethod + def _clean_markups(input_text: str) -> str: + """ + Remove unsupported TSS markup tags from the input text while preserving supported SSML tags. + + Args: + input_text (str): The input text containing TSS markup tags. + + Returns: + str: Cleaned text with unsupported TSS markup tags removed. + """ + # List of SSML tags supported by both OpenAI and ElevenLabs + supported_tags = [ + 'speak', 'speak', 'lang', 'p', 'phoneme', + 's', 'say-as', 'sub' + ] + # Append additional tags to the supported tags list + # Create a pattern that matches any tag not in the supported list + pattern = r'<(?!(?:/?' 
+ '|'.join(supported_tags) + r')\b)[^>]+>' + + # Remove unsupported tags + cleaned_text = re.sub(pattern, '', input_text) + + # Remove any leftover empty lines + cleaned_text = re.sub(r'\n\s*\n', '\n', cleaned_text) + cleaned_text = cleaned_text.replace('(scratchpad)', '') + return cleaned_text + def to_dict(self) -> Dict[str, Any]: return { "text": self.text, diff --git a/tests/test_transcript.py b/tests/test_transcript.py index cb01e395..1af5696e 100644 --- a/tests/test_transcript.py +++ b/tests/test_transcript.py @@ -1,5 +1,5 @@ import pytest -from podcastfy.core.transcript import clean_markups, TranscriptSegment, Transcript, Character +from podcastfy.core.transcript import TranscriptSegment, Transcript, Character from unittest.mock import patch, mock_open @pytest.fixture @@ -11,12 +11,12 @@ def characters(): def test_clean_markups(): input_text = "Hello World. This is a test" expected_output = "Hello World. This is a test" - assert clean_markups(input_text) == expected_output + assert TranscriptSegment._clean_markups(input_text) == expected_output def test_clean_markups_with_scratchpad(): input_text = "Hello (scratchpad)World" expected_output = "Hello World" - assert clean_markups(input_text) == expected_output + assert TranscriptSegment._clean_markups(input_text) == expected_output def test_transcript_segment_init(characters): segment = TranscriptSegment("Hello World Test", characters["Person1"]) From bcda52be809cf1aea2c58aa1122bbcff80297a07 Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 21:47:21 +0200 Subject: [PATCH 34/49] add comment --- podcastfy/core/transcript.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/podcastfy/core/transcript.py b/podcastfy/core/transcript.py index b8860a22..785bd55d 100644 --- a/podcastfy/core/transcript.py +++ b/podcastfy/core/transcript.py @@ -80,6 +80,8 @@ def dump(self, filepath: str) -> None: @staticmethod def _parse_legacy_transcript(content: str) -> List[Tuple[str, str]]: + # in 
the future, Person should be replaced by any character name, but for now, it's Person + # this is tricky because we don't want to take a random tag as a character name, but maybe it's ok to assume that the first tag of each line is the character name pattern = r'\s*(.*?)\s*' matches = re.findall(pattern, content, re.DOTALL) return [('Person' + person_num, text) for person_num, text in matches] @@ -117,11 +119,9 @@ def to_dict(self) -> Dict[str, Any]: } def __str__(self) -> str: - """Convert the transcript to a string representation.""" + """Convert the transcript to a xml representation.""" lines = [] for segment in self.segments: - lines.append(f"{segment.speaker.name}: {segment.text}") + lines.append(f'<{segment.speaker.name}>{segment.text}') + return '\n'.join(lines) - metadata_str = "\n".join([f"{key}: {value}" for key, value in self.metadata.items()]) - - return f"Metadata:\n{metadata_str}\n\nTranscript:\n" + "\n".join(lines) From b44a1b798f515c5e9150b55697fb16c1b45a1f6f Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 22:02:55 +0200 Subject: [PATCH 35/49] its currently expected that transcript are automatically saved --- podcastfy/client_v2.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index 7f598ebc..aa39457a 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -128,15 +128,14 @@ def process_content_v2( characters=characters, ) - + directories = config.get("output_directories") + random_filename_no_suffix = f"podcast_{uuid.uuid4().hex}" + random_filename_mp3 = f"{random_filename_no_suffix}.mp3" + random_filename_transcript = f"{random_filename_no_suffix}.txt" if generate_audio: podcast.finalize() # for the sake of the tests currently in place, but in the future, we should remove this and return the podcast object - random_filename_no_suffix = f"podcast_{uuid.uuid4().hex}" - random_filename_mp3 = f"{random_filename_no_suffix}.mp3" - 
random_filename_transcript = f"{random_filename_no_suffix}.txt" - directories = config.get("output_directories") audio_file = os.path.join( directories["audio"], random_filename_mp3 ) @@ -145,6 +144,7 @@ def process_content_v2( return audio_file # note: should return the podcast object instead, but for the sake of the tests, we return the audio file else: podcast.build_transcript() + podcast.transcript.export(os.path.join(directories["transcripts"], random_filename_transcript)) return None # note: should return the podcast object instead, but for the sake of the tests, we return None except Exception as e: From 8ca5fafe5a68e8445ff4677afd6262ece163b197 Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 22:04:13 +0200 Subject: [PATCH 36/49] less noise --- podcastfy/core/podcast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py index 2e0c40ff..7e660463 100644 --- a/podcastfy/core/podcast.py +++ b/podcastfy/core/podcast.py @@ -44,7 +44,7 @@ def wrapper(self, *args, **kwargs): result = func(self, *args, **kwargs) next_state = PodcastState(self.state.value + 1) self.state = next_state or self.state - print(f"Done! 
Current State: {self.state.name}") + print(f"Done!") return result except Exception as e: print(f"Error in {func.__name__}: {str(e)}") From fe55253915cfaaaa3084382199f86df3e2c7da34 Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 22:39:03 +0200 Subject: [PATCH 37/49] fix transcript --- must_do_before_merge.txt | 12 ++++++------ tests/test_transcript.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/must_do_before_merge.txt b/must_do_before_merge.txt index 38e14f3f..b2526da1 100644 --- a/must_do_before_merge.txt +++ b/must_do_before_merge.txt @@ -1,8 +1,8 @@ -- one test or two on the Podcast Class -- delete client_v2 and merge it with client -- check that all config options are taken -- remove the excessive prints -- check that all tts work +- one test or two on the Podcast Class [x] +- delete client_v2 and merge it with client [] will be done during PR +- check that all config options are taken [x] +- remove the excessive prints [x] +- check that all tts work [x] - ... ? 
- 100% of current pytest unit tests pass [x] (except for test_generate_podcast_with_custom_config, exhausted credits) -- 100% of of CLI case scenarios from usage/cli.md +- 100% of of CLI case scenarios from usage/cli.md [x] except local diff --git a/tests/test_transcript.py b/tests/test_transcript.py index 1af5696e..c60ac128 100644 --- a/tests/test_transcript.py +++ b/tests/test_transcript.py @@ -83,5 +83,5 @@ def test_transcript_str(characters): TranscriptSegment("Hi there", characters["Person2"]) ] transcript = Transcript(segments, {"title": "Test Transcript"}) - expected_str = "Metadata:\ntitle: Test Transcript\n\nTranscript:\nPerson1: Hello\nPerson2: Hi there" + expected_str = "Hello\nHi there" assert str(transcript) == expected_str \ No newline at end of file From 977f78e28b9dd528d2b4083cf2103c715bbabf7f Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 22:40:14 +0200 Subject: [PATCH 38/49] remove obsolete todos, and reformulate a todo --- podcastfy/aiengines/tts/base.py | 3 +-- podcastfy/aiengines/tts/tts_backends.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/podcastfy/aiengines/tts/base.py b/podcastfy/aiengines/tts/base.py index f251a8b2..a776bd08 100644 --- a/podcastfy/aiengines/tts/base.py +++ b/podcastfy/aiengines/tts/base.py @@ -54,7 +54,6 @@ class TTSConfigMixin: """Mixin class to manage TTS external configurations.""" def __init__(self, config_file: str = 'podcastfy/conversation_config.yaml', name: str = "") -> None: - # TODO: probably bad config files for final client self.name = name self.config_file = config_file self.default_configs = self._load_default_configs() @@ -74,7 +73,7 @@ def update_default_config(self, new_config: Dict[str, Any]) -> None: self.default_configs.update(new_config) def tts_config_for_character(self, character: Character) -> TTSConfig: - # todo a bit constrained by the fact that the config has just the question and answer fields + # note: a bit constrained by the fact that the config has just 
the question and answer fields if character.name in self.character_tts_mapping: return self.character_tts_mapping[character.name] diff --git a/podcastfy/aiengines/tts/tts_backends.py b/podcastfy/aiengines/tts/tts_backends.py index 37cbf65c..58be2dc6 100644 --- a/podcastfy/aiengines/tts/tts_backends.py +++ b/podcastfy/aiengines/tts/tts_backends.py @@ -19,7 +19,6 @@ class ElevenLabsTTS(SyncTTSBackend, AsyncTTSBackend, TTSConfigMixin): name: str = "elevenlabs" def __init__(self, api_key: str = None, config_file: str = 'podcastfy/conversation_config.yaml'): - # TODO: not the right path for final client TTSConfigMixin.__init__(self, config_file, name=self.name) self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY") From c361a0e1520353bee9ddaaf212606f42cd8c1820 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 00:02:47 +0200 Subject: [PATCH 39/49] update the API to put a more prominent place --- podcastfy/client_v2.py | 15 ++++++------ podcastfy/core/audio.py | 25 +++++++++++++------- podcastfy/core/podcast.py | 28 ++++++---------------- tests/test_core_api.py | 49 ++++++++++++++------------------------- 4 files changed, 48 insertions(+), 69 deletions(-) diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index aa39457a..6e455449 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -7,6 +7,7 @@ from podcastfy.aiengines.llm.gemini_langchain import DefaultPodcastifyTranscriptEngine from podcastfy.aiengines.tts.base import TTSBackend from podcastfy.aiengines.tts.tts_backends import OpenAITTS, ElevenLabsTTS, EdgeTTS +from podcastfy.core.audio import AudioManager from podcastfy.core.character import Character from podcastfy.core.content import Content from podcastfy.core.podcast import Podcast, SyncTTSBackend, AsyncTTSBackend @@ -97,12 +98,15 @@ def process_content_v2( conv_config.configure(conversation_config) characters = create_characters(conv_config.config_conversation) tts_backends = obtain_tts_backend(config, tts_model) + 
audio_format = conv_config.config_conversation.get('text_to_speech')['audio_format'] + temp_dir = conv_config.config_conversation.get('text_to_speech').get('temp_audio_dir') + audio_manager = AudioManager(tts_backends, audio_format=audio_format, audio_temp_dir=temp_dir, n_jobs=4) if transcript_file: logger.info(f"Using transcript file: {transcript_file}") transcript = Transcript.load( transcript_file, {char.name: char for char in characters} ) - podcast = Podcast.from_transcript(transcript, tts_backends, characters) + podcast = Podcast.from_transcript(transcript, audio_manager, characters) else: logger.info(f"Processing {len(urls)} links") content_extractor = ContentExtractor() @@ -118,13 +122,10 @@ def process_content_v2( llm_contents.extend( [Content(value=image_path, type="image_path") for image_path in image_paths] ) - - - podcast = Podcast( content=llm_contents, llm_backend=content_generator, - tts_backends=tts_backends, + audio_manager=audio_manager, characters=characters, ) @@ -152,9 +153,9 @@ def process_content_v2( raise -def obtain_tts_backend(config, tts_model): +def obtain_tts_backend(config, tts_model) -> Dict[str, TTSBackend]: # temporary solution tts_backends = create_tts_backends(config) # filter out the tts backends that are not in the tts_model, temporary solution - tts_backends = [tts for tts in tts_backends if tts.name == tts_model] + tts_backends = {tts.name: tts for tts in tts_backends if tts.name == tts_model} return tts_backends diff --git a/podcastfy/core/audio.py b/podcastfy/core/audio.py index 663f9d59..ab6fab77 100644 --- a/podcastfy/core/audio.py +++ b/podcastfy/core/audio.py @@ -1,11 +1,13 @@ import asyncio +import atexit from concurrent.futures import ThreadPoolExecutor from pathlib import Path +from tempfile import TemporaryDirectory from typing import Optional, Dict, Union, List, cast, Tuple from pydub import AudioSegment -from podcastfy.core.podcast import SyncTTSBackend, AsyncTTSBackend +from podcastfy.aiengines.tts.base import 
TTSBackend, SyncTTSBackend, AsyncTTSBackend from podcastfy.core.transcript import TranscriptSegment, Transcript @@ -26,20 +28,25 @@ def audio(self) -> AudioSegment: class AudioManager: - def __init__(self, tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]], n_jobs: int = 4, file_prefix: str = "") -> None: + def __init__(self, tts_backends: Dict[str, TTSBackend], audio_format, n_jobs: int = 4, file_prefix: str = "", audio_temp_dir: str = None) -> None: + self.audio_format = audio_format self.tts_backends = tts_backends self.n_jobs = n_jobs self.has_async_backend = any(isinstance(backend, AsyncTTSBackend) for backend in self.tts_backends.values()) self.file_prefix = file_prefix - self.audio_segments = [] self.final_audio: Optional[AudioSegment] = None - self.temp_dir: Optional[Union[str, Path]] = None + if audio_temp_dir: + self.temp_dir = Path(audio_temp_dir) + else: + self._temp_dir = TemporaryDirectory() + self.temp_dir = Path(self._temp_dir.name) + atexit.register(self._temp_dir.cleanup) async def _async_build_audio_segments(self, transcript: Transcript) -> List[PodcastsAudioSegment]: async def process_segment(segment_tuple: Tuple[TranscriptSegment, int]): segment, index = segment_tuple - tts_backend = self.get_tts_backend(segment) - audio_path = Path(self.temp_dir) / f"{self.file_prefix}{index:04d}.mp3" + tts_backend = self._get_tts_backend(segment) + audio_path = Path(self.temp_dir) / f"{self.file_prefix}{index:04d}.{self.audio_format}" if isinstance(tts_backend, AsyncTTSBackend): await tts_backend.async_text_to_speech( segment.text, @@ -63,7 +70,7 @@ async def bounded_process_segment(segment_tuple): tasks = [asyncio.create_task(bounded_process_segment((segment, i))) for i, segment in enumerate(transcript.segments)] return list(await asyncio.gather(*tasks)) - def get_tts_backend(self, segment): + def _get_tts_backend(self, segment): tts_backend = self.tts_backends.get(segment.speaker.preferred_tts) if tts_backend is None: # Take the first available 
TTS backend @@ -73,8 +80,8 @@ def get_tts_backend(self, segment): def _sync_build_audio_segments(self, transcript: Transcript) -> List[PodcastsAudioSegment]: def process_segment(segment_tuple: Tuple[TranscriptSegment, int]): segment, index = segment_tuple - tts_backend = self.get_tts_backend(segment) - filepath = Path(str(self.temp_dir)) / f"{self.file_prefix}{index:04d}.mp3" + tts_backend = self._get_tts_backend(segment) + filepath = Path(str(self.temp_dir)) / f"{self.file_prefix}{index:04d}.{self.audio_format}" cast(SyncTTSBackend, tts_backend).text_to_speech( segment.text, segment.speaker, diff --git a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py index 7e660463..3a93f951 100644 --- a/podcastfy/core/podcast.py +++ b/podcastfy/core/podcast.py @@ -57,9 +57,8 @@ class Podcast: """Main class for podcast creation and management.""" def __init__(self, content: List[Content], llm_backend: LLMBackend, - tts_backends: List[TTSBackend], audio_temp_dir: Optional[Union[str, Path]] = None, - characters: Optional[List[Character]] = None, - default_tts_n_jobs: int = 1) -> None: + audio_manager: AudioManager, + characters: Optional[List[Character]] = None): """ Initialize a new Podcast instance. 
@@ -78,20 +77,10 @@ def __init__(self, content: List[Content], llm_backend: LLMBackend, """ self.content = content self.llm_backend = llm_backend - self.tts_backends: Dict[str, TTSBackend] = {backend.name: backend for backend in tts_backends} self.characters: Dict[str, Character] = {char.name: char for char in (characters or [Character("Host", "Podcast host", {}), Character("Guest", "Expert guest", {})])} - self.default_tts_n_jobs = default_tts_n_jobs self.state = PodcastState.INITIALIZED self._reworking = False - - if audio_temp_dir: - self.temp_dir = Path(audio_temp_dir) - else: - self._temp_dir = TemporaryDirectory() - self.temp_dir = Path(self._temp_dir.name) - atexit.register(self._temp_dir.cleanup) - self.audio_manager = AudioManager(self.tts_backends, self.default_tts_n_jobs) - self.audio_manager.temp_dir = self.temp_dir + self.audio_manager = audio_manager # Initialize attributes with null values self.transcript: Optional[Transcript] = None @@ -111,23 +100,20 @@ def __del__(self) -> None: @classmethod def from_transcript(cls, transcript: Union[Sequence[Tuple[str, str]], Transcript], - tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], - characters: List[Character], default_tts_n_jobs: int = 1) -> 'Podcast': + audio_manager: AudioManager, + characters: List[Character]) -> 'Podcast': """ Create a Podcast instance from a pre-existing transcript. Args: transcript (Union[Sequence[Tuple[str, str]], Transcript]): Pre-existing transcript. - tts_backends (Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]]): Dictionary of available TTS backends. + audio_manager (AudioManager): The audio manager instance for creating audio segments. characters (List[Character]): List of characters participating in the podcast. - default_tts_n_jobs (int, optional): The default number of concurrent jobs for TTS processing. - Defaults to 1. - Returns: Podcast: A new Podcast instance with the transcript built and ready for audio generation. 
""" if isinstance(transcript, Transcript): - podcast = cls("", cast(LLMBackend, None), tts_backends, characters=characters, default_tts_n_jobs=default_tts_n_jobs) + podcast = cls("", cast(LLMBackend, None), audio_manager=audio_manager, characters=characters) podcast.transcript = transcript else: raise ValueError("Transcript must be a Transcript instance") # unimplemented diff --git a/tests/test_core_api.py b/tests/test_core_api.py index fba450d1..33cf4571 100644 --- a/tests/test_core_api.py +++ b/tests/test_core_api.py @@ -6,11 +6,10 @@ from podcastfy.core.content import Content from podcastfy.core.podcast import Podcast, PodcastState from podcastfy.aiengines.llm.base import LLMBackend -from podcastfy.aiengines.tts.base import SyncTTSBackend from podcastfy.core.character import Character from podcastfy.core.tts_configs import TTSConfig from podcastfy.core.transcript import TranscriptSegment, Transcript - +from podcastfy.core.audio import AudioManager class DummyLLMBackend(LLMBackend): def generate_transcript(self, content, characters): @@ -19,8 +18,7 @@ def generate_transcript(self, content, characters): (characters[1], "Thanks for having me!") ] - -class DummyTTSBackend(SyncTTSBackend): +class DummyTTSBackend: def __init__(self, name: str): self.name = name @@ -29,16 +27,15 @@ def text_to_speech(self, text: str, character: Character, output_path: Path) -> audio.export(str(output_path), format="mp3") return output_path - @pytest.fixture -def tts_backends(): - return [DummyTTSBackend("openai"), DummyTTSBackend("elevenlabs")] - +def audio_manager(tmp_path): + tts_backends = {"openai": DummyTTSBackend("openai"), "elevenlabs": DummyTTSBackend("elevenlabs")} + return AudioManager(tts_backends, audio_format="mp3", audio_temp_dir=tmp_path, n_jobs=1) @pytest.fixture def characters(): host = Character( - name="Host", + name="Person1", role="Podcast host", tts_configs={ "openai": TTSConfig(voice="en-US-Neural2-F", backend="openai", extra_args={"speaking_rate": 1.0}), @@ 
-48,7 +45,7 @@ def characters(): ) guest = Character( - name="Guest", + name="Person2", role="Expert guest", tts_configs={ "openai": TTSConfig(voice="en-US-Neural2-D", backend="openai", extra_args={"pitch": -2.0}), @@ -59,38 +56,32 @@ def characters(): return [host, guest] - @pytest.fixture -def podcast(tts_backends, characters): +def podcast(audio_manager, characters): return Podcast( content=[Content(value="This is a sample content for our podcast.", type="text")], llm_backend=DummyLLMBackend(), - tts_backends=tts_backends, + audio_manager=audio_manager, characters=characters, ) - def test_podcast_initialization(podcast): assert podcast.state == PodcastState.INITIALIZED assert podcast.transcript is None - assert podcast.audio_segments == [] assert podcast.audio is None - def test_build_transcript(podcast): podcast.build_transcript() assert podcast.state == PodcastState.TRANSCRIPT_BUILT assert isinstance(podcast.transcript, Transcript) assert len(podcast.transcript.segments) == 2 - def test_build_audio_segments(podcast): podcast.build_transcript() podcast.build_audio_segments() assert podcast.state == PodcastState.AUDIO_SEGMENTS_BUILT assert len(podcast.audio_segments) == 2 - def test_stitch_audio_segments(podcast): podcast.build_transcript() podcast.build_audio_segments() @@ -98,7 +89,6 @@ def test_stitch_audio_segments(podcast): assert podcast.state == PodcastState.STITCHED assert isinstance(podcast.audio, AudioSegment) - def test_finalize(podcast): podcast.finalize() assert podcast.state == PodcastState.STITCHED @@ -106,45 +96,41 @@ def test_finalize(podcast): assert len(podcast.audio_segments) > 0 assert isinstance(podcast.audio, AudioSegment) - def test_save(podcast, tmp_path): podcast.finalize() output_file = tmp_path / "test_podcast.mp3" podcast.save(str(output_file)) assert output_file.exists() - def test_export_transcript(podcast, tmp_path): podcast.finalize() output_file = tmp_path / "test_transcript.txt" podcast.export_transcript(str(output_file), 
format_="plaintext") assert output_file.exists() - def test_rework(podcast): podcast.finalize() with podcast.rework(PodcastState.TRANSCRIPT_BUILT): assert podcast.state == PodcastState.TRANSCRIPT_BUILT podcast.transcript.segments.append( - TranscriptSegment("This is a new segment", podcast.characters["Host"])) + TranscriptSegment("This is a new segment", podcast.characters["Person1"])) assert podcast.state == PodcastState.STITCHED assert len(podcast.transcript.segments) == 3 - -def test_from_transcript(tts_backends, characters): +def test_from_transcript(audio_manager, characters): pre_existing_transcript = [ - ("Host", "Welcome to our podcast created from a pre-existing transcript!"), - ("Guest", "Thank you for having me. I'm excited to be here.") + ("Person1", "Welcome to our podcast created from a pre-existing transcript!"), + ("Person2", "Thank you for having me. I'm excited to be here.") ] podcast = Podcast.from_transcript( transcript=Transcript([ - TranscriptSegment(text, characters[0] if speaker == "Host" else characters[1]) + TranscriptSegment(text, characters[0] if speaker == "Person1" else characters[1]) for speaker, text in pre_existing_transcript ]), - tts_backends=tts_backends, + audio_manager=audio_manager, characters=characters ) @@ -154,8 +140,7 @@ def test_from_transcript(tts_backends, characters): podcast.finalize() assert podcast.state == PodcastState.STITCHED - -def test_load_transcript(tts_backends, characters, tmp_path): +def test_load_transcript(audio_manager, characters, tmp_path): # Create a dummy transcript file transcript_file = tmp_path / "test_transcript.json" Transcript([ @@ -163,6 +148,6 @@ def test_load_transcript(tts_backends, characters, tmp_path): TranscriptSegment("Thank you for having me!", characters[1]) ]).dump(str(transcript_file)) - podcast = Podcast.load_transcript(str(transcript_file), tts_backends, characters) + podcast = Podcast.load_transcript(str(transcript_file), audio_manager, characters) assert podcast.state == 
PodcastState.TRANSCRIPT_BUILT assert len(podcast.transcript.segments) == 2 \ No newline at end of file From 61c42af6ad913eb7218c911e0412f6d61548d9c8 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 00:32:00 +0200 Subject: [PATCH 40/49] remove temp file --- must_do_before_merge.txt | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 must_do_before_merge.txt diff --git a/must_do_before_merge.txt b/must_do_before_merge.txt deleted file mode 100644 index b2526da1..00000000 --- a/must_do_before_merge.txt +++ /dev/null @@ -1,8 +0,0 @@ -- one test or two on the Podcast Class [x] -- delete client_v2 and merge it with client [] will be done during PR -- check that all config options are taken [x] -- remove the excessive prints [x] -- check that all tts work [x] -- ... ? -- 100% of current pytest unit tests pass [x] (except for test_generate_podcast_with_custom_config, exhausted credits) -- 100% of of CLI case scenarios from usage/cli.md [x] except local From 17c14720d969147f3fc7d87e6db0ef3841166d82 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 01:55:27 +0200 Subject: [PATCH 41/49] rework audio tests and add pytest-asyncio in the dependencies --- pyproject.toml | 2 + requirements.txt | 1 + tests/test_audio.py | 98 +++++++++++++++++++++++---------------------- 3 files changed, 53 insertions(+), 48 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9fb07aa4..4758f2eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,10 +44,12 @@ types-pyyaml = "^6.0.12.20240917" nest-asyncio = "^1.6.0" ffmpeg = "^1.4" pytest = "^8.3.3" +pytest-asyncio = "^0.24.0" [tool.poetry.group.dev.dependencies] pytest = "^8.3.3" +pytest-asyncio = "^0.24.0" black = "^24.8.0" sphinx = ">=8.0.2" nbsphinx = "0.9.5" diff --git a/requirements.txt b/requirements.txt index e24bccf3..645987c7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -108,6 +108,7 @@ pygments==2.18.0 ; python_version >= "3.11" and python_version < "4.0" pymupdf==1.24.11 ; 
python_version >= "3.11" and python_version < "4.0" pyparsing==3.2.0 ; python_version >= "3.11" and python_version < "4.0" pytest==8.3.3 ; python_version >= "3.11" and python_version < "4.0" +pytest-asyncio==0.24.0 ; python_version >= "3.11" and python_version < "4.0" python-dateutil==2.9.0.post0 ; python_version >= "3.11" and python_version < "4.0" python-dotenv==1.0.1 ; python_version >= "3.11" and python_version < "4.0" python-levenshtein==0.26.0 ; python_version >= "3.11" and python_version < "4.0" diff --git a/tests/test_audio.py b/tests/test_audio.py index 9e72d044..77fe5047 100644 --- a/tests/test_audio.py +++ b/tests/test_audio.py @@ -1,50 +1,52 @@ -import unittest import pytest import os -from podcastfy.text_to_speech import TextToSpeech - - -class TestAudio(unittest.TestCase): - def setUp(self): - self.test_text = "Hello, how are you?I'm doing great, thanks for asking!" - self.output_dir = "tests/data/audio" - os.makedirs(self.output_dir, exist_ok=True) - - @pytest.mark.skip(reason="Testing edge only on Github Action as it's free") - def test_text_to_speech_openai(self): - tts = TextToSpeech(model="openai") - output_file = os.path.join(self.output_dir, "test_openai.mp3") - tts.convert_to_speech(self.test_text, output_file) - - self.assertTrue(os.path.exists(output_file)) - self.assertGreater(os.path.getsize(output_file), 0) - - # Clean up - os.remove(output_file) - - @pytest.mark.skip(reason="Testing edge only on Github Action as it's free") - def test_text_to_speech_elevenlabs(self): - tts = TextToSpeech(model="elevenlabs") - output_file = os.path.join(self.output_dir, "test_elevenlabs.mp3") - tts.convert_to_speech(self.test_text, output_file) - - self.assertTrue(os.path.exists(output_file)) - self.assertGreater(os.path.getsize(output_file), 0) - - # Clean up - os.remove(output_file) - - def test_text_to_speech_edge(self): - tts = TextToSpeech(model="edge") - output_file = os.path.join(self.output_dir, "test_edge.mp3") - 
tts.convert_to_speech(self.test_text, output_file) - - self.assertTrue(os.path.exists(output_file)) - self.assertGreater(os.path.getsize(output_file), 0) - - # Clean up - os.remove(output_file) - - -if __name__ == "__main__": - unittest.main() +from pathlib import Path +from podcastfy.core.character import Character +from podcastfy.aiengines.tts.tts_backends import ElevenLabsTTS, OpenAITTS, EdgeTTS + +@pytest.fixture +def test_setup(): + test_text = "Hello, how are you?I'm doing great, thanks for asking!" + output_dir = Path("tests/data/audio") + output_dir.mkdir(parents=True, exist_ok=True) + dummy_character = Character("test_character", "host", {}, "A test character") + return test_text, output_dir, dummy_character + +@pytest.mark.skip(reason="Testing Eleven Labs only on Github Action as it requires API key") +def test_text_to_speech_elevenlabs(test_setup): + test_text, output_dir, dummy_character = test_setup + tts = ElevenLabsTTS() + output_file = output_dir / "test_elevenlabs.mp3" + tts.text_to_speech(test_text, dummy_character, output_file) + + assert output_file.exists() + assert output_file.stat().st_size > 0 + + # Clean up + output_file.unlink() + +@pytest.mark.skip(reason="Testing OpenAI only on Github Action as it requires API key") +def test_text_to_speech_openai(test_setup): + test_text, output_dir, dummy_character = test_setup + tts = OpenAITTS() + output_file = output_dir / "test_openai.mp3" + tts.text_to_speech(test_text, dummy_character, output_file) + + assert output_file.exists() + assert output_file.stat().st_size > 0 + + # Clean up + output_file.unlink() + +@pytest.mark.asyncio +async def test_text_to_speech_edge(test_setup): + test_text, output_dir, dummy_character = test_setup + tts = EdgeTTS() + output_file = output_dir / "test_edge.mp3" + await tts.async_text_to_speech(test_text, dummy_character, output_file) + + assert output_file.exists() + assert output_file.stat().st_size > 0 + + # Clean up + output_file.unlink() \ No newline at end of 
file From a2f9c1e41e84b0cbac229f7b034f49d2e3d53dec Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 07:28:47 +0200 Subject: [PATCH 42/49] clean unused module, merge back into client.py --- podcastfy/client.py | 241 ++++++++++++++---------- podcastfy/client_v2.py | 161 ---------------- podcastfy/text_to_speech.py | 353 ------------------------------------ 3 files changed, 146 insertions(+), 609 deletions(-) delete mode 100644 podcastfy/client_v2.py delete mode 100644 podcastfy/text_to_speech.py diff --git a/podcastfy/client.py b/podcastfy/client.py index 86917a43..d6a4a467 100644 --- a/podcastfy/client.py +++ b/podcastfy/client.py @@ -5,120 +5,171 @@ from URLs or existing transcript files. It orchestrates the content extraction, generation, and text-to-speech conversion processes. """ +import copy import os import uuid import typer import yaml + +from podcastfy.aiengines.llm.gemini_langchain import DefaultPodcastifyTranscriptEngine +from podcastfy.aiengines.tts.base import TTSBackend +from podcastfy.aiengines.tts.tts_backends import OpenAITTS, ElevenLabsTTS, EdgeTTS +from podcastfy.core.audio import AudioManager +from podcastfy.core.character import Character +from podcastfy.core.content import Content +from podcastfy.core.podcast import Podcast +from podcastfy.core.transcript import Transcript from podcastfy.content_parser.content_extractor import ContentExtractor -from podcastfy.content_generator import ContentGenerator -from podcastfy.text_to_speech import TextToSpeech +from podcastfy.core.tts_configs import TTSConfig from podcastfy.utils.config import Config, load_config from podcastfy.utils.config_conversation import ( - ConversationConfig, load_conversation_config, ) from podcastfy.utils.logger import setup_logger from typing import List, Optional, Dict, Any -import copy from podcastfy.client_v2 import process_content_v2 as process_content - logger = setup_logger(__name__) app = typer.Typer() +def create_characters(config: Dict[str, Any]) -> 
List[Character]: + # in the future, we should load this from the config file + host = Character( + name="Person1", + role="Podcast host", + tts_configs={ + "openai": TTSConfig( + voice=config["text_to_speech"]["openai"]["default_voices"]["question"], + backend="openai", + ), + "elevenlabs": TTSConfig( + voice=config["text_to_speech"]["elevenlabs"]["default_voices"][ + "question" + ], + backend="elevenlabs", + ), + }, + default_description_for_llm="{name} is an enthusiastic podcast host. Speaks clearly and engagingly.", + ) + + guest = Character( + name="Person2", + role="Expert guest", + tts_configs={ + "openai": TTSConfig( + voice=config["text_to_speech"]["openai"]["default_voices"]["answer"], + backend="openai", + ), + "elevenlabs": TTSConfig( + voice=config["text_to_speech"]["elevenlabs"]["default_voices"][ + "answer" + ], + backend="elevenlabs", + ), + }, + default_description_for_llm="{name} is an expert guest. Shares knowledge in a friendly manner.", + ) + + return [host, guest] + + +def create_tts_backends(config: Config) -> List[TTSBackend]: + return [ + OpenAITTS(api_key=config.OPENAI_API_KEY), + ElevenLabsTTS(api_key=config.ELEVENLABS_API_KEY), + EdgeTTS(), + ] + + + +def process_content( + urls: Optional[List[str]] = None, + transcript_file: Optional[str] = None, + tts_model: str = "openai", # to be fixed, in case of characters, it should be a list of models + generate_audio: bool = True, + config: Optional[Config] = None, + conversation_config: Optional[Dict[str, Any]] = None, + image_paths: Optional[List[str]] = None, + is_local: bool = False, +) -> str: + try: + if config is None: + config = load_config() + if urls is None: + urls = [] + if config is None: + config = load_config() + # Load default conversation config + conv_config = load_conversation_config() + + # Update with provided config if any + if conversation_config: + conv_config.configure(conversation_config) + characters = create_characters(conv_config.config_conversation) + tts_backends = 
obtain_tts_backend(config, tts_model) + audio_format = conv_config.config_conversation.get('text_to_speech')['audio_format'] + temp_dir = conv_config.config_conversation.get('text_to_speech').get('temp_audio_dir') + audio_manager = AudioManager(tts_backends, audio_format=audio_format, audio_temp_dir=temp_dir, n_jobs=4) + if transcript_file: + logger.info(f"Using transcript file: {transcript_file}") + transcript = Transcript.load( + transcript_file, {char.name: char for char in characters} + ) + podcast = Podcast.from_transcript(transcript, audio_manager, characters) + else: + logger.info(f"Processing {len(urls)} links") + content_extractor = ContentExtractor() + content_generator = DefaultPodcastifyTranscriptEngine( + config.GEMINI_API_KEY, conversation_config, is_local=is_local + ) + + contents = [content_extractor.extract_content(url) for url in urls] + llm_contents = [] + if contents: + llm_contents.append(Content(value="\n\n".join(contents), type="text")) + if image_paths: + llm_contents.extend( + [Content(value=image_path, type="image_path") for image_path in image_paths] + ) + podcast = Podcast( + content=llm_contents, + llm_backend=content_generator, + audio_manager=audio_manager, + characters=characters, + ) + + directories = config.get("output_directories") + random_filename_no_suffix = f"podcast_{uuid.uuid4().hex}" + random_filename_mp3 = f"{random_filename_no_suffix}.mp3" + random_filename_transcript = f"{random_filename_no_suffix}.txt" + if generate_audio: + podcast.finalize() + + # for the sake of the tests currently in place, but in the future, we should remove this and return the podcast object + audio_file = os.path.join( + directories["audio"], random_filename_mp3 + ) + podcast.transcript.export(os.path.join(directories["transcripts"], random_filename_transcript)) + podcast.save(filepath=audio_file) + return audio_file # note: should return the podcast object instead, but for the sake of the tests, we return the audio file + else: + 
podcast.build_transcript() + podcast.transcript.export(os.path.join(directories["transcripts"], random_filename_transcript)) + + return None # note: should return the podcast object instead, but for the sake of the tests, we return None + except Exception as e: + logger.error(f"An error occurred in the process_content function: {str(e)}") + raise + -# def process_content( -# urls=None, -# transcript_file=None, -# tts_model="openai", -# generate_audio=True, -# config=None, -# conversation_config: Optional[Dict[str, Any]] = None, -# image_paths: Optional[List[str]] = None, -# is_local: bool = False, -# ): -# """ -# Process URLs, a transcript file, or image paths to generate a podcast or transcript. -# -# Args: -# urls (Optional[List[str]]): A list of URLs to process. -# transcript_file (Optional[str]): Path to a transcript file. -# tts_model (str): The TTS model to use ('openai', 'elevenlabs' or 'edge'). Defaults to 'openai'. -# generate_audio (bool): Whether to generate audio or just a transcript. Defaults to True. -# config (Config): Configuration object to use. If None, default config will be loaded. -# conversation_config (Optional[Dict[str, Any]]): Custom conversation configuration. -# image_paths (Optional[List[str]]): List of image file paths to process. -# is_local (bool): Whether to use a local LLM. Defaults to False. -# -# Returns: -# Optional[str]: Path to the final podcast audio file, or None if only generating a transcript. 
-# """ -# try: -# if config is None: -# config = load_config() -# -# # Load default conversation config -# conv_config = load_conversation_config() -# -# # Update with provided config if any -# if conversation_config: -# conv_config.configure(conversation_config) -# -# if transcript_file: -# logger.info(f"Using transcript file: {transcript_file}") -# with open(transcript_file, "r") as file: -# qa_content = file.read() -# else: -# content_generator = ContentGenerator( -# api_key=config.GEMINI_API_KEY, conversation_config=conv_config.to_dict() -# ) -# -# if urls: -# logger.info(f"Processing {len(urls)} links") -# content_extractor = ContentExtractor() -# # Extract content from links -# contents = [content_extractor.extract_content(link) for link in urls] -# # Combine all extracted content -# combined_content = "\n\n".join(contents) -# else: -# combined_content = "" # Empty string if no URLs provided -# -# # Generate Q&A content -# random_filename = f"transcript_{uuid.uuid4().hex}.txt" -# transcript_filepath = os.path.join( -# config.get("output_directories")["transcripts"], random_filename -# ) -# qa_content = content_generator.generate_qa_content( -# combined_content, -# image_file_paths=image_paths or [], -# output_filepath=transcript_filepath, -# is_local=is_local, -# ) -# -# if generate_audio: -# api_key = None -# # edge does not require an API key -# if tts_model != "edge": -# api_key = getattr(config, f"{tts_model.upper()}_API_KEY") -# -# text_to_speech = TextToSpeech(model=tts_model, api_key=api_key) -# # Convert text to speech using the specified model -# random_filename = f"podcast_{uuid.uuid4().hex}.mp3" -# audio_file = os.path.join( -# config.get("output_directories")["audio"], random_filename -# ) -# text_to_speech.convert_to_speech(qa_content, audio_file) -# logger.info(f"Podcast generated successfully using {tts_model} TTS model") -# return audio_file -# else: -# logger.info(f"Transcript generated successfully") -# return None -# -# except Exception as 
def obtain_tts_backend(config, tts_model) -> Dict[str, TTSBackend]:
    """Return the configured TTS backends keyed by name, restricted to *tts_model*.

    Temporary solution: every backend is instantiated via ``create_tts_backends``
    and only the one whose ``name`` equals ``tts_model`` is kept.
    """
    selected: Dict[str, TTSBackend] = {}
    # Keep only the backend the caller asked for (temporary filtering approach).
    for backend in create_tts_backends(config):
        if backend.name == tts_model:
            selected[backend.name] = backend
    return selected
def create_tts_backends(config: Config) -> List[TTSBackend]:
    """Instantiate every supported TTS backend (OpenAI, ElevenLabs, Edge) from *config*."""
    return [
        OpenAITTS(api_key=config.OPENAI_API_KEY),
        ElevenLabsTTS(api_key=config.ELEVENLABS_API_KEY),
        EdgeTTS(),
    ]


def process_content_v2(
    urls: Optional[List[str]] = None,
    transcript_file: Optional[str] = None,
    tts_model: str = "openai",  # to be fixed: with per-character voices this should be a list of models
    generate_audio: bool = True,
    config: Optional[Config] = None,
    conversation_config: Optional[Dict[str, Any]] = None,
    image_paths: Optional[List[str]] = None,
    is_local: bool = False,
) -> Optional[str]:
    """Generate a podcast from URLs/images or from an existing transcript file.

    Args:
        urls: Links to extract content from; treated as empty when omitted.
        transcript_file: Path to a pre-existing transcript; when given, content
            extraction and LLM generation are skipped entirely.
        tts_model: Name of the TTS backend to use ('openai', 'elevenlabs', 'edge').
        generate_audio: When False, only the transcript is produced.
        config: Application config; loaded from disk when omitted.
        conversation_config: Overrides merged into the default conversation config.
        image_paths: Image files to feed to the multimodal LLM.
        is_local: Use a local LLM instead of the remote Gemini backend.

    Returns:
        Path of the generated audio file when ``generate_audio`` is True,
        otherwise ``None``.  (Fix: the original annotation claimed
        ``Tuple[Optional[str], Podcast]``, which never matched the actual
        return values.)

    Raises:
        Exception: Any error is logged and re-raised unchanged.
    """
    try:
        # Fix: the original performed this None-check twice in a row.
        if config is None:
            config = load_config()
        if urls is None:
            urls = []

        # Load default conversation config and overlay user-provided overrides.
        conv_config = load_conversation_config()
        if conversation_config:
            conv_config.configure(conversation_config)

        characters = create_characters(conv_config.config_conversation)
        tts_backends = obtain_tts_backend(config, tts_model)
        tts_settings = conv_config.config_conversation.get("text_to_speech")
        audio_manager = AudioManager(
            tts_backends,
            audio_format=tts_settings["audio_format"],
            audio_temp_dir=tts_settings.get("temp_audio_dir"),
            n_jobs=4,
        )

        if transcript_file:
            logger.info(f"Using transcript file: {transcript_file}")
            transcript = Transcript.load(
                transcript_file, {char.name: char for char in characters}
            )
            podcast = Podcast.from_transcript(transcript, audio_manager, characters)
        else:
            logger.info(f"Processing {len(urls)} links")
            content_extractor = ContentExtractor()
            # NOTE(review): the raw ``conversation_config`` dict is passed here,
            # not the merged ``conv_config`` — confirm this is intentional.
            content_generator = DefaultPodcastifyTranscriptEngine(
                config.GEMINI_API_KEY, conversation_config, is_local=is_local
            )

            contents = [content_extractor.extract_content(url) for url in urls]
            llm_contents = []
            if contents:
                llm_contents.append(Content(value="\n\n".join(contents), type="text"))
            if image_paths:
                llm_contents.extend(
                    Content(value=image_path, type="image_path")
                    for image_path in image_paths
                )
            podcast = Podcast(
                content=llm_contents,
                llm_backend=content_generator,
                audio_manager=audio_manager,
                characters=characters,
            )

        directories = config.get("output_directories")
        base_name = f"podcast_{uuid.uuid4().hex}"
        transcript_path = os.path.join(directories["transcripts"], f"{base_name}.txt")

        if generate_audio:
            podcast.finalize()
            # For the sake of the current tests we return the audio path; in the
            # future this should return the Podcast object instead.
            audio_file = os.path.join(directories["audio"], f"{base_name}.mp3")
            podcast.transcript.export(transcript_path)
            podcast.save(filepath=audio_file)
            return audio_file

        podcast.build_transcript()
        podcast.transcript.export(transcript_path)
        return None  # should eventually return the Podcast object as well
    except Exception as e:
        logger.error(f"An error occurred in the process_content function: {str(e)}")
        raise
class TextToSpeech:
    """Convert a Person1/Person2 transcript into speech via ElevenLabs, OpenAI or Edge TTS."""

    def __init__(self, model: str = 'openai', api_key: Optional[str] = None):
        """
        Initialize the TextToSpeech class.

        Args:
            model (str): The model to use for text-to-speech conversion.
                Options are 'elevenlabs', 'openai' or 'edge'. Defaults to 'openai'.
            api_key (Optional[str]): API key for the selected text-to-speech service.
                If not provided, it will be loaded from the config.
        """
        self.model = model.lower()
        self.config = load_config()
        self.conversation_config = load_conversation_config()
        # Sub-config with per-backend voices plus audio_format / temp dir / ending message.
        self.tts_config = self.conversation_config.get('text_to_speech')

        if self.model == 'elevenlabs':
            self.api_key = api_key or self.config.ELEVENLABS_API_KEY
            self.client = elevenlabs_client.ElevenLabs(api_key=self.api_key)
        elif self.model == 'openai':
            self.api_key = api_key or self.config.OPENAI_API_KEY
            # Sets the module-level key: affects every openai call in this process.
            openai.api_key = self.api_key
        elif self.model == 'edge':
            pass  # Edge TTS requires no API key
        else:
            raise ValueError("Invalid model. Choose 'elevenlabs', 'openai' or 'edge'.")

        self.audio_format = self.tts_config['audio_format']
        self.temp_audio_dir = self.tts_config['temp_audio_dir']
        self.ending_message = self.tts_config['ending_message']

        # Create temp_audio_dir if it doesn't exist
        if not os.path.exists(self.temp_audio_dir):
            os.makedirs(self.temp_audio_dir)

    def __merge_audio_files(self, input_dir: str, output_file: str) -> None:
        """
        Merge all audio files in the input directory sequentially and save the result.

        Args:
            input_dir (str): Path to the directory containing audio files.
            output_file (str): Path to save the merged audio file.
        """
        try:
            # Function to sort filenames naturally ("2" before "10").
            def natural_sort_key(filename: str) -> List[Union[int, str]]:
                return [int(text) if text.isdigit() else text for text in re.split(r'(\d+)', filename)]

            combined = AudioSegment.empty()
            audio_files = sorted(
                [f for f in os.listdir(input_dir) if f.endswith(f".{self.audio_format}")],
                key=natural_sort_key
            )
            for file in audio_files:
                # NOTE(review): this endswith() check is redundant — the list
                # comprehension above already filtered by suffix.
                if file.endswith(f".{self.audio_format}"):
                    file_path = os.path.join(input_dir, file)
                    combined += AudioSegment.from_file(file_path, format=self.audio_format)

            combined.export(output_file, format=self.audio_format)
            logger.info(f"Merged audio saved to {output_file}")
        except Exception as e:
            logger.error(f"Error merging audio files: {str(e)}")
            raise

    def convert_to_speech(self, text: str, output_file: str) -> None:
        """
        Convert input text to speech and save as an audio file.

        Args:
            text (str): Input text to convert to speech.
            output_file (str): Path to save the output audio file.

        Raises:
            Exception: If there's an error in converting text to speech.
        """
        # Clean TSS markup tags from the input text
        cleaned_text = self.clean_tss_markup(text)

        # Dispatch to the backend selected at construction time.
        if self.model == 'elevenlabs':
            self.__convert_to_speech_elevenlabs(cleaned_text, output_file)
        elif self.model == 'openai':
            self.__convert_to_speech_openai(cleaned_text, output_file)
        elif self.model == 'edge':
            self.__convert_to_speech_edge(cleaned_text, output_file)

    def __convert_to_speech_elevenlabs(self, text: str, output_file: str) -> None:
        """Render each Q/A pair with ElevenLabs, then merge chunks into *output_file*."""
        try:
            qa_pairs = self.split_qa(text)
            audio_files = []
            counter = 0
            for question, answer in qa_pairs:
                question_audio = self.client.generate(
                    text=question,
                    voice=self.tts_config['elevenlabs']['default_voices']['question'],
                    model=self.tts_config['elevenlabs']['model']
                )
                answer_audio = self.client.generate(
                    text=answer,
                    voice=self.tts_config['elevenlabs']['default_voices']['answer'],
                    model=self.tts_config['elevenlabs']['model']
                )

                # Save question and answer audio chunks
                for audio in [question_audio, answer_audio]:
                    counter += 1
                    # NOTE(review): assumes temp_audio_dir ends with a path
                    # separator — verify the configured value, or use os.path.join.
                    file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}"
                    with open(file_name, "wb") as out:
                        for chunk in audio:
                            if chunk:
                                out.write(chunk)
                    audio_files.append(file_name)

            # Merge all audio files and save the result
            self.__merge_audio_files(self.temp_audio_dir, output_file)

            # Clean up individual audio files
            for file in audio_files:
                os.remove(file)

            logger.info(f"Audio saved to {output_file}")

        except Exception as e:
            logger.error(f"Error converting text to speech with ElevenLabs: {str(e)}")
            raise

    def __convert_to_speech_openai(self, text: str, output_file: str) -> None:
        """Render each Q/A pair with OpenAI TTS, then merge chunks into *output_file*."""
        try:
            qa_pairs = self.split_qa(text)
            print(qa_pairs)  # NOTE(review): debug leftover — consider removing
            audio_files = []
            counter = 0
            for question, answer in qa_pairs:
                for speaker, content in [
                    (self.tts_config['openai']['default_voices']['question'], question),
                    (self.tts_config['openai']['default_voices']['answer'], answer)
                ]:
                    counter += 1
                    # NOTE(review): assumes temp_audio_dir ends with a path separator.
                    file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}"
                    response = openai.audio.speech.create(
                        model=self.tts_config['openai']['model'],
                        voice=speaker,
                        input=content
                    )
                    with open(file_name, "wb") as file:
                        file.write(response.content)

                    audio_files.append(file_name)

            # Merge all audio files and save the result
            self.__merge_audio_files(self.temp_audio_dir, output_file)

            # Clean up individual audio files
            for file in audio_files:
                os.remove(file)

            logger.info(f"Audio saved to {output_file}")

        except Exception as e:
            logger.error(f"Error converting text to speech with OpenAI: {str(e)}")
            raise

    # NOTE(review): this function takes no `self` and the three statements after
    # it execute while the class body itself is being evaluated (i.e. once, at
    # import time). It reads like module-level setup that was pasted into the
    # class body — confirm the placement is intentional before relying on it.
    def get_or_create_eventloop():
        try:
            return asyncio.get_event_loop()
        except RuntimeError as ex:
            if "There is no current event loop in thread" in str(ex):
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)
                return asyncio.get_event_loop()

    import nest_asyncio  # type: ignore
    get_or_create_eventloop()
    nest_asyncio.apply()

    def __convert_to_speech_edge(self, text: str, output_file: str) -> None:
        """
        Convert text to speech using Edge TTS.

        Args:
            text (str): The input text to convert to speech.
            output_file (str): The path to save the output audio file.
        """
        try:
            qa_pairs = self.split_qa(text)
            audio_files = []
            counter = 0

            async def edge_tts_conversion(text_chunk: str, output_path: str, voice: str):
                tts = edge_tts.Communicate(text_chunk, voice)
                await tts.save(output_path)
                return

            async def process_qa_pairs(qa_pairs):
                nonlocal counter
                tasks = []
                for question, answer in qa_pairs:
                    for speaker, content in [
                        (self.tts_config['edge']['default_voices']['question'], question),
                        (self.tts_config['edge']['default_voices']['answer'], answer)
                    ]:
                        counter += 1
                        file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}"
                        tasks.append(asyncio.ensure_future(edge_tts_conversion(content, file_name, speaker)))
                        audio_files.append(file_name)

                # All conversions run concurrently; file names keep ordering.
                await asyncio.gather(*tasks)

            asyncio.run(process_qa_pairs(qa_pairs))

            # Merge all audio files
            self.__merge_audio_files(self.temp_audio_dir, output_file)

            # Clean up individual audio files
            for file in audio_files:
                os.remove(file)
            logger.info(f"Audio saved to {output_file}")

        except Exception as e:
            logger.error(f"Error converting text to speech with Edge: {str(e)}")
            raise

    def split_qa(self, input_text: str) -> List[Tuple[str, str]]:
        """
        Split the input text into question-answer pairs.

        Args:
            input_text (str): The input text containing Person1 and Person2 dialogues.

        Returns:
            List[Tuple[str, str]]: A list of tuples containing (Person1, Person2) dialogues.
        """
        # Add ending message to the end of input_text
        input_text += f"{self.ending_message}"

        # Regular expression pattern to match Person1 and Person2 dialogues.
        # NOTE(review): this pattern appears garbled in this copy — it presumably
        # matched explicit speaker tags (e.g. <Person1>…</Person1>) that were
        # stripped during extraction; as written it cannot pair speakers. Verify
        # against the original source before reuse.
        pattern = r'(.*?)\s*(.*?)'

        # Find all matches in the input text
        matches = re.findall(pattern, input_text, re.DOTALL)

        # Process the matches to remove extra whitespace and newlines
        processed_matches = [
            (
                ' '.join(person1.split()).strip(),
                ' '.join(person2.split()).strip()
            )
            for person1, person2 in matches
        ]
        return processed_matches

    # to be done: Add support for additional tags dynamically given TTS model.
    # Right now it's the intersection of OpenAI/MS Edge and ElevenLabs supported tags.
    def clean_tss_markup(self, input_text: str, additional_tags: List[str] = ["Person1", "Person2"]) -> str:
        """
        Remove unsupported TSS markup tags from the input text while preserving supported SSML tags.

        Args:
            input_text (str): The input text containing TSS markup tags.
            additional_tags (List[str]): Optional list of additional tags to preserve. Defaults to ["Person1", "Person2"].
                NOTE(review): a mutable default argument — shared across calls; safe
                only because it is never mutated here.

        Returns:
            str: Cleaned text with unsupported TSS markup tags removed.
        """
        # List of SSML tags supported by both OpenAI and ElevenLabs
        supported_tags = [
            'speak', 'lang', 'p', 'phoneme',
            's', 'say-as', 'sub'
        ]

        # Append additional tags to the supported tags list
        supported_tags.extend(additional_tags)

        # Create a pattern that matches any tag not in the supported list.
        # NOTE(review): this literal also looks garbled (the tag-matching portion
        # before ']+>' was lost in extraction) — reconstruct from the original
        # file before reuse.
        pattern = r']+>'

        # Remove unsupported tags
        cleaned_text = re.sub(pattern, '', input_text)

        # Remove any leftover empty lines
        cleaned_text = re.sub(r'\n\s*\n', '\n', cleaned_text)

        # Ensure closing tags for additional tags are preserved.
        # NOTE(review): the replacement presumably re-appended a closing
        # </tag>; the closing-tag text seems lost in this copy — verify.
        for tag in additional_tags:
            cleaned_text = re.sub(f'<{tag}>(.*?)(?=<(?:{"|".join(additional_tags)})>|$)',
                                  f'<{tag}>\\1',
                                  cleaned_text,
                                  flags=re.DOTALL)
        # Remove '(scratchpad)' from cleaned_text
        cleaned_text = cleaned_text.replace('(scratchpad)', '')

        return cleaned_text.strip()
Output saved to {edge_output_file}") - - except Exception as e: - logger.error(f"An error occurred during text-to-speech conversion: {str(e)}") - raise - -if __name__ == "__main__": - main(seed=42) \ No newline at end of file From 83854a06a6c5df5332b2104f383f37647f90d1b1 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 07:36:01 +0200 Subject: [PATCH 43/49] fix inccorect merge --- podcastfy/client.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/podcastfy/client.py b/podcastfy/client.py index 8ca9c7c9..6a0e14b5 100644 --- a/podcastfy/client.py +++ b/podcastfy/client.py @@ -158,9 +158,7 @@ def process_content( podcast.build_transcript() podcast.transcript.export(os.path.join(directories["transcripts"], random_filename_transcript)) logger.info(f"Transcript generated successfully: {random_filename_transcript}") - return transcript_filepath - - return None # note: should return the podcast object instead, but for the sake of the tests, we return None + return random_filename_transcript except Exception as e: logger.error(f"An error occurred in the process_content function: {str(e)}") raise From d6679d2159f4ef752aacb5fe2088944f618278b7 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 07:43:27 +0200 Subject: [PATCH 44/49] fix incorrect merge --- podcastfy/client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/podcastfy/client.py b/podcastfy/client.py index 6a0e14b5..cda6bb52 100644 --- a/podcastfy/client.py +++ b/podcastfy/client.py @@ -28,7 +28,6 @@ ) from podcastfy.utils.logger import setup_logger from typing import List, Optional, Dict, Any -from podcastfy.client_v2 import process_content_v2 as process_content logger = setup_logger(__name__) From c6b78760c56f7bd0abde4f9cf82a201855ebd7d1 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 16:52:32 +0200 Subject: [PATCH 45/49] fix attempt --- podcastfy/core/audio.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/podcastfy/core/audio.py 
b/podcastfy/core/audio.py index ab6fab77..2591e5d3 100644 --- a/podcastfy/core/audio.py +++ b/podcastfy/core/audio.py @@ -1,5 +1,6 @@ import asyncio import atexit +import os from concurrent.futures import ThreadPoolExecutor from pathlib import Path from tempfile import TemporaryDirectory @@ -36,6 +37,7 @@ def __init__(self, tts_backends: Dict[str, TTSBackend], audio_format, n_jobs: in self.file_prefix = file_prefix self.final_audio: Optional[AudioSegment] = None if audio_temp_dir: + os.makedirs(audio_temp_dir, exist_ok=True) self.temp_dir = Path(audio_temp_dir) else: self._temp_dir = TemporaryDirectory() From 1640f32bf4141bc169c846bf6e5df397f13c57d3 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 17:06:15 +0200 Subject: [PATCH 46/49] correct filepaths --- podcastfy/client.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/podcastfy/client.py b/podcastfy/client.py index cda6bb52..48ff400a 100644 --- a/podcastfy/client.py +++ b/podcastfy/client.py @@ -143,6 +143,7 @@ def process_content( random_filename_no_suffix = f"podcast_{uuid.uuid4().hex}" random_filename_mp3 = f"{random_filename_no_suffix}.mp3" random_filename_transcript = f"{random_filename_no_suffix}.txt" + transcript_file_path = os.path.join(directories["transcripts"], random_filename_transcript) if generate_audio: podcast.finalize() @@ -150,14 +151,14 @@ def process_content( audio_file = os.path.join( directories["audio"], random_filename_mp3 ) - podcast.transcript.export(os.path.join(directories["transcripts"], random_filename_transcript)) + podcast.transcript.export(transcript_file_path) podcast.save(filepath=audio_file) return audio_file # note: should return the podcast object instead, but for the sake of the tests, we return the audio file else: podcast.build_transcript() - podcast.transcript.export(os.path.join(directories["transcripts"], random_filename_transcript)) + podcast.transcript.export(transcript_file_path) logger.info(f"Transcript generated successfully: 
class OldContentGenerator:
    # note: to be deleted but stays around few days for reference and troubleshooting
    def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None):
        """
        Initialize the ContentGenerator.

        Args:
            api_key (str): API key for Google's Generative AI.
            conversation_config (Optional[Dict[str, Any]]): Custom conversation configuration.
        """
        # Side effect: exports the key for the whole process, not just this object.
        os.environ["GOOGLE_API_KEY"] = api_key
        self.config = load_config()
        self.content_generator_config = self.config.get('content_generator', {})

        # Load default conversation config and update with custom config if provided

        self.config_conversation = load_conversation_config(conversation_config)

        self.llm = ChatGoogleGenerativeAI(
            model=self.content_generator_config.get('gemini_model', 'gemini-1.5-pro-latest'),
            temperature=self.config_conversation.get('creativity', 0),
            max_output_tokens=self.content_generator_config.get('max_output_tokens', 8192),
        )

        # pick podcastfy prompt from langchain hub
        # NOTE(review): the fallback hub id 'souzatharsis/podcastfy_' ends in an
        # underscore and looks truncated in this copy — verify against the hub.
        self.prompt_template = hub.pull(self.config.get('content_generator', {}).get('prompt_template', 'souzatharsis/podcastfy_'))
        self.ending_message = self.config.get('text_to_speech')['ending_message']

        self.parser = StrOutputParser()

        # LCEL pipeline: prompt -> Gemini -> plain-string output.
        self.chain = (self.prompt_template | self.llm | self.parser)

    def generate_qa_content(self, input_texts: str, output_filepath: Optional[str] = None, characters: Optional[List[Character]] = None) -> str:
        """
        Generate Q&A content based on input texts.

        Args:
            input_texts (str): Input texts to generate content from.
            output_filepath (Optional[str]): Filepath to save the response content. Defaults to None.
            characters (Optional[List[Character]]): Exactly two characters (host, guest)
                whose roles are interpolated into the prompt. Despite the Optional
                annotation, passing None raises (len() of None below).

        Returns:
            str: Formatted Q&A content.

        Raises:
            Exception: If there's an error in generating content.
        """
        assert len(characters) == 2, "The number of characters should be 2 for this implementation"
        try:


            prompt_params = {
                "input_text": input_texts,
                "word_count": self.config_conversation.get('word_count'),
                "conversation_style": ", ".join(self.config_conversation.get('conversation_style', [])),
                "roles_person1": characters[0].role,
                "roles_person2": characters[1].role,
                "dialogue_structure": ", ".join(self.config_conversation.get('dialogue_structure', [])),
                "podcast_name": self.config_conversation.get('podcast_name'),
                "podcast_tagline": self.config_conversation.get('podcast_tagline'),
                "output_language": self.config_conversation.get('output_language'),
                "engagement_techniques": ", ".join(self.config_conversation.get('engagement_techniques', []))
            }

            self.response = self.chain.invoke(prompt_params)

            logger.info(f"Content generated successfully")

            if output_filepath:
                with open(output_filepath, 'w') as file:
                    file.write(self.response)
                logger.info(f"Response content saved to {output_filepath}")

            return self.response
        except Exception as e:
            logger.error(f"Error generating content: {str(e)}")
            raise
- """ - self.is_local = is_local - self.temperature = temperature - self.max_output_tokens = max_output_tokens - self.model_name = model_name - self.is_multimodal = not is_local # Does not assume local LLM is multimodal - - if is_local: - self.llm = Llamafile() - else: - self.llm = ChatGoogleGenerativeAI( - model=model_name, - temperature=temperature, - max_output_tokens=max_output_tokens, - ) - - - class DefaultPodcastifyTranscriptEngine(LLMBackend): def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None, is_local: bool = False): """ From c5ab289810aafd4a528b52368e06f26837d1e276 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 17:44:03 +0200 Subject: [PATCH 48/49] fix empty segments --- podcastfy/core/podcast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py index 3a93f951..2b112679 100644 --- a/podcastfy/core/podcast.py +++ b/podcastfy/core/podcast.py @@ -162,7 +162,7 @@ def build_transcript(self) -> None: for segment in generated_segments: if isinstance(segment, tuple) and len(segment) == 2: speaker, text = segment - if speaker.name in self.characters: + if speaker.name in self.characters and text.strip(): tts_config = cast(Dict[str, Any], self.characters[speaker.name].tts_configs.get(self.characters[speaker.name].preferred_tts, {})) segments.append(TranscriptSegment(text, self.characters[speaker.name], tts_config)) else: From 0b7882a135d65d01c8e96bc11ba717a8eaf1286e Mon Sep 17 00:00:00 2001 From: bruno Date: Sat, 19 Oct 2024 16:56:39 +0200 Subject: [PATCH 49/49] a fix and one improvement --- podcastfy/aiengines/tts/tts_backends.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/podcastfy/aiengines/tts/tts_backends.py b/podcastfy/aiengines/tts/tts_backends.py index 58be2dc6..83e59b30 100644 --- a/podcastfy/aiengines/tts/tts_backends.py +++ b/podcastfy/aiengines/tts/tts_backends.py @@ -53,7 +53,7 @@ async def 
# register — declare the concrete backends as virtual subclasses of the TTS ABCs
# so isinstance()/issubclass() checks against SyncTTSBackend / AsyncTTSBackend pass.
# NOTE(review): the registrations are asymmetric — ElevenLabsTTS is both sync and
# async, OpenAITTS only sync, EdgeTTS only async; confirm this is intentional.
SyncTTSBackend.register(ElevenLabsTTS)
AsyncTTSBackend.register(ElevenLabsTTS)
SyncTTSBackend.register(OpenAITTS)
AsyncTTSBackend.register(EdgeTTS)