From afd2300ddccc8cea1335edff737d228e0c9a369f Mon Sep 17 00:00:00 2001 From: Tharsis Souza Date: Thu, 10 Oct 2024 22:08:30 -0300 Subject: [PATCH 01/49] small steps --- README.md | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index bcea4e01..345f92fd 100644 --- a/README.md +++ b/README.md @@ -26,12 +26,12 @@ This sample collection is also [available at audio.com](https://audio.com/thatup ## Features ✨ -- Generate engaging, AI-powered conversational content from multiple sources (websites, youtube, and PDFs) -- Customize transcript and audio generation tailored to your needs (e.g. style, structure, length) -- Generate podcasts from pre-existing/edited transcripts -- Support for multiple advanced text-to-speech models (OpenAI and ElevenLabs) for natural-sounding audio -- Seamlessly integrate CLI and Python package for streamlined automated workflows -- Support for multiple languages, enabling global content creation (very experimental, currently!) +- Generate AI-powered conversational content from multiple sources (websites, YouTube, and PDFs) +- Customizable transcript and audio generation (e.g. style, language, structure, length) +- Create podcasts from pre-existing or edited transcripts +- Support for advanced text-to-speech models (OpenAI and ElevenLabs) +- Seamless CLI and Python package integration for automated workflows +- Multi-language support for global content creation (experimental!) ## Updates 🚀 @@ -42,21 +42,16 @@ This sample collection is also [available at audio.com](https://audio.com/thatup ## Quickstart 💻 -### Setup -Before installing, ensure you have Python 3.11 or higher installed on your system. +### Prerequisites +- Python 3.11 or higher +- `$ pip install ffmpeg` (for audio processing) +### Installation 1. Install from PyPI - `$ pip install podcastfy` 2. Set up your [API keys](usage/config.md) -3. 
Ensure you have ffmpeg installed on your system, required for audio processing -``` -sudo apt update -sudo apt install ffmpeg -``` - ### Python ```python from podcastfy.client import generate_podcast @@ -74,7 +69,7 @@ python -m podcastfy.client --url --url - [CLI](usage/cli.md) -Try [HuggingFace 🤗 space app](https://huggingface.co/spaces/thatupiso/Podcastfy.ai_demo) for a simple use case (URLs -> Audio). WARNING: This UI App was not as thoroughly tested as the Python package. +Experience Podcastfy with our [HuggingFace](https://huggingface.co/spaces/thatupiso/Podcastfy.ai_demo) 🤗 Spaces app for a simple URL-to-Audio demo. (Note: This UI app is less extensively tested than the Python package.) ## Customization 🔧 @@ -82,7 +77,7 @@ Podcastfy offers a range of [Conversation Customization](usage/conversation_cust ## Contributing 🤝 -Contributions are welcome! Please feel free to submit an [Issue](https://github.com/souzatharsis/podcastfy/issues) or a Pull Request. But even more excitingly feel free to fork the repo and create your own app! I am curious about your use cases! Please let me know if I could be of help. +We welcome contributions! Please submit [Issues](https://github.com/souzatharsis/podcastfy/issues) or Pull Requests. Feel free to fork the repo and create your own applications. We're excited to learn about your use cases! 
## Example Use Cases 🎧🎶 From fa67e7fb538c232e64701e4a7b9b0f98e5482635 Mon Sep 17 00:00:00 2001 From: bruno Date: Sun, 13 Oct 2024 17:09:46 +0200 Subject: [PATCH 02/49] small steps --- podcastfy/character.py | 37 +++ podcastfy/content_generator.py | 37 ++- podcastfy/core/podcast.py | 458 +++++++++++++++++++++++++++++++++ podcastfy/tts_backends.py | 62 +++++ 4 files changed, 588 insertions(+), 6 deletions(-) create mode 100644 podcastfy/character.py create mode 100644 podcastfy/core/podcast.py create mode 100644 podcastfy/tts_backends.py diff --git a/podcastfy/character.py b/podcastfy/character.py new file mode 100644 index 00000000..92419bf9 --- /dev/null +++ b/podcastfy/character.py @@ -0,0 +1,37 @@ +from typing import Dict, Any, Optional + +from pydantic import BaseModel + + +class TTSConfig(BaseModel): + voice: str + backend: str + extra_args: Dict[str, Any] + +class Character: + """Represents a character in the podcast.""" + + def __init__(self, name: str, role: str, tts_configs: Dict[str, TTSConfig] = {}, default_description_for_llm: str = ""): + # note: in the future the last two arguments are not optional + self.name = name + self.role = role + self.tts_configs = tts_configs + self.default_description_for_llm = default_description_for_llm + self.preferred_tts = next(iter(tts_configs.keys())) # Set first TTS as default + + def set_preferred_tts(self, tts_name: str): + if tts_name not in self.tts_configs: + raise ValueError(f"TTS backend '{tts_name}' not configured for this character") + self.preferred_tts = tts_name + + def to_prompt(self) -> str: + """Convert the character information to a prompt for the LLM.""" + return f"Character: {self.name}\nRole: {self.role}\n{self.default_description_for_llm.format(name=self.name)}" + + def get_tts_args(self, tts_name: Optional[str] = None) -> Dict[str, Any]: + """Get the TTS arguments for this character.""" + tts_name = tts_name or self.preferred_tts + tts_config = self.tts_configs[tts_name] + return { + "voice": 
tts_config["voice"], + **tts_config["extra_args"]} diff --git a/podcastfy/content_generator.py b/podcastfy/content_generator.py index 6f2c7498..e9796c9f 100644 --- a/podcastfy/content_generator.py +++ b/podcastfy/content_generator.py @@ -7,13 +7,16 @@ """ import os -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, List #from langchain_google_vertexai import ChatVertexAI from langchain_google_genai import ChatGoogleGenerativeAI from langchain_core.prompts import ChatPromptTemplate from langchain_core.output_parsers import StrOutputParser from langchain import hub + +from podcastfy.character import Character +from podcastfy.core.podcast import LLMBackend from podcastfy.utils.config_conversation import load_conversation_config from podcastfy.utils.config import load_config import logging @@ -51,7 +54,7 @@ def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = self.chain = (self.prompt_template | self.llm | self.parser) - def generate_qa_content(self, input_texts: str, output_filepath: Optional[str] = None) -> str: + def generate_qa_content(self, input_texts: str, output_filepath: Optional[str] = None, characters: List[Character] = None) -> str: """ Generate Q&A content based on input texts. @@ -65,6 +68,7 @@ def generate_qa_content(self, input_texts: str, output_filepath: Optional[str] = Raises: Exception: If there's an error in generating content. 
""" + assert len(characters) == 2, "The number of characters should be 2 for this implementation" try: @@ -72,8 +76,8 @@ def generate_qa_content(self, input_texts: str, output_filepath: Optional[str] = "input_text": input_texts, "word_count": self.config_conversation.get('word_count'), "conversation_style": ", ".join(self.config_conversation.get('conversation_style', [])), - "roles_person1": self.config_conversation.get('roles_person1'), - "roles_person2": self.config_conversation.get('roles_person2'), + "roles_person1": characters[0].role, + "roles_person2": characters[1].role, "dialogue_structure": ", ".join(self.config_conversation.get('dialogue_structure', [])), "podcast_name": self.config_conversation.get('podcast_name'), "podcast_tagline": self.config_conversation.get('podcast_tagline'), @@ -95,6 +99,22 @@ def generate_qa_content(self, input_texts: str, output_filepath: Optional[str] = logger.error(f"Error generating content: {str(e)}") raise + +class DefaultPodcastifyTranscriptEngine(LLMBackend): + def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None): + """ + Initialize the DefaultPodcastifyTranscriptEngine. + + Args: + api_key (str): API key for Google's Generative AI. + conversation_config (Optional[Dict[str, Any]]): Custom conversation configuration. + """ + self.content_generator = ContentGenerator(api_key, conversation_config) + + def generate_text(self, input_text: str, characters: List[Character]) -> str: + return self.content_generator.generate_qa_content(input_text, output_filepath=None, characters=characters) + + def main(seed: int = 42) -> None: """ Generate Q&A content based on input text from input_text.txt using the Gemini API. 
@@ -115,7 +135,7 @@ def main(seed: int = 42) -> None: raise ValueError("GEMINI_API_KEY not found in configuration") # Initialize ContentGenerator - content_generator = ContentGenerator(api_key) + content_generator = DefaultPodcastifyTranscriptEngine(api_key) # Read input text from file input_text = "" @@ -126,7 +146,12 @@ def main(seed: int = 42) -> None: input_text += file.read() + "\n\n" # Generate Q&A content - response = content_generator.generate_qa_content(input_text) + config_conv = load_conversation_config() + characters = [ + Character(name="Speaker 1", role=config_conv.get('roles_person1')), + Character(name="Speaker 2", role=config_conv.get('roles_person2')), + ] + response = content_generator.generate_text(input_text, characters) # Print the generated Q&A content print("Generated Q&A Content:") diff --git a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py new file mode 100644 index 00000000..646fbdbf --- /dev/null +++ b/podcastfy/core/podcast.py @@ -0,0 +1,458 @@ +import logging +from abc import ABC, abstractmethod +from enum import Enum +from pathlib import Path +from typing import List, Optional, Dict, Any, Callable, Tuple, Union, Sequence, Type, NamedTuple +from pydub import AudioSegment as PydubAudioSegment +from functools import wraps +import asyncio +from concurrent.futures import ThreadPoolExecutor, as_completed +from contextlib import contextmanager + +from podcastfy.character import Character, TTSConfig + +class PodcastState(Enum): + """Enum representing the different states of a podcast during creation.""" + INITIALIZED = 0 # Initial state when the Podcast object is created + TRANSCRIPT_BUILT = 1 # State after the transcript has been generated + AUDIO_SEGMENTS_BUILT = 2 # State after individual audio segments have been created + STITCHED = 3 # Final state after all audio segments have been combined + + +class LLMBackend(ABC): + """Abstract base class for Language Model backends.""" + + @abstractmethod + def generate_text(self, prompt: 
str, characters: List['Character']) -> List[Tuple[Character, str]]: + """ + Generate text based on a given prompt. + + Args: + prompt (str): The input prompt for text generation. + + Returns: + List[Tuple[str, str]]: A list of tuples containing speaker and text. + """ + pass + + +class SyncTTSBackend(ABC): + """Protocol for synchronous Text-to-Speech backends.""" + + name: str + + @abstractmethod + def text_to_speech(self, text: str, character: 'Character') -> Path: + """ + Convert text to speech synchronously. + + Args: + text (str): The text to convert to speech. + character (Character): The character for which to generate speech. + + Returns: + Path: Path to the generated audio file. + """ + pass + + +class AsyncTTSBackend(ABC): + """Protocol for asynchronous Text-to-Speech backends.""" + + name: str + + @abstractmethod + async def async_text_to_speech(self, text: str, character: 'Character') -> Path: + """ + Convert text to speech asynchronously. + + Args: + text (str): The text to convert to speech. + character (Character): The character for which to generate speech. + + Returns: + Path: Path to the generated audio file. 
+ """ + pass + + +class TranscriptSegment: + """Represents a segment of the podcast transcript.""" + + def __init__(self, text: str, speaker: Character, tts_args: Optional[Dict[str, Any]] = None): + self.text = text + self.speaker = speaker + self.tts_args = tts_args or {} + + +class Transcript: + """Represents the full transcript of a podcast.""" + + def __init__(self, segments: List[TranscriptSegment], metadata: Dict[str, Any]): + self.segments = segments + self.metadata = metadata + + def save(self, filepath: str, format: str = "plaintext"): + """Save the transcript to a file.""" + with open(filepath, 'w') as f: + f.write(str(self)) + + def __str__(self) -> str: + """Convert the transcript to a string representation.""" + lines = [] + for segment in self.segments: + lines.append(f"{segment.speaker.name}: {segment.text}") + + metadata_str = "\n".join([f"{key}: {value}" for key, value in self.metadata.items()]) + + return f"Metadata:\n{metadata_str}\n\nTranscript:\n" + "\n".join(lines) + + +class AudioSegment: + """Represents an audio segment of the podcast.""" + + def __init__(self, filepath: Path, length_ms: int, transcript_segment: Optional[TranscriptSegment] = None): + self.filepath = filepath + self.length_ms = length_ms + self.transcript_segment = transcript_segment + self._audio: Optional[PydubAudioSegment] = None + + @property + def audio(self) -> PydubAudioSegment: + """Lazy-load the audio segment.""" + if self._audio is None: + self._audio = PydubAudioSegment.from_file(self.filepath) + if len(self._audio) != self.length_ms: + raise ValueError( + f"Audio file length ({len(self._audio)}ms) does not match specified length ({self.length_ms}ms)") + return self._audio + + +class AudioManager: + def __init__(self, tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]], n_jobs: int = 1): + self.tts_backends = tts_backends + self.n_jobs = n_jobs + self.audio_segments = [] + self.final_audio = None + + async def _async_build_audio_segments(self, 
transcript: Transcript) -> List[AudioSegment]: + async def process_segment(segment: TranscriptSegment): + tts_backend = self.tts_backends[segment.speaker.preferred_tts] + audio_file = await tts_backend.async_text_to_speech(segment.text, segment.speaker) + return AudioSegment(audio_file, len(PydubAudioSegment.from_file(audio_file)), segment) + + semaphore = asyncio.Semaphore(self.n_jobs) + + async def bounded_process_segment(segment): + async with semaphore: + return await process_segment(segment) + + tasks = [asyncio.create_task(bounded_process_segment(segment)) for segment in transcript.segments] + return await asyncio.gather(*tasks) + + def _sync_build_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: + def process_segment(segment: TranscriptSegment): + tts_backend = self.tts_backends[segment.speaker.preferred_tts] + audio_file = tts_backend.text_to_speech(segment.text, segment.speaker) + return AudioSegment(audio_file, len(PydubAudioSegment.from_file(audio_file)), segment) + + with ThreadPoolExecutor(max_workers=self.n_jobs) as executor: + return list(executor.map(process_segment, transcript.segments)) + + def create_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: + if any(isinstance(backend, AsyncTTSBackend) for backend in self.tts_backends.values()): + return asyncio.run(self._async_build_audio_segments(transcript)) + else: + return self._sync_build_audio_segments(transcript) + + def stitch_audio_segments(self): + self.final_audio = sum([segment.audio for segment in self.audio_segments]) + + +def podcast_stage(func): + """Decorator to manage podcast stage transitions.""" + + @wraps(func) + def wrapper(self, *args, **kwargs): + current_method = self._next_stage_methods[self.state] + if current_method != func and not self._reworking: + print(f"Cannot execute {func.__name__} in current state {self.state.name}. 
Skipping.") + return + + try: + result = func(self, *args, **kwargs) + next_state = next((state for state, method in self._next_stage_methods.items() if method == func), None) + self.state = next_state or self.state + return result + except Exception as e: + print(f"Error in {func.__name__}: {str(e)}") + raise + + return wrapper + + +class Podcast: + """Main class for podcast creation and management.""" + + def __init__(self, content: str, llm_backend: LLMBackend, + tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], + characters: List[Character], default_tts_n_jobs: int = 1): + """ + Initialize a new Podcast instance. + + Args: + content (str): The raw content to be processed into a podcast. + llm_backend (LLMBackend): The language model backend for generating the transcript. + tts_backends (Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]]): Dictionary of available TTS backends. + characters (List[Character]): List of characters participating in the podcast. + default_tts_n_jobs (int, optional): The default number of concurrent jobs for TTS processing. + Defaults to 1. + + Raises: + ValueError: If a character's preferred TTS backend is not available. 
+ """ + self.content = content + self.llm_backend = llm_backend + self.tts_backends = {backend.name: backend for backend in tts_backends} + self.characters = {char.name: char for char in characters} + self.default_tts_n_jobs = default_tts_n_jobs + self.state = PodcastState.INITIALIZED + self._reworking = False + self.audio_manager = AudioManager(self.tts_backends, self.default_tts_n_jobs) + + # Initialize attributes with null values + self.transcript = None + self.audio_segments = [] + self.audio = None + + # Define the sequence of methods to be called for each stage + self._next_stage_methods: Dict[PodcastState, Callable[[], None]] = { + PodcastState.INITIALIZED: self.build_transcript, + PodcastState.TRANSCRIPT_BUILT: self.build_audio_segments, + PodcastState.AUDIO_SEGMENTS_BUILT: self.stitch_audio_segments, + } + + @classmethod + def from_transcript(cls, transcript: Union[Sequence[Tuple[str, str]], Transcript], + tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], characters: List[Character], + default_tts_n_jobs: int = 1) -> 'Podcast': + """ + Create a Podcast instance from a pre-existing transcript. + + Args: + transcript (Union[Sequence[Tuple[str, str]], Transcript]): Pre-existing transcript. + tts_backends (Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]]): Dictionary of available TTS backends. + characters (List[Character]): List of characters participating in the podcast. + default_tts_n_jobs (int, optional): The default number of concurrent jobs for TTS processing. + Defaults to 1. + + Returns: + Podcast: A new Podcast instance with the transcript built and ready for audio generation. 
+ """ + podcast = cls("", None, list(tts_backends.values()), characters, default_tts_n_jobs=default_tts_n_jobs) + if isinstance(transcript, Transcript): + podcast.transcript = transcript + else: + raise ValueError("Transcript must be a Transcript instance") # unimplemented + podcast.state = PodcastState.TRANSCRIPT_BUILT + return podcast + + def reset_to_state(self, state: PodcastState): + """Reset the podcast to a specific state.""" + self.state = state + self.transcript = None if state.value < PodcastState.TRANSCRIPT_BUILT.value else self.transcript + self.audio_segments = [] if state.value < PodcastState.AUDIO_SEGMENTS_BUILT.value else self.audio_segments + self.audio = None if state.value < PodcastState.STITCHED.value else self.audio + + @contextmanager + def rework(self, target_state: PodcastState, auto_finalize: bool = True): + """Context manager for reworking the podcast from a specific state.""" + original_state = self.state + self._reworking = True + + if target_state.value < self.state.value: + print(f"Rewinding from {self.state.name} to {target_state.name}") + self.reset_to_state(target_state) + + try: + yield + finally: + self._reworking = False + if self.state.value < original_state.value: + print( + f"Warning: Podcast is now in an earlier state ({self.state.name}) than before reworking ({original_state.name}). 
You may want to call finalize() to rebuild.") + if auto_finalize: + self.finalize() + + @podcast_stage + def build_transcript(self) -> None: + """Build the podcast transcript using the LLM backend.""" + character_prompts = "\n\n".join([char.to_prompt() for char in self.characters.values()]) + full_prompt = f"{self.content}\n\nCharacters:\n{character_prompts}" + generated_segments = self.llm_backend.generate_text(full_prompt, list(self.characters.values())) + + segments = [TranscriptSegment(text, speaker, self.characters[speaker]) + for speaker, text in generated_segments if speaker in self.characters] + + self.transcript = Transcript(segments, {"source": "Generated content"}) + + @podcast_stage + def build_audio_segments(self, n_jobs: Optional[int] = None) -> None: + """Build audio segments from the transcript.""" + self.audio_segments = self.audio_manager.create_audio_segments(self.transcript) + + @podcast_stage + def stitch_audio_segments(self) -> None: + """Stitch all audio segments together to form the final podcast audio.""" + self.audio = sum([segment.audio for segment in self.audio_segments]) + + def _build_next_stage(self) -> bool: + """Build the next stage of the podcast.""" + if self.state == PodcastState.STITCHED: + return False + + next_method = self._next_stage_methods[self.state] + next_method() + return True + + def finalize(self) -> None: + """Finalize the podcast by building all remaining stages.""" + while self._build_next_stage(): + pass + + def save(self, filepath: str) -> None: + """Save the finalized podcast audio to a file.""" + if self.state != PodcastState.STITCHED: + raise ValueError("Podcast can only be saved after audio is stitched") + + if self.audio: + self.audio.export(filepath, format="mp3") + else: + raise ValueError("No stitched audio to save") + + def save_transcript(self, filepath: str, format: str = "plaintext") -> None: + """Save the podcast transcript to a file.""" + if self.state < PodcastState.TRANSCRIPT_BUILT: + raise 
ValueError("Transcript can only be saved after it is built") + + if self.transcript: + self.transcript.save(filepath, format) + else: + raise ValueError("No transcript to save") + + +# Usage example: Step-by-step podcast creation +if __name__ == "__main__": + from tempfile import NamedTemporaryFile + + + class DummyLLMBackend(LLMBackend): + def generate_text(self, prompt: str, characters: List[Character]) -> List[Tuple[str, str]]: + return [("Host", "Welcome to our podcast!"), ("Guest", "Thanks for having me!")] + + + class DummyTTSBackend(SyncTTSBackend): + def __init__(self, name: str): + self.name = name + + def text_to_speech(self, text: str, character: Character) -> Path: + with NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file: + PydubAudioSegment.silent(duration=1000).export(temp_file.name, format="mp3") + return Path(temp_file.name) + + + # Define TTS backends + openai_tts = DummyTTSBackend("openai") + elevenlabs_tts = DummyTTSBackend("elevenlabs") + + # Define TTS backends + + # Define characters + host = Character( + name="Host", + role="Podcast host", + tts_configs={ + "openai": {"voice": "en-US-Neural2-F", "backend": "openai", "extra_args": {"speaking_rate": 1.0}}, + "elevenlabs": {"voice": "Rachel", "backend": "elevenlabs", "extra_args": {"stability": 0.5}} + }, + default_description_for_llm="{name} is an enthusiastic podcast host. Speaks clearly and engagingly." + ) + guest = Character( + name="Guest", + role="Expert guest", + tts_configs={"openai": {"voice": "en-US-Neural2-D", "backend": "openai", "extra_args": {"pitch": -2.0}}, + "elevenlabs": {"voice": "Antoni", "backend": "elevenlabs", "extra_args": {"stability": 0.8}}}, + default_description_for_llm="{name} is an expert guest. Shares knowledge in a friendly manner." + ) + + # Initialize the podcast + podcast = Podcast( + content=""" + This is a sample content for our podcast. + It includes information from multiple sources that have already been parsed. 
+ """, + llm_backend=DummyLLMBackend(), + tts_backends=[openai_tts, elevenlabs_tts], + characters=[host, guest], + ) + print(f"Initial state: {podcast.state}") + + # Step 1: Build transcript + podcast.build_transcript() + print(f"After building transcript: {podcast.state}") + print(f"Transcript: {podcast.transcript}") + + # Step 2: Build audio segments + podcast.build_audio_segments() + print(f"After building audio segments: {podcast.state}") + print(f"Number of audio segments: {len(podcast.audio_segments)}") + + # Step 3: Stitch audio segments + podcast.stitch_audio_segments() + print(f"After stitching audio: {podcast.state}") + + # Rework example: modify the transcript and rebuild (auto_finalize is True by default) + with podcast.rework(PodcastState.TRANSCRIPT_BUILT): + print(f"Inside rework context, state: {podcast.state}") + podcast.transcript.segments.append( + TranscriptSegment("This is a new segment", "Host", podcast.characters["Host"])) + print("Added new segment to transcript") + + # Rebuild audio segments and stitch + podcast.build_audio_segments() + + print(f"After rework: {podcast.state}") + + # Add a new audio segment (auto_finalize is True by default) + with NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file: + PydubAudioSegment.silent(duration=500).export(temp_file.name, format="mp3") + + with podcast.rework(PodcastState.AUDIO_SEGMENTS_BUILT): + new_segment = AudioSegment(Path(temp_file.name), 500, + TranscriptSegment("New audio segment", "Host", podcast.characters["Host"])) + podcast.audio_segments.insert(0, new_segment) + + # Save the final podcast + podcast.save("./final.mp3") + podcast.save_transcript("./final.txt", format="plaintext") + print("Saved podcast and transcript") + + # Example with pre-existing transcript using from_transcript class method + pre_existing_transcript = [ + ("Host", "Welcome to our podcast created from a pre-existing transcript!"), + ("Guest", "Thank you for having me. 
I'm excited to be here.") + ] + + podcast_from_transcript = Podcast.from_transcript( + transcript=pre_existing_transcript, + tts_backends=[openai_tts, elevenlabs_tts], + characters=[host, guest] + ) + + print(f"Podcast created from transcript initial state: {podcast_from_transcript.state}") + print(f"Transcript: {podcast_from_transcript.transcript}") + + # Finalize the podcast (this will skip transcript generation and move directly to audio generation) + podcast_from_transcript.finalize() + podcast_from_transcript.save("./from_transcript.mp3") + print("Saved podcast created from transcript") diff --git a/podcastfy/tts_backends.py b/podcastfy/tts_backends.py new file mode 100644 index 00000000..08a02e42 --- /dev/null +++ b/podcastfy/tts_backends.py @@ -0,0 +1,62 @@ +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Dict, Any +from podcastfy.character import Character + +class TTSBackend(ABC): + @abstractmethod + def text_to_speech(self, text: str, character: Character) -> Path: + """ + Convert text to speech. + + Args: + text (str): The text to convert to speech. + character (Character): The character for which to generate speech. + + Returns: + Path: Path to the generated audio file. 
+ """ + pass + +class ElevenLabsTTS(TTSBackend): + def __init__(self, api_key: str, config: Dict[str, Any]): + self.api_key = api_key + self.config = config + + def text_to_speech(self, text: str, character: Character) -> Path: + # Placeholder for ElevenLabs TTS implementation + voice = character.get_tts_args('elevenlabs').get('voice', self.config['default_voice']) + + print(f"ElevenLabs TTS: Converting text to speech for character {character.name} with voice {voice}") + + # In a real implementation, this would call the ElevenLabs API and return the path to the generated audio file + return Path(f"/tmp/{character.name}_audio.mp3") + +class OpenAITTS(TTSBackend): + def __init__(self, api_key: str, config: Dict[str, Any]): + self.api_key = api_key + self.config = config + + def text_to_speech(self, text: str, character: Character) -> Path: + # Placeholder for OpenAI TTS implementation + voice = character.get_tts_args('openai').get('voice', self.config['default_voice']) + + print(f"OpenAI TTS: Converting text to speech for character {character.name} with voice {voice}") + + # In a real implementation, this would call the OpenAI API and return the path to the generated audio file + return Path(f"/tmp/{character.name}_audio.mp3") + +# Example usage: +if __name__ == "__main__": + from podcastfy.utils.config import load_config + + config = load_config() + elevenlabs_tts = ElevenLabsTTS(config.ELEVENLABS_API_KEY, config.get('text_to_speech', {}).get('elevenlabs', {})) + openai_tts = OpenAITTS(config.OPENAI_API_KEY, config.get('text_to_speech', {}).get('openai', {})) + + dummy_character = Character("John", "host", { + 'elevenlabs': {'voice': 'en-US-JohnNeural'}, + 'openai': {'voice': 'en-US-Neural2-C'} + }, "A friendly podcast host") + + elevenlabs_tts.text_to_speech("Hello, welcome to the podcast!", dummy_character) From 36bb5e9d49af3759aec0c7b6fc50d9562d00b677 Mon Sep 17 00:00:00 2001 From: bruno Date: Sun, 13 Oct 2024 23:42:10 +0200 Subject: [PATCH 03/49] some progress 
but not yet --- podcastfy/character.py | 15 ++--- podcastfy/tts_backends.py | 135 +++++++++++++++++++++++++++++--------- 2 files changed, 110 insertions(+), 40 deletions(-) diff --git a/podcastfy/character.py b/podcastfy/character.py index 92419bf9..f225ae4e 100644 --- a/podcastfy/character.py +++ b/podcastfy/character.py @@ -2,17 +2,17 @@ from pydantic import BaseModel - -class TTSConfig(BaseModel): +class VoiceConfig(BaseModel): voice: str - backend: str extra_args: Dict[str, Any] +class TTSConfig(VoiceConfig): + backend: str + class Character: """Represents a character in the podcast.""" def __init__(self, name: str, role: str, tts_configs: Dict[str, TTSConfig] = {}, default_description_for_llm: str = ""): - # note: in the future the last two arguments are not optional self.name = name self.role = role self.tts_configs = tts_configs @@ -28,10 +28,7 @@ def to_prompt(self) -> str: """Convert the character information to a prompt for the LLM.""" return f"Character: {self.name}\nRole: {self.role}\n{self.default_description_for_llm.format(name=self.name)}" - def get_tts_args(self, tts_name: Optional[str] = None) -> Dict[str, Any]: + def get_tts_args(self, tts_name: Optional[str] = None) -> TTSConfig: """Get the TTS arguments for this character.""" tts_name = tts_name or self.preferred_tts - tts_config = self.tts_configs[tts_name] - return { - "voice": tts_config["voice"], - **tts_config["extra_args"]} + return self.tts_configs[tts_name] diff --git a/podcastfy/tts_backends.py b/podcastfy/tts_backends.py index 08a02e42..dc53859a 100644 --- a/podcastfy/tts_backends.py +++ b/podcastfy/tts_backends.py @@ -1,62 +1,135 @@ +import os +import uuid from abc import ABC, abstractmethod from pathlib import Path -from typing import Dict, Any -from podcastfy.character import Character +from tempfile import TemporaryFile, TemporaryDirectory +from typing import Dict, Any, List, ClassVar +import asyncio + +import openai + +from podcastfy.character import Character, VoiceConfig +import 
edge_tts +from elevenlabs import client as elevenlabs_client class TTSBackend(ABC): + name: ClassVar[str] = "" + default_voices: ClassVar[List[VoiceConfig]] = [] + + @classmethod + def set_default_voices(cls, voices: List[VoiceConfig]): + """ + Set the default voices for the TTS backend. + """ + cls.default_voices = voices + @abstractmethod - def text_to_speech(self, text: str, character: Character) -> Path: + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: """ Convert text to speech. Args: text (str): The text to convert to speech. character (Character): The character for which to generate speech. + output_path (Path): The path where the audio file should be saved. Returns: - Path: Path to the generated audio file. + Path: Path to the generated audio file (same as output_path). """ pass class ElevenLabsTTS(TTSBackend): - def __init__(self, api_key: str, config: Dict[str, Any]): - self.api_key = api_key - self.config = config - - def text_to_speech(self, text: str, character: Character) -> Path: - # Placeholder for ElevenLabs TTS implementation - voice = character.get_tts_args('elevenlabs').get('voice', self.config['default_voice']) - - print(f"ElevenLabs TTS: Converting text to speech for character {character.name} with voice {voice}") - - # In a real implementation, this would call the ElevenLabs API and return the path to the generated audio file - return Path(f"/tmp/{character.name}_audio.mp3") + name: str = "elevenlabs" + + def __init__(self, api_key: str = None): + self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY") + + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + # TODO, would be nicer to get a filepath directly from the client + config = character.get_tts_args('elevenlabs') + client = elevenlabs_client.ElevenLabs(api_key=self.api_key) # # client could be reused + content = client.generate( + text=text, + voice=config.voice, + model=config.extra_args.get('model', 
'default') + ) + with open(output_path, "wb") as out: + for chunk in content: + if chunk: + out.write(chunk) + return output_path class OpenAITTS(TTSBackend): - def __init__(self, api_key: str, config: Dict[str, Any]): - self.api_key = api_key - self.config = config + name: str = "openai" + def __init__(self, api_key: str): + self.api_key = api_key or os.getenv("OPENAI_API_KEY") + + def ensure_characters_tts_config_is_valid(self, character:Character) -> None: + # TODO: maybe that should be in the ABC class + tts_config = character.tts_configs.get('openai') + if not tts_config: + raise ValueError(f"Character '{character.name}' does not have OpenAI TTS configuration") + # ensure there is a key model in the extra_args + if 'model' not in tts_config.extra_args: + raise ValueError(f"Character '{character.name}' does not have the 'model' key in the OpenAI TTS configuration") - def text_to_speech(self, text: str, character: Character) -> Path: + + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + # TODO, would be nicer to get a filepath directly from the client. If not given takes tempdir from the config ? 
+ self.ensure_characters_tts_config_is_valid(character) # Placeholder for OpenAI TTS implementation - voice = character.get_tts_args('openai').get('voice', self.config['default_voice']) - - print(f"OpenAI TTS: Converting text to speech for character {character.name} with voice {voice}") - - # In a real implementation, this would call the OpenAI API and return the path to the generated audio file - return Path(f"/tmp/{character.name}_audio.mp3") + config = character.get_tts_args('openai') + + print(f"OpenAI TTS: Converting text to speech for character {character.name} with voice {config.voice}") + response = openai.audio.speech.create( + model=config.extra_args["model"], + voice=config.voice, + input=text + ) + with open(output_path, "wb") as file: + file.write(response.content) + return output_path + +class EdgeTTS(TTSBackend): + name: str = "edge-tts" + + + def __init__(self): + pass + + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + config = character.get_tts_args('edge-tts') + + async def edge_tts_conversion(text: str, output_path: str, voice: str): + communicate = edge_tts.Communicate(text, voice) + await communicate.save(output_path) + + asyncio.run(edge_tts_conversion(text, str(output_path), config.voice)) + + return output_path + + + def ensure_characters_tts_config_is_valid(self, character: Character) -> None: + tts_config = character.tts_configs.get('edge-tts') + if not tts_config: + raise ValueError(f"Character '{character.name}' does not have Edge TTS configuration") # Example usage: if __name__ == "__main__": from podcastfy.utils.config import load_config - + config = load_config() elevenlabs_tts = ElevenLabsTTS(config.ELEVENLABS_API_KEY, config.get('text_to_speech', {}).get('elevenlabs', {})) openai_tts = OpenAITTS(config.OPENAI_API_KEY, config.get('text_to_speech', {}).get('openai', {})) - + # edge_tts = EdgeTTS() + dummy_character = Character("John", "host", { 'elevenlabs': {'voice': 'en-US-JohnNeural'}, - 
'openai': {'voice': 'en-US-Neural2-C'} + 'openai': {'voice': 'en-US-Neural2-C'}, + 'edge-tts': {'voice': 'en-US-ChristopherNeural'} }, "A friendly podcast host") - - elevenlabs_tts.text_to_speech("Hello, welcome to the podcast!", dummy_character) + + output_dir = Path("output") + output_dir.mkdir(exist_ok=True) + output_path = output_dir / f"{dummy_character.name}_{uuid.uuid4().hex}.mp3" + elevenlabs_tts.text_to_speech("Hello, welcome to the podcast!", dummy_character, output_path) From 386c9fc21cbf3c427fe06218feefb6d4a88a500f Mon Sep 17 00:00:00 2001 From: bruno Date: Mon, 14 Oct 2024 12:34:20 +0200 Subject: [PATCH 04/49] update --- podcastfy/aiengines/__init__.py | 0 podcastfy/aiengines/llm/base.py | 22 ++ .../llm/legacy_gemini_langchain.py} | 26 +- podcastfy/aiengines/tts/base.py | 115 +++++++ podcastfy/aiengines/tts/tts_backends.py | 94 ++++++ podcastfy/client_v2.py | 239 ++++++++++++++ podcastfy/core/__init__.py | 0 podcastfy/core/audio.py | 91 ++++++ podcastfy/{ => core}/character.py | 16 +- podcastfy/core/podcast.py | 298 ++++++------------ podcastfy/core/transcript.py | 95 ++++++ podcastfy/core/tts_configs.py | 12 + podcastfy/tts_backends.py | 135 -------- 13 files changed, 796 insertions(+), 347 deletions(-) create mode 100644 podcastfy/aiengines/__init__.py create mode 100644 podcastfy/aiengines/llm/base.py rename podcastfy/{content_generator.py => aiengines/llm/legacy_gemini_langchain.py} (87%) create mode 100644 podcastfy/aiengines/tts/base.py create mode 100644 podcastfy/aiengines/tts/tts_backends.py create mode 100644 podcastfy/client_v2.py create mode 100644 podcastfy/core/__init__.py create mode 100644 podcastfy/core/audio.py rename podcastfy/{ => core}/character.py (73%) create mode 100644 podcastfy/core/transcript.py create mode 100644 podcastfy/core/tts_configs.py delete mode 100644 podcastfy/tts_backends.py diff --git a/podcastfy/aiengines/__init__.py b/podcastfy/aiengines/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/podcastfy/aiengines/llm/base.py b/podcastfy/aiengines/llm/base.py new file mode 100644 index 00000000..fe96dcb3 --- /dev/null +++ b/podcastfy/aiengines/llm/base.py @@ -0,0 +1,22 @@ +from abc import ABC, abstractmethod +from typing import List, Tuple + +from podcastfy.core.character import Character + + +class LLMBackend(ABC): + """Abstract base class for Language Model backends.""" + # TODO a nice mixin/helper could be made to load prompt templates from conf file (both podcast settings and character settings) + + @abstractmethod + def generate_transcript(self, prompt: str, characters: List[Character]) -> List[Tuple[Character, str]]: + """ + Generate text based on a given prompt. + + Args: + prompt (str): The input prompt for text generation. + + Returns: + List[Tuple[Character, str]]: A list of tuples containing speaker and text. + """ + pass diff --git a/podcastfy/content_generator.py b/podcastfy/aiengines/llm/legacy_gemini_langchain.py similarity index 87% rename from podcastfy/content_generator.py rename to podcastfy/aiengines/llm/legacy_gemini_langchain.py index e9796c9f..4e08b0af 100644 --- a/podcastfy/content_generator.py +++ b/podcastfy/aiengines/llm/legacy_gemini_langchain.py @@ -7,16 +7,14 @@ """ import os -from typing import Optional, Dict, Any, List +from typing import Optional, Dict, Any, List, Tuple -#from langchain_google_vertexai import ChatVertexAI from langchain_google_genai import ChatGoogleGenerativeAI -from langchain_core.prompts import ChatPromptTemplate from langchain_core.output_parsers import StrOutputParser from langchain import hub -from podcastfy.character import Character -from podcastfy.core.podcast import LLMBackend +from podcastfy.core.character import Character +from podcastfy.aiengines.llm.base import LLMBackend from podcastfy.utils.config_conversation import load_conversation_config from podcastfy.utils.config import load_config import logging @@ -111,8 +109,20 @@ def __init__(self, api_key: str, conversation_config: 
Optional[Dict[str, Any]] = """ self.content_generator = ContentGenerator(api_key, conversation_config) - def generate_text(self, input_text: str, characters: List[Character]) -> str: - return self.content_generator.generate_qa_content(input_text, output_filepath=None, characters=characters) + def generate_transcript(self, prompt: str, characters: List[Character]) -> List[Tuple[Character, str]]: + content = self.content_generator.generate_qa_content(prompt, output_filepath=None, characters=characters) + + # Parse the generated content into the required format + transcript = [] + for line in content.split('\n'): + if ':' in line: + speaker_name, text = line.split(':', 1) + speaker = next((char for char in characters if char.name == speaker_name.strip()), None) + if speaker: + transcript.append((speaker, text.strip())) + + return transcript + def main(seed: int = 42) -> None: @@ -151,7 +161,7 @@ def main(seed: int = 42) -> None: Character(name="Speaker 1", role=config_conv.get('roles_person1')), Character(name="Speaker 2", role=config_conv.get('roles_person2')), ] - response = content_generator.generate_text(input_text, characters) + response = content_generator.generate_transcript(input_text, characters) # Print the generated Q&A content print("Generated Q&A Content:") diff --git a/podcastfy/aiengines/tts/base.py b/podcastfy/aiengines/tts/base.py new file mode 100644 index 00000000..7b88c290 --- /dev/null +++ b/podcastfy/aiengines/tts/base.py @@ -0,0 +1,115 @@ +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Dict, Any, List + +import yaml + +from podcastfy.core.character import Character +from podcastfy.core.tts_configs import TTSConfig + + +class SyncTTSBackend(ABC): + """Protocol for synchronous Text-to-Speech backends.""" + + name: str + + @abstractmethod + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + """ + Convert text to speech synchronously. 
+ + Args: + text (str): The text to convert to speech. + character (Character): The character for which to generate speech. + output_path (Path): The path to save the generated audio file. + + Returns: + Path: The path to the generated audio file. + """ + pass + + +class AsyncTTSBackend(ABC): + """Protocol for asynchronous Text-to-Speech backends.""" + + name: str + + @abstractmethod + async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + """ + Convert text to speech asynchronously. + + Args: + text (str): The text to convert to speech. + character (Character): The character for which to generate speech. + output_path (Path): The path to save the generated audio file. + + Returns: + Path: The path to the generated audio file. + """ + pass +class TTSConfigMixin: + """Mixin class to manage TTS external configurations.""" + + def __init__(self, config_file: str = 'podcastfy/config.yaml', name: str = "") -> None: + # TODO: probably bad config files for final client + self.name = name + self.config_file = config_file + self.default_configs = self._load_default_configs() + self.tts_config_call_count = 0 + self.character_tts_mapping = {} + + def _load_default_configs(self) -> Dict[str, Any]: + with open(self.config_file, 'r') as f: + config = yaml.safe_load(f) + tts_config = config.get('text_to_speech', {}) + return tts_config.get(self.name, {}) + + def get_default_config(self) -> Dict[str, Any]: + return self.default_configs + + def update_default_config(self, new_config: Dict[str, Any]) -> None: + self.default_configs.update(new_config) + + def tts_config_for_character(self, character: Character) -> TTSConfig: + # todo a bit constrained by the fact that the config has just the question and answer fields + if character.name in self.character_tts_mapping: + return self.character_tts_mapping[character.name] + + # Check if the character has a TTS config for this backend + if self.name in character.tts_configs: + tts_config = 
character.tts_configs[self.name] + else: + # If not, use the default config + default_voices = self.default_configs.get('default_voices', {}) + if self.tts_config_call_count == 0: + voice = default_voices['question'] + else: + voice = default_voices['answer'] + model = self.default_configs.get('model') + self.tts_config_call_count += 1 + + tts_config = TTSConfig( + voice=voice, + backend=self.name, + extra_args={"model": model} if model else {} + ) + + # Merge the default config with the character-specific config + merged_config = TTSConfig( + voice=tts_config.voice or self.default_configs.get('default_voices', {}).get('question' if self.tts_config_call_count == 1 else 'answer', ''), + backend=self.name, + extra_args={**self.default_configs.get('extra_args', {}), **tts_config.extra_args} + ) + + self.character_tts_mapping[character.name] = merged_config + return merged_config + + # This line is no longer needed as we always return a merged config + + def preload_character_tts_mapping(self, characters: List[Character]) -> None: + for character in characters: + self.tts_config_for_character(character) + + def get_character_tts_mapping(self) -> Dict[str, TTSConfig]: + return self.character_tts_mapping diff --git a/podcastfy/aiengines/tts/tts_backends.py b/podcastfy/aiengines/tts/tts_backends.py new file mode 100644 index 00000000..0b2d389c --- /dev/null +++ b/podcastfy/aiengines/tts/tts_backends.py @@ -0,0 +1,94 @@ +import os +import uuid +from abc import abstractmethod +from pathlib import Path +from tempfile import TemporaryFile, TemporaryDirectory +from typing import Dict, Any, List, ClassVar +import asyncio + +import openai + +import edge_tts +from elevenlabs import client as elevenlabs_client + +from podcastfy.aiengines.tts.base import SyncTTSBackend, TTSConfigMixin, AsyncTTSBackend +from podcastfy.core.character import Character + + +class ElevenLabsTTS(SyncTTSBackend, TTSConfigMixin): + name: str = "elevenlabs" + + def __init__(self, api_key: str = None, 
config_file: str = 'podcastfy/config.yaml'): + # TODO: not the right path for final client + TTSConfigMixin.__init__(self, config_file) + self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY") + + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + config = self.tts_config_for_character(character) + client = elevenlabs_client.ElevenLabs(api_key=self.api_key) # # client could be reused + content = client.generate( + text=text, + voice=config.voice, + model=config.extra_args.get('model', self.get_default_config().get('model', 'default')) + ) + with open(output_path, "wb") as out: + for chunk in content: + if chunk: + out.write(chunk) + return output_path + + +class OpenAITTS(SyncTTSBackend, TTSConfigMixin): + name: str = "openai" + + def __init__(self, api_key: str = None, config_file: str = 'podcastfy/config.yaml'): + TTSConfigMixin.__init__(self, config_file, name=self.name) + self.api_key = api_key or os.getenv("OPENAI_API_KEY") + + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + config = self.tts_config_for_character(character) + + print(f"OpenAI TTS: Converting text to speech for character {character.name} with voice {config.voice} \n text: {text}") + model = config.extra_args.get('model', self.get_default_config().get('model', 'tts-1')) + response = openai.audio.speech.create( + model=model, + voice=config.voice, + input=text + ) + with open(output_path, "wb") as file: + file.write(response.content) + return output_path + + +class EdgeTTS(AsyncTTSBackend, TTSConfigMixin): + name: str = "edge-tts" + + def __init__(self, config_file: str = 'podcastfy/config.yaml'): + TTSConfigMixin.__init__(self, config_file) + + async def text_to_speech(self, text: str, character: Character, output_path: Path) -> None: + config = self.tts_config_for_character(character) + communicate = edge_tts.Communicate(text, config.voice) + await communicate.save(output_path) + return output_path + + async def 
async_text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + return await self.text_to_speech(text, character, output_path) + + + +# Example usage: +if __name__ == "__main__": + from podcastfy.utils.config import load_config + + config = load_config() + elevenlabs_tts = ElevenLabsTTS(config.ELEVENLABS_API_KEY) + openai_tts = OpenAITTS(config.OPENAI_API_KEY) + edge_tts = EdgeTTS() + + dummy_character1 = Character("character1", "host", {}, "A friendly podcast host") + dummy_character2 = Character("character2", "guest", {}, "An expert guest") + + output_dir = Path("output") + output_dir.mkdir(exist_ok=True) + diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py new file mode 100644 index 00000000..81fdd4fc --- /dev/null +++ b/podcastfy/client_v2.py @@ -0,0 +1,239 @@ +import os +import uuid +import typer +from pathlib import Path +from typing import List, Optional, Dict, Any, Union + +from podcastfy.aiengines.llm.legacy_gemini_langchain import DefaultPodcastifyTranscriptEngine +from podcastfy.aiengines.tts.tts_backends import OpenAITTS, ElevenLabsTTS, EdgeTTS +from podcastfy.core.character import Character +from podcastfy.core.podcast import Podcast, SyncTTSBackend, AsyncTTSBackend +from podcastfy.core.transcript import Transcript +from podcastfy.content_parser.content_extractor import ContentExtractor +from podcastfy.core.tts_configs import TTSConfig +from podcastfy.utils.config import Config, load_config +from podcastfy.utils.logger import setup_logger + +logger = setup_logger(__name__) + +app = typer.Typer() + +def create_characters(config: Dict[str, Any]) -> List[Character]: + host = Character( + name="Host", + role="Podcast host", + tts_configs={ + "openai": TTSConfig(voice=config["text_to_speech"]["openai"]["default_voices"]["question"], backend="openai"), + "elevenlabs": TTSConfig(voice=config["text_to_speech"]["elevenlabs"]["default_voices"]["question"], backend="elevenlabs"), + }, + default_description_for_llm="{name} is an 
enthusiastic podcast host. Speaks clearly and engagingly." + ) + + guest = Character( + name="Guest", + role="Expert guest", + tts_configs={ + "openai": TTSConfig(voice=config["text_to_speech"]["openai"]["default_voices"]["answer"], backend="openai"), + "elevenlabs": TTSConfig(voice=config["text_to_speech"]["elevenlabs"]["default_voices"]["answer"], backend="elevenlabs"), + }, + default_description_for_llm="{name} is an expert guest. Shares knowledge in a friendly manner." + ) + + return [host, guest] + +def create_tts_backends(config: Config) -> List[Union[SyncTTSBackend, AsyncTTSBackend]]: + return [ + OpenAITTS(api_key=config.OPENAI_API_KEY), + ElevenLabsTTS(api_key=config.ELEVENLABS_API_KEY), + EdgeTTS() + ] + +def process_links( + links: List[str], + transcript_file: Optional[str] = None, + tts_model: str = "openai", # could be removed now ? + generate_audio: bool = True, + config: Optional[Config] = None, + conversation_config: Optional[Dict[str, Any]] = None +) -> Podcast: + if config is None: + config = load_config() + characters = create_characters(config.config) + tts_backends = create_tts_backends(config) + if transcript_file: + logger.info(f"Using transcript file: {transcript_file}") + transcript = Transcript.load(transcript_file, {char.name: char for char in characters}) + podcast = Podcast.from_transcript(transcript, tts_backends, characters) + else: + logger.info(f"Processing {len(links)} links") + content_extractor = ContentExtractor(config.JINA_API_KEY) + content_generator = DefaultPodcastifyTranscriptEngine(config.GEMINI_API_KEY, conversation_config) + + contents = [content_extractor.extract_content(link) for link in links] + combined_content = "\n\n".join(contents) + + llm_backend = content_generator # Assuming ContentGenerator implements the LLMBackend interface + + podcast = Podcast( + content=combined_content, + llm_backend=llm_backend, + tts_backends=tts_backends, + characters=characters, + ) + + if generate_audio: + podcast.finalize() + 
else: + podcast.build_transcript() + + return podcast + + +@app.command() +def main( + urls: List[str] = typer.Option(None, "--url", "-u", help="URLs to process"), + file: typer.FileText = typer.Option(None, "--file", "-f", help="File containing URLs, one per line"), + transcript: typer.FileText = typer.Option(None, "--transcript", "-t", help="Path to a transcript file"), + tts_model: str = typer.Option(None, "--tts-model", "-tts", help="TTS model to use (openai or elevenlabs)"), + transcript_only: bool = typer.Option(False, "--transcript-only", help="Generate only a transcript without audio"), + conversation_config: str = typer.Option(None, "--conversation-config", "-cc", help="Path to custom conversation configuration YAML file"), + output_dir: str = typer.Option("./output", "--output-dir", "-o", help="Directory to save output files"), +): + """ + Generate a podcast or transcript from a list of URLs, a file containing URLs, or a transcript file. + """ + try: + config = load_config() + main_config = config.config.get('main', {}) + if tts_model is None: + tts_model = main_config.get('default_tts_model', 'openai') + + urls_list = urls or [] + if file: + urls_list.extend([line.strip() for line in file if line.strip()]) + + if not urls_list and not transcript: + raise typer.BadParameter( + "No URLs or transcript provided. Use --url to specify URLs, --file to specify a file containing URLs, or --transcript for a transcript file." 
+ ) + + podcast = process_links( + urls_list, + transcript_file=transcript.name if transcript else None, + tts_model=tts_model, + generate_audio=not transcript_only, + config=config, + conversation_config=conversation_config + ) + + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + if transcript_only: + transcript_file = output_dir / f"transcript_{uuid.uuid4().hex}.txt" + podcast.export_transcript(str(transcript_file)) + typer.echo(f"Transcript generated successfully: {transcript_file}") + else: + audio_file = output_dir / f"podcast_{uuid.uuid4().hex}.mp3" + podcast.save(str(audio_file)) + transcript_file = output_dir / f"transcript_{uuid.uuid4().hex}.txt" + podcast.export_transcript(str(transcript_file)) + typer.echo(f"Podcast generated successfully using {tts_model} TTS model: {audio_file}") + typer.echo(f"Transcript saved to: {transcript_file}") + + except Exception as e: + typer.echo(f"An error occurred: {str(e)}", err=True) + raise typer.Exit(code=1) + +if __name__ == "__main__": + app() + +def generate_podcast( + urls: Optional[List[str]] = None, + url_file: Optional[str] = None, + transcript_file: Optional[str] = None, + tts_model: Optional[str] = None, + transcript_only: bool = False, + config: Optional[Dict[str, Any]] = None, + conversation_config: Optional[Dict[str, Any]] = None +) -> Podcast: + """ + Generate a podcast or transcript from a list of URLs, a file containing URLs, or a transcript file. + + Args: + urls (Optional[List[str]]): List of URLs to process. + url_file (Optional[str]): Path to a file containing URLs, one per line. + transcript_file (Optional[str]): Path to a transcript file. + tts_model (Optional[str]): TTS model to use ('openai' or 'elevenlabs'). + transcript_only (bool): Generate only a transcript without audio. Defaults to False. + config (Optional[Dict[str, Any]]): User-provided configuration dictionary. 
+ conversation_config (Optional[Dict[str, Any]]): User-provided conversation configuration dictionary. + + Returns: + Podcast: An instance of the Podcast class representing the generated podcast. + + Example: + >>> from podcastfy.client_v2 import generate_podcast + >>> podcast = generate_podcast( + ... urls=['https://example.com/article1', 'https://example.com/article2'], + ... tts_model='elevenlabs', + ... config={ + ... 'main': { + ... 'default_tts_model': 'elevenlabs' + ... }, + ... 'output_directories': { + ... 'audio': '/custom/path/to/audio', + ... 'transcripts': '/custom/path/to/transcripts' + ... } + ... }, + ... conversation_config={ + ... 'word_count': 150, + ... 'conversation_style': ['informal', 'friendly'], + ... 'podcast_name': 'My Custom Podcast' + ... } + ... ) + >>> podcast.save('/path/to/output.mp3') + >>> podcast.export_transcript('/path/to/transcript.txt') + """ + try: + default_config = load_config() + + if config: + if isinstance(config, dict): + updated_config = Config() + updated_config.configure(**config) + default_config = updated_config + elif isinstance(config, Config): + default_config = config + else: + raise ValueError("Config must be either a dictionary or a Config object") + + main_config = default_config.config.get('main', {}) + + if tts_model is None: + tts_model = main_config.get('default_tts_model', 'openai') + + urls_list = urls or [] + if url_file: + with open(url_file, 'r') as file: + urls_list.extend([line.strip() for line in file if line.strip()]) + + if not urls_list and not transcript_file: + raise ValueError( + "No URLs or transcript provided. Please provide either 'urls', 'url_file', or 'transcript_file'." 
+ ) + + podcast = process_links( + urls_list, + transcript_file=transcript_file, + tts_model=tts_model, + generate_audio=not transcript_only, + config=default_config, + conversation_config=conversation_config + ) + + return podcast + + except Exception as e: + logger.error(f"An error occurred: {str(e)}") + raise \ No newline at end of file diff --git a/podcastfy/core/__init__.py b/podcastfy/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/podcastfy/core/audio.py b/podcastfy/core/audio.py new file mode 100644 index 00000000..7d4c383e --- /dev/null +++ b/podcastfy/core/audio.py @@ -0,0 +1,91 @@ +import asyncio +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from typing import Optional, Dict, Union, List, cast + +from pydub import AudioSegment as PydubAudioSegment + +from podcastfy.core.podcast import SyncTTSBackend, AsyncTTSBackend +from podcastfy.core.transcript import TranscriptSegment, Transcript + + +class AudioSegment: + """Represents an audio segment of the podcast.""" + + def __init__(self, filepath: Path, length_ms: int, transcript_segment: Optional[TranscriptSegment] = None) -> None: + self.filepath = filepath + self.length_ms = length_ms + self.transcript_segment = transcript_segment + self._audio: Optional[PydubAudioSegment] = None + + @property + def audio(self) -> PydubAudioSegment: + """Lazy-load the audio segment.""" + if self._audio is None: + self._audio = PydubAudioSegment.from_file(self.filepath) + if len(self._audio) != self.length_ms: + raise ValueError( + f"Audio file length ({len(self._audio)}ms) does not match specified length ({self.length_ms}ms)") + return self._audio + + +class AudioManager: + def __init__(self, tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]], n_jobs: int = 1) -> None: + self.tts_backends = tts_backends + self.n_jobs = n_jobs + self.audio_segments = [] + self.final_audio: Optional[PydubAudioSegment] = None + self.temp_dir: Optional[Union[str, Path]] = 
None + + async def _async_build_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: + async def process_segment(segment: TranscriptSegment): + tts_backend = self.get_tts_backend(segment) + audio_file = await cast(AsyncTTSBackend, tts_backend).async_text_to_speech( + segment.text, + segment.speaker, + Path(self.temp_dir) / f"{segment.speaker.name}_{len(self.audio_segments)}.mp3" + ) + return AudioSegment(audio_file, len(PydubAudioSegment.from_file(str(audio_file))), segment) + + semaphore = asyncio.Semaphore(self.n_jobs) + + async def bounded_process_segment(segment): + async with semaphore: + return await process_segment(segment) + + tasks = [asyncio.create_task(bounded_process_segment(segment)) for segment in transcript.segments] + return list(await asyncio.gather(*tasks)) + + def get_tts_backend(self, segment): + if segment.speaker.preferred_tts is None: + # take the first available TTS backend + tts_backend = next(iter(self.tts_backends.values())) + else: + tts_backend = self.tts_backends[segment.speaker.preferred_tts] + # ensure the preferred TTS backend is available + if tts_backend is None: + raise ValueError(f"Preferred TTS backend '{segment.speaker.preferred_tts}' is not available for character '{segment.speaker.name}'") + return tts_backend + + def _sync_build_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: + def process_segment(segment: TranscriptSegment): + tts_backend = self.get_tts_backend(segment) + audio_file = cast(SyncTTSBackend, tts_backend).text_to_speech( + segment.text, + segment.speaker, + Path(str(self.temp_dir)) / f"{segment.speaker.name}_{len(self.audio_segments)}.mp3" + ) + return AudioSegment(audio_file, len(PydubAudioSegment.from_file(str(audio_file))), segment) + + + with ThreadPoolExecutor(max_workers=self.n_jobs) as executor: + return list(executor.map(process_segment, transcript.segments)) + + def create_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: + if 
all(isinstance(backend, AsyncTTSBackend) for backend in self.tts_backends.values()): + return asyncio.run(self._async_build_audio_segments(transcript)) + else: + return self._sync_build_audio_segments(transcript) + + def stitch_audio_segments(self) -> None: + self.final_audio = sum([segment.audio for segment in self.audio_segments]) diff --git a/podcastfy/character.py b/podcastfy/core/character.py similarity index 73% rename from podcastfy/character.py rename to podcastfy/core/character.py index f225ae4e..ad6cdc22 100644 --- a/podcastfy/character.py +++ b/podcastfy/core/character.py @@ -1,23 +1,18 @@ -from typing import Dict, Any, Optional +from typing import Dict, Optional -from pydantic import BaseModel +from podcastfy.core.tts_configs import TTSConfig -class VoiceConfig(BaseModel): - voice: str - extra_args: Dict[str, Any] - -class TTSConfig(VoiceConfig): - backend: str class Character: """Represents a character in the podcast.""" - def __init__(self, name: str, role: str, tts_configs: Dict[str, TTSConfig] = {}, default_description_for_llm: str = ""): + def __init__(self, name: str, role: str, tts_configs: Dict[str, TTSConfig] = {}, + default_description_for_llm: str = ""): self.name = name self.role = role self.tts_configs = tts_configs self.default_description_for_llm = default_description_for_llm - self.preferred_tts = next(iter(tts_configs.keys())) # Set first TTS as default + self.preferred_tts = next(iter(tts_configs.keys()), None) # Set first TTS as default, can be None def set_preferred_tts(self, tts_name: str): if tts_name not in self.tts_configs: @@ -26,6 +21,7 @@ def set_preferred_tts(self, tts_name: str): def to_prompt(self) -> str: """Convert the character information to a prompt for the LLM.""" + #TODO: could be improved by adding more information than roles return f"Character: {self.name}\nRole: {self.role}\n{self.default_description_for_llm.format(name=self.name)}" def get_tts_args(self, tts_name: Optional[str] = None) -> TTSConfig: diff --git 
a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py index 646fbdbf..06a5e47c 100644 --- a/podcastfy/core/podcast.py +++ b/podcastfy/core/podcast.py @@ -1,15 +1,19 @@ -import logging -from abc import ABC, abstractmethod from enum import Enum from pathlib import Path -from typing import List, Optional, Dict, Any, Callable, Tuple, Union, Sequence, Type, NamedTuple +from typing import List, Optional, Dict, Any, Callable, Tuple, Union, Sequence, cast +from tempfile import TemporaryDirectory +import atexit from pydub import AudioSegment as PydubAudioSegment from functools import wraps -import asyncio -from concurrent.futures import ThreadPoolExecutor, as_completed from contextlib import contextmanager -from podcastfy.character import Character, TTSConfig +from podcastfy.aiengines.llm.base import LLMBackend +from podcastfy.aiengines.tts.base import SyncTTSBackend, AsyncTTSBackend +from podcastfy.core.audio import AudioSegment, AudioManager +from podcastfy.core.character import Character +from podcastfy.core.transcript import TranscriptSegment, Transcript +from podcastfy.core.tts_configs import TTSConfig + class PodcastState(Enum): """Enum representing the different states of a podcast during creation.""" @@ -19,170 +23,26 @@ class PodcastState(Enum): STITCHED = 3 # Final state after all audio segments have been combined -class LLMBackend(ABC): - """Abstract base class for Language Model backends.""" - - @abstractmethod - def generate_text(self, prompt: str, characters: List['Character']) -> List[Tuple[Character, str]]: - """ - Generate text based on a given prompt. - - Args: - prompt (str): The input prompt for text generation. - - Returns: - List[Tuple[str, str]]: A list of tuples containing speaker and text. - """ - pass - - -class SyncTTSBackend(ABC): - """Protocol for synchronous Text-to-Speech backends.""" - - name: str - - @abstractmethod - def text_to_speech(self, text: str, character: 'Character') -> Path: - """ - Convert text to speech synchronously. 
- - Args: - text (str): The text to convert to speech. - character (Character): The character for which to generate speech. - - Returns: - Path: Path to the generated audio file. - """ - pass - - -class AsyncTTSBackend(ABC): - """Protocol for asynchronous Text-to-Speech backends.""" - - name: str - - @abstractmethod - async def async_text_to_speech(self, text: str, character: 'Character') -> Path: - """ - Convert text to speech asynchronously. - - Args: - text (str): The text to convert to speech. - character (Character): The character for which to generate speech. - - Returns: - Path: Path to the generated audio file. - """ - pass - - -class TranscriptSegment: - """Represents a segment of the podcast transcript.""" - - def __init__(self, text: str, speaker: Character, tts_args: Optional[Dict[str, Any]] = None): - self.text = text - self.speaker = speaker - self.tts_args = tts_args or {} - - -class Transcript: - """Represents the full transcript of a podcast.""" - - def __init__(self, segments: List[TranscriptSegment], metadata: Dict[str, Any]): - self.segments = segments - self.metadata = metadata - - def save(self, filepath: str, format: str = "plaintext"): - """Save the transcript to a file.""" - with open(filepath, 'w') as f: - f.write(str(self)) - - def __str__(self) -> str: - """Convert the transcript to a string representation.""" - lines = [] - for segment in self.segments: - lines.append(f"{segment.speaker.name}: {segment.text}") - - metadata_str = "\n".join([f"{key}: {value}" for key, value in self.metadata.items()]) - - return f"Metadata:\n{metadata_str}\n\nTranscript:\n" + "\n".join(lines) - - -class AudioSegment: - """Represents an audio segment of the podcast.""" - - def __init__(self, filepath: Path, length_ms: int, transcript_segment: Optional[TranscriptSegment] = None): - self.filepath = filepath - self.length_ms = length_ms - self.transcript_segment = transcript_segment - self._audio: Optional[PydubAudioSegment] = None - - @property - def 
audio(self) -> PydubAudioSegment: - """Lazy-load the audio segment.""" - if self._audio is None: - self._audio = PydubAudioSegment.from_file(self.filepath) - if len(self._audio) != self.length_ms: - raise ValueError( - f"Audio file length ({len(self._audio)}ms) does not match specified length ({self.length_ms}ms)") - return self._audio - - -class AudioManager: - def __init__(self, tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]], n_jobs: int = 1): - self.tts_backends = tts_backends - self.n_jobs = n_jobs - self.audio_segments = [] - self.final_audio = None - - async def _async_build_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: - async def process_segment(segment: TranscriptSegment): - tts_backend = self.tts_backends[segment.speaker.preferred_tts] - audio_file = await tts_backend.async_text_to_speech(segment.text, segment.speaker) - return AudioSegment(audio_file, len(PydubAudioSegment.from_file(audio_file)), segment) - - semaphore = asyncio.Semaphore(self.n_jobs) - - async def bounded_process_segment(segment): - async with semaphore: - return await process_segment(segment) - - tasks = [asyncio.create_task(bounded_process_segment(segment)) for segment in transcript.segments] - return await asyncio.gather(*tasks) - - def _sync_build_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: - def process_segment(segment: TranscriptSegment): - tts_backend = self.tts_backends[segment.speaker.preferred_tts] - audio_file = tts_backend.text_to_speech(segment.text, segment.speaker) - return AudioSegment(audio_file, len(PydubAudioSegment.from_file(audio_file)), segment) - - with ThreadPoolExecutor(max_workers=self.n_jobs) as executor: - return list(executor.map(process_segment, transcript.segments)) - - def create_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: - if any(isinstance(backend, AsyncTTSBackend) for backend in self.tts_backends.values()): - return 
asyncio.run(self._async_build_audio_segments(transcript)) - else: - return self._sync_build_audio_segments(transcript) - - def stitch_audio_segments(self): - self.final_audio = sum([segment.audio for segment in self.audio_segments]) - - def podcast_stage(func): """Decorator to manage podcast stage transitions.""" + @wraps(func) + def probably_same_func(method, func): + return method.__func__.__name__ == func.__name__ + @wraps(func) def wrapper(self, *args, **kwargs): current_method = self._next_stage_methods[self.state] - if current_method != func and not self._reworking: + print(f"Executing {func.__name__} in state {self.state.name}") + if not probably_same_func(current_method, func) and not self._reworking: print(f"Cannot execute {func.__name__} in current state {self.state.name}. Skipping.") - return + raise Exception(f"Cannot execute {func.__name__} in current state {self.state.name}") try: result = func(self, *args, **kwargs) - next_state = next((state for state, method in self._next_stage_methods.items() if method == func), None) + next_state = PodcastState(self.state.value + 1) self.state = next_state or self.state + print(f"Transitioned to state {self.state.name}") return result except Exception as e: print(f"Error in {func.__name__}: {str(e)}") @@ -195,15 +55,18 @@ class Podcast: """Main class for podcast creation and management.""" def __init__(self, content: str, llm_backend: LLMBackend, - tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], - characters: List[Character], default_tts_n_jobs: int = 1): + tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], audio_temp_dir: Optional[Union[str, Path]] = None, + characters: Optional[List[Character]] = None, + default_tts_n_jobs: int = 1) -> None: """ Initialize a new Podcast instance. Args: content (str): The raw content to be processed into a podcast. llm_backend (LLMBackend): The language model backend for generating the transcript. 
- tts_backends (Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]]): Dictionary of available TTS backends. + tts_backends (List[Union[SyncTTSBackend, AsyncTTSBackend]]): List of available TTS backends. + audio_temp_dir (Optional[str]): Path to a temporary directory for audio files. If None, a temporary + directory will be created. characters (List[Character]): List of characters participating in the podcast. default_tts_n_jobs (int, optional): The default number of concurrent jobs for TTS processing. Defaults to 1. @@ -213,17 +76,25 @@ def __init__(self, content: str, llm_backend: LLMBackend, """ self.content = content self.llm_backend = llm_backend - self.tts_backends = {backend.name: backend for backend in tts_backends} - self.characters = {char.name: char for char in characters} + self.tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]] = {backend.name: backend for backend in tts_backends} + self.characters: Dict[str, Character] = {char.name: char for char in (characters or [Character("Host", "Podcast host", {}), Character("Guest", "Expert guest", {})])} self.default_tts_n_jobs = default_tts_n_jobs self.state = PodcastState.INITIALIZED self._reworking = False + + if audio_temp_dir: + self.temp_dir = Path(audio_temp_dir) + else: + self._temp_dir = TemporaryDirectory() + self.temp_dir = Path(self._temp_dir.name) + atexit.register(self._temp_dir.cleanup) self.audio_manager = AudioManager(self.tts_backends, self.default_tts_n_jobs) + self.audio_manager.temp_dir = self.temp_dir # Initialize attributes with null values - self.transcript = None - self.audio_segments = [] - self.audio = None + self.transcript: Optional[Transcript] = None + self.audio_segments: List[AudioSegment] = [] + self.audio: Optional[PydubAudioSegment] = None # Define the sequence of methods to be called for each stage self._next_stage_methods: Dict[PodcastState, Callable[[], None]] = { @@ -232,10 +103,14 @@ def __init__(self, content: str, llm_backend: LLMBackend, 
PodcastState.AUDIO_SEGMENTS_BUILT: self.stitch_audio_segments, } + def __del__(self) -> None: + if hasattr(self, '_temp_dir'): + self._temp_dir.cleanup() + @classmethod def from_transcript(cls, transcript: Union[Sequence[Tuple[str, str]], Transcript], - tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], characters: List[Character], - default_tts_n_jobs: int = 1) -> 'Podcast': + tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], + characters: List[Character], default_tts_n_jobs: int = 1) -> 'Podcast': """ Create a Podcast instance from a pre-existing transcript. @@ -249,16 +124,16 @@ def from_transcript(cls, transcript: Union[Sequence[Tuple[str, str]], Transcript Returns: Podcast: A new Podcast instance with the transcript built and ready for audio generation. """ - podcast = cls("", None, list(tts_backends.values()), characters, default_tts_n_jobs=default_tts_n_jobs) if isinstance(transcript, Transcript): + podcast = cls("", cast(LLMBackend, None), tts_backends, characters=characters, default_tts_n_jobs=default_tts_n_jobs) podcast.transcript = transcript else: raise ValueError("Transcript must be a Transcript instance") # unimplemented podcast.state = PodcastState.TRANSCRIPT_BUILT return podcast - def reset_to_state(self, state: PodcastState): - """Reset the podcast to a specific state.""" + def reset_to_state(self, state: PodcastState) -> None: + """Reset the podcast to a specific state. 
""" self.state = state self.transcript = None if state.value < PodcastState.TRANSCRIPT_BUILT.value else self.transcript self.audio_segments = [] if state.value < PodcastState.AUDIO_SEGMENTS_BUILT.value else self.audio_segments @@ -270,6 +145,12 @@ def rework(self, target_state: PodcastState, auto_finalize: bool = True): original_state = self.state self._reworking = True + if target_state == PodcastState.INITIALIZED and self.llm_backend is None: + raise ValueError("Cannot rewind to INITIALIZED state without an LLM backend.") + + if target_state.value < PodcastState.TRANSCRIPT_BUILT.value and self.llm_backend is None: + raise ValueError("Cannot rewind past TRANSCRIPT_BUILT state without an LLM backend.") + if target_state.value < self.state.value: print(f"Rewinding from {self.state.name} to {target_state.name}") self.reset_to_state(target_state) @@ -289,17 +170,27 @@ def build_transcript(self) -> None: """Build the podcast transcript using the LLM backend.""" character_prompts = "\n\n".join([char.to_prompt() for char in self.characters.values()]) full_prompt = f"{self.content}\n\nCharacters:\n{character_prompts}" - generated_segments = self.llm_backend.generate_text(full_prompt, list(self.characters.values())) + generated_segments = self.llm_backend.generate_transcript(full_prompt, list(self.characters.values())) - segments = [TranscriptSegment(text, speaker, self.characters[speaker]) - for speaker, text in generated_segments if speaker in self.characters] + segments = [] + for segment in generated_segments: + if isinstance(segment, tuple) and len(segment) == 2: + speaker, text = segment + if speaker.name in self.characters: + tts_config = cast(Dict[str, Any], self.characters[speaker.name].tts_configs.get(self.characters[speaker.name].preferred_tts, {})) + segments.append(TranscriptSegment(text, self.characters[speaker.name], tts_config)) + # If the segment doesn't match the expected format, we'll skip it self.transcript = Transcript(segments, {"source": "Generated 
content"}) @podcast_stage - def build_audio_segments(self, n_jobs: Optional[int] = None) -> None: + def build_audio_segments(self) -> None: """Build audio segments from the transcript.""" - self.audio_segments = self.audio_manager.create_audio_segments(self.transcript) + if self.transcript is not None: + self.audio_segments = self.audio_manager.create_audio_segments(self.transcript) + else: + print("Error: Transcript is None") + raise ValueError("Transcript must be built before creating audio segments") @podcast_stage def stitch_audio_segments(self) -> None: @@ -330,16 +221,34 @@ def save(self, filepath: str) -> None: else: raise ValueError("No stitched audio to save") - def save_transcript(self, filepath: str, format: str = "plaintext") -> None: + def export_transcript(self, filepath: str, format_: str = "plaintext") -> None: """Save the podcast transcript to a file.""" if self.state < PodcastState.TRANSCRIPT_BUILT: raise ValueError("Transcript can only be saved after it is built") if self.transcript: - self.transcript.save(filepath, format) + self.transcript.export(filepath, format_) else: raise ValueError("No transcript to save") + def dump_transcript(self, filepath: str) -> None: + """Dump the podcast transcript to a JSON file.""" + if self.state < PodcastState.TRANSCRIPT_BUILT: + raise ValueError("Transcript can only be dumped after it is built") + + if self.transcript: + self.transcript.dump(filepath) + else: + raise ValueError("No transcript to dump") + + @classmethod + def load_transcript(cls, filepath: str, tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], + characters: List[Character]) -> 'Podcast': + """Load a podcast from a transcript JSON file.""" + character_dict = {char.name: char for char in characters} + transcript = Transcript.load(filepath, character_dict) + return cls.from_transcript(transcript, tts_backends, characters) + # Usage example: Step-by-step podcast creation if __name__ == "__main__": @@ -347,18 +256,18 @@ def 
save_transcript(self, filepath: str, format: str = "plaintext") -> None: class DummyLLMBackend(LLMBackend): - def generate_text(self, prompt: str, characters: List[Character]) -> List[Tuple[str, str]]: - return [("Host", "Welcome to our podcast!"), ("Guest", "Thanks for having me!")] + def generate_text(self, prompt: str, characters: List[Character]) -> List[Tuple[Character, str]]: + return [(characters[0], "Welcome to our podcast!"), (characters[1], "Thanks for having me!")] class DummyTTSBackend(SyncTTSBackend): def __init__(self, name: str): self.name = name - def text_to_speech(self, text: str, character: Character) -> Path: - with NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file: - PydubAudioSegment.silent(duration=1000).export(temp_file.name, format="mp3") - return Path(temp_file.name) + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + audio = PydubAudioSegment.silent(duration=1000) + audio.export(str(output_path), format="mp3") + return output_path # Define TTS backends @@ -366,22 +275,23 @@ def text_to_speech(self, text: str, character: Character) -> Path: elevenlabs_tts = DummyTTSBackend("elevenlabs") # Define TTS backends - - # Define characters host = Character( name="Host", role="Podcast host", tts_configs={ - "openai": {"voice": "en-US-Neural2-F", "backend": "openai", "extra_args": {"speaking_rate": 1.0}}, - "elevenlabs": {"voice": "Rachel", "backend": "elevenlabs", "extra_args": {"stability": 0.5}} + "openai": TTSConfig(voice="en-US-Neural2-F", backend="openai", extra_args={"speaking_rate": 1.0}), + "elevenlabs": TTSConfig(voice="Rachel", backend="elevenlabs", extra_args={"stability": 0.5}) }, default_description_for_llm="{name} is an enthusiastic podcast host. Speaks clearly and engagingly." 
) + guest = Character( name="Guest", role="Expert guest", - tts_configs={"openai": {"voice": "en-US-Neural2-D", "backend": "openai", "extra_args": {"pitch": -2.0}}, - "elevenlabs": {"voice": "Antoni", "backend": "elevenlabs", "extra_args": {"stability": 0.8}}}, + tts_configs={ + "openai": TTSConfig(voice="en-US-Neural2-D", backend="openai", extra_args={"pitch": -2.0}), + "elevenlabs": TTSConfig(voice="Antoni", backend="elevenlabs", extra_args={"stability": 0.8}) + }, default_description_for_llm="{name} is an expert guest. Shares knowledge in a friendly manner." ) @@ -415,7 +325,7 @@ def text_to_speech(self, text: str, character: Character) -> Path: with podcast.rework(PodcastState.TRANSCRIPT_BUILT): print(f"Inside rework context, state: {podcast.state}") podcast.transcript.segments.append( - TranscriptSegment("This is a new segment", "Host", podcast.characters["Host"])) + TranscriptSegment("This is a new segment", podcast.characters["Host"])) print("Added new segment to transcript") # Rebuild audio segments and stitch @@ -429,12 +339,12 @@ def text_to_speech(self, text: str, character: Character) -> Path: with podcast.rework(PodcastState.AUDIO_SEGMENTS_BUILT): new_segment = AudioSegment(Path(temp_file.name), 500, - TranscriptSegment("New audio segment", "Host", podcast.characters["Host"])) + TranscriptSegment("New audio segment", podcast.characters["Host"])) podcast.audio_segments.insert(0, new_segment) # Save the final podcast podcast.save("./final.mp3") - podcast.save_transcript("./final.txt", format="plaintext") + podcast.export_transcript("./final.txt", format_="plaintext") print("Saved podcast and transcript") # Example with pre-existing transcript using from_transcript class method diff --git a/podcastfy/core/transcript.py b/podcastfy/core/transcript.py new file mode 100644 index 00000000..952fa2be --- /dev/null +++ b/podcastfy/core/transcript.py @@ -0,0 +1,95 @@ +import json +import re +from typing import Optional, Dict, Any, List, Tuple + +from 
podcastfy.core.character import Character + + +class TranscriptSegment: + def __init__(self, text: str, speaker: Character, tts_args: Optional[Dict[str, Any]] = None) -> None: + self.text = text + self.speaker = speaker + self.tts_args = tts_args or {} + + def to_dict(self) -> Dict[str, Any]: + return { + "text": self.text, + "speaker": self.speaker.name, + "tts_args": self.tts_args + } + + @classmethod + def from_dict(cls, data: Dict[str, Any], characters: Dict[str, Character]) -> 'TranscriptSegment': + return cls( + text=data['text'], + speaker=characters[data['speaker']], + tts_args=data.get('tts_args', {}) + ) + + +class Transcript: + def __init__(self, segments: List[TranscriptSegment], metadata: Dict[str, Any]) -> None: + self.segments = segments + self.metadata = metadata + + def export(self, filepath: str, format_: str = "plaintext") -> None: + """Export the transcript to a file.""" + with open(filepath, 'w') as f: + if format_ == "plaintext": + f.write(str(self)) + elif format_ == "json": + json.dump(self.to_dict(), f, indent=2) + else: + raise ValueError(f"Unsupported format: {format_}") + + def dump(self, filepath: str) -> None: + """Dump the transcript to a JSON file.""" + with open(filepath, 'w') as f: + json.dump(self.to_dict(), f, indent=2) + + @staticmethod + def _parse_legacy_transcript(content: str) -> List[Tuple[str, str]]: + pattern = r'\s*(.*?)\s*' + matches = re.findall(pattern, content, re.DOTALL) + return [('Person' + person_num, text) for person_num, text in matches] + + @classmethod + def load(cls, filepath: str, characters: Dict[str, Character]) -> 'Transcript': + """Load a transcript from a JSON file.""" + with open(filepath, 'r') as f: + content = f.read() + + try: + data = json.loads(content) + segments = [TranscriptSegment.from_dict(seg, characters) for seg in data['segments']] + except json.JSONDecodeError: + # If JSON parsing fails, assume it's a legacy transcript + parsed_content = cls._parse_legacy_transcript(content) + segments = 
[] + for speaker, text in parsed_content: + if speaker in characters: + character = characters[speaker] + else: + # Create a new character if it doesn't exist + character = Character(speaker, f"Character {speaker}", {}) + characters[speaker] = character + segments.append(TranscriptSegment(text, character)) + + data = {'segments': segments, 'metadata': {}} + return cls(segments, data['metadata']) + + def to_dict(self) -> Dict[str, Any]: + return { + "segments": [segment.to_dict() for segment in self.segments], + "metadata": self.metadata + } + + def __str__(self) -> str: + """Convert the transcript to a string representation.""" + lines = [] + for segment in self.segments: + lines.append(f"{segment.speaker.name}: {segment.text}") + + metadata_str = "\n".join([f"{key}: {value}" for key, value in self.metadata.items()]) + + return f"Metadata:\n{metadata_str}\n\nTranscript:\n" + "\n".join(lines) diff --git a/podcastfy/core/tts_configs.py b/podcastfy/core/tts_configs.py new file mode 100644 index 00000000..c46ed25c --- /dev/null +++ b/podcastfy/core/tts_configs.py @@ -0,0 +1,12 @@ +from typing import Dict, Any + +from pydantic import BaseModel + + +class VoiceConfig(BaseModel): + voice: str + extra_args: Dict[str, Any] = {} + + +class TTSConfig(VoiceConfig): + backend: str diff --git a/podcastfy/tts_backends.py b/podcastfy/tts_backends.py deleted file mode 100644 index dc53859a..00000000 --- a/podcastfy/tts_backends.py +++ /dev/null @@ -1,135 +0,0 @@ -import os -import uuid -from abc import ABC, abstractmethod -from pathlib import Path -from tempfile import TemporaryFile, TemporaryDirectory -from typing import Dict, Any, List, ClassVar -import asyncio - -import openai - -from podcastfy.character import Character, VoiceConfig -import edge_tts -from elevenlabs import client as elevenlabs_client - -class TTSBackend(ABC): - name: ClassVar[str] = "" - default_voices: ClassVar[List[VoiceConfig]] = [] - - @classmethod - def set_default_voices(cls, voices: List[VoiceConfig]): - 
""" - Set the default voices for the TTS backend. - """ - cls.default_voices = voices - - @abstractmethod - def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: - """ - Convert text to speech. - - Args: - text (str): The text to convert to speech. - character (Character): The character for which to generate speech. - output_path (Path): The path where the audio file should be saved. - - Returns: - Path: Path to the generated audio file (same as output_path). - """ - pass - -class ElevenLabsTTS(TTSBackend): - name: str = "elevenlabs" - - def __init__(self, api_key: str = None): - self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY") - - def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: - # TODO, would be nicer to get a filepath directly from the client - config = character.get_tts_args('elevenlabs') - client = elevenlabs_client.ElevenLabs(api_key=self.api_key) # # client could be reused - content = client.generate( - text=text, - voice=config.voice, - model=config.extra_args.get('model', 'default') - ) - with open(output_path, "wb") as out: - for chunk in content: - if chunk: - out.write(chunk) - return output_path - -class OpenAITTS(TTSBackend): - name: str = "openai" - def __init__(self, api_key: str): - self.api_key = api_key or os.getenv("OPENAI_API_KEY") - - def ensure_characters_tts_config_is_valid(self, character:Character) -> None: - # TODO: maybe that should be in the ABC class - tts_config = character.tts_configs.get('openai') - if not tts_config: - raise ValueError(f"Character '{character.name}' does not have OpenAI TTS configuration") - # ensure there is a key model in the extra_args - if 'model' not in tts_config.extra_args: - raise ValueError(f"Character '{character.name}' does not have the 'model' key in the OpenAI TTS configuration") - - - def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: - # TODO, would be nicer to get a filepath directly 
from the client. If not given takes tempdir from the config ? - self.ensure_characters_tts_config_is_valid(character) - # Placeholder for OpenAI TTS implementation - config = character.get_tts_args('openai') - - print(f"OpenAI TTS: Converting text to speech for character {character.name} with voice {config.voice}") - response = openai.audio.speech.create( - model=config.extra_args["model"], - voice=config.voice, - input=text - ) - with open(output_path, "wb") as file: - file.write(response.content) - return output_path - -class EdgeTTS(TTSBackend): - name: str = "edge-tts" - - - def __init__(self): - pass - - def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: - config = character.get_tts_args('edge-tts') - - async def edge_tts_conversion(text: str, output_path: str, voice: str): - communicate = edge_tts.Communicate(text, voice) - await communicate.save(output_path) - - asyncio.run(edge_tts_conversion(text, str(output_path), config.voice)) - - return output_path - - - def ensure_characters_tts_config_is_valid(self, character: Character) -> None: - tts_config = character.tts_configs.get('edge-tts') - if not tts_config: - raise ValueError(f"Character '{character.name}' does not have Edge TTS configuration") - -# Example usage: -if __name__ == "__main__": - from podcastfy.utils.config import load_config - - config = load_config() - elevenlabs_tts = ElevenLabsTTS(config.ELEVENLABS_API_KEY, config.get('text_to_speech', {}).get('elevenlabs', {})) - openai_tts = OpenAITTS(config.OPENAI_API_KEY, config.get('text_to_speech', {}).get('openai', {})) - # edge_tts = EdgeTTS() - - dummy_character = Character("John", "host", { - 'elevenlabs': {'voice': 'en-US-JohnNeural'}, - 'openai': {'voice': 'en-US-Neural2-C'}, - 'edge-tts': {'voice': 'en-US-ChristopherNeural'} - }, "A friendly podcast host") - - output_dir = Path("output") - output_dir.mkdir(exist_ok=True) - output_path = output_dir / f"{dummy_character.name}_{uuid.uuid4().hex}.mp3" - 
elevenlabs_tts.text_to_speech("Hello, welcome to the podcast!", dummy_character, output_path) From 7b625c5614e12e2499f2521d70c6178fc00fa27c Mon Sep 17 00:00:00 2001 From: bruno Date: Tue, 15 Oct 2024 16:09:54 +0200 Subject: [PATCH 05/49] black and one renaming --- ...emini_langchain.py => gemini_langchain.py} | 0 podcastfy/client.py | 246 ++++---- podcastfy/client_v2.py | 100 +++- podcastfy/core/audio.py | 2 +- podcastfy/text_to_speech.py | 523 +++++++++--------- tests/test_content_parser.py | 2 +- tests/test_genai_podcast.py | 26 +- tests/test_generate_podcast.py | 173 +++--- 8 files changed, 586 insertions(+), 486 deletions(-) rename podcastfy/aiengines/llm/{legacy_gemini_langchain.py => gemini_langchain.py} (100%) diff --git a/podcastfy/aiengines/llm/legacy_gemini_langchain.py b/podcastfy/aiengines/llm/gemini_langchain.py similarity index 100% rename from podcastfy/aiengines/llm/legacy_gemini_langchain.py rename to podcastfy/aiengines/llm/gemini_langchain.py diff --git a/podcastfy/client.py b/podcastfy/client.py index b5b297cc..a1b6c727 100644 --- a/podcastfy/client.py +++ b/podcastfy/client.py @@ -12,7 +12,10 @@ from podcastfy.content_generator import ContentGenerator from podcastfy.text_to_speech import TextToSpeech from podcastfy.utils.config import Config, load_config -from podcastfy.utils.config_conversation import ConversationConfig, load_conversation_config +from podcastfy.utils.config_conversation import ( + ConversationConfig, + load_conversation_config, +) from podcastfy.utils.logger import setup_logger from typing import List, Optional, Dict, Any import copy @@ -23,8 +26,14 @@ app = typer.Typer() -def process_links(links, transcript_file=None, tts_model="openai", generate_audio=True, config=None, - conversation_config: Optional[Dict[str, Any]] = None): +def process_links( + links, + transcript_file=None, + tts_model="openai", + generate_audio=True, + config=None, + conversation_config: Optional[Dict[str, Any]] = None, +): """ Process a list of links 
or a transcript file to generate a podcast or transcript. @@ -49,8 +58,9 @@ def process_links(links, transcript_file=None, tts_model="openai", generate_audi else: logger.info(f"Processing {len(links)} links") content_extractor = ContentExtractor(config.JINA_API_KEY) - content_generator = ContentGenerator(api_key=config.GEMINI_API_KEY, - conversation_config=conversation_config) + content_generator = ContentGenerator( + api_key=config.GEMINI_API_KEY, conversation_config=conversation_config + ) # Extract content from links contents = [content_extractor.extract_content(link) for link in links] @@ -60,7 +70,9 @@ def process_links(links, transcript_file=None, tts_model="openai", generate_audi # Generate Q&A content random_filename = f"transcript_{uuid.uuid4().hex}.txt" - transcript_filepath = os.path.join(config.get('output_directories')['transcripts'], random_filename) + transcript_filepath = os.path.join( + config.get("output_directories")["transcripts"], random_filename + ) qa_content = content_generator.generate_qa_content( combined_content, output_filepath=transcript_filepath ) @@ -71,7 +83,9 @@ def process_links(links, transcript_file=None, tts_model="openai", generate_audi ) # Convert text to speech using the specified model random_filename = f"podcast_{uuid.uuid4().hex}.mp3" - audio_file = os.path.join(config.get('output_directories')['audio'], random_filename) + audio_file = os.path.join( + config.get("output_directories")["audio"], random_filename + ) text_to_speech.convert_to_speech(qa_content, audio_file) logger.info(f"Podcast generated successfully using {tts_model} TTS model") return audio_file @@ -100,8 +114,10 @@ def main( False, "--transcript-only", help="Generate only a transcript without audio" ), conversation_config: str = typer.Option( - None, "--conversation-config", "-cc", - help="Path to custom conversation configuration YAML file" + None, + "--conversation-config", + "-cc", + help="Path to custom conversation configuration YAML file", ), ): """ 
@@ -110,11 +126,10 @@ def main( try: config = load_config() - main_config = config.get('main', {}) + main_config = config.get("main", {}) # Use default TTS model from config if not specified if tts_model is None: - tts_model = main_config.get('default_tts_model', 'openai') - + tts_model = main_config.get("default_tts_model", "openai") if transcript: final_output = process_links( @@ -123,7 +138,7 @@ def main( tts_model=tts_model, generate_audio=not transcript_only, conversation_config=conversation_config, - config=config + config=config, ) else: urls_list = urls or [] @@ -136,11 +151,11 @@ def main( ) final_output = process_links( - urls_list, - tts_model=tts_model, + urls_list, + tts_model=tts_model, generate_audio=not transcript_only, config=config, - conversation_config=conversation_config + conversation_config=conversation_config, ) if transcript_only: @@ -157,106 +172,107 @@ def main( if __name__ == "__main__": app() - def generate_podcast( - urls: Optional[List[str]] = None, - url_file: Optional[str] = None, - transcript_file: Optional[str] = None, - tts_model: Optional[str] = None, - transcript_only: bool = False, - config: Optional[Dict[str, Any]] = None, - conversation_config: Optional[Dict[str, Any]] = None + urls: Optional[List[str]] = None, + url_file: Optional[str] = None, + transcript_file: Optional[str] = None, + tts_model: Optional[str] = None, + transcript_only: bool = False, + config: Optional[Dict[str, Any]] = None, + conversation_config: Optional[Dict[str, Any]] = None, ) -> Optional[str]: - """ - Generate a podcast or transcript from a list of URLs, a file containing URLs, or a transcript file. - - Args: - urls (Optional[List[str]]): List of URLs to process. - url_file (Optional[str]): Path to a file containing URLs, one per line. - transcript_file (Optional[str]): Path to a transcript file. - tts_model (Optional[str]): TTS model to use ('openai' or 'elevenlabs'). - transcript_only (bool): Generate only a transcript without audio. 
Defaults to False. - config (Optional[Dict[str, Any]]): User-provided configuration dictionary. - conversation_config (Optional[Dict[str, Any]]): User-provided conversation configuration dictionary. - - Returns: - Optional[str]: Path to the final podcast audio file, or None if only generating a transcript. - - Example: - >>> from podcastfy.client import generate_podcast - >>> result = generate_podcast( - ... urls=['https://example.com/article1', 'https://example.com/article2'], - ... tts_model='elevenlabs', - ... config={ - ... 'main': { - ... 'default_tts_model': 'elevenlabs' - ... }, - ... 'output_directories': { - ... 'audio': '/custom/path/to/audio', - ... 'transcripts': '/custom/path/to/transcripts' - ... } - ... }, - ... conversation_config={ - ... 'word_count': 150, - ... 'conversation_style': ['informal', 'friendly'], - ... 'podcast_name': 'My Custom Podcast' - ... } - ... ) - """ - try: - # Load default config - default_config = load_config() - - # Update config if provided - if config: - if isinstance(config, dict): - # Create a deep copy of the default config - updated_config = copy.deepcopy(default_config) - # Update the copy with user-provided values - updated_config.configure(**config) - default_config = updated_config - elif isinstance(config, Config): - # If it's already a Config object, use it directly - default_config = config - else: - raise ValueError("Config must be either a dictionary or a Config object") - - main_config = default_config.config.get('main', {}) - - # Use provided tts_model if specified, otherwise use the one from config - if tts_model is None: - tts_model = main_config.get('default_tts_model', 'openai') - - if transcript_file: - return process_links( - [], - transcript_file=transcript_file, - tts_model=tts_model, - generate_audio=not transcript_only, - config=default_config, - conversation_config=conversation_config - ) - else: - urls_list = urls or [] - if url_file: - with open(url_file, 'r') as file: - 
urls_list.extend([line.strip() for line in file if line.strip()]) - - if not urls_list: - raise ValueError( - "No URLs provided. Please provide either 'urls', 'url_file', or 'transcript_file'." - ) - - return process_links( - urls_list, - tts_model=tts_model, - generate_audio=not transcript_only, - config=default_config, - conversation_config=conversation_config - ) - - except Exception as e: - logger.error(f"An error occurred: {str(e)}") - raise \ No newline at end of file + """ + Generate a podcast or transcript from a list of URLs, a file containing URLs, or a transcript file. + + Args: + urls (Optional[List[str]]): List of URLs to process. + url_file (Optional[str]): Path to a file containing URLs, one per line. + transcript_file (Optional[str]): Path to a transcript file. + tts_model (Optional[str]): TTS model to use ('openai' or 'elevenlabs'). + transcript_only (bool): Generate only a transcript without audio. Defaults to False. + config (Optional[Dict[str, Any]]): User-provided configuration dictionary. + conversation_config (Optional[Dict[str, Any]]): User-provided conversation configuration dictionary. + + Returns: + Optional[str]: Path to the final podcast audio file, or None if only generating a transcript. + + Example: + >>> from podcastfy.client import generate_podcast + >>> result = generate_podcast( + ... urls=['https://example.com/article1', 'https://example.com/article2'], + ... tts_model='elevenlabs', + ... config={ + ... 'main': { + ... 'default_tts_model': 'elevenlabs' + ... }, + ... 'output_directories': { + ... 'audio': '/custom/path/to/audio', + ... 'transcripts': '/custom/path/to/transcripts' + ... } + ... }, + ... conversation_config={ + ... 'word_count': 150, + ... 'conversation_style': ['informal', 'friendly'], + ... 'podcast_name': 'My Custom Podcast' + ... } + ... 
) + """ + try: + # Load default config + default_config = load_config() + + # Update config if provided + if config: + if isinstance(config, dict): + # Create a deep copy of the default config + updated_config = copy.deepcopy(default_config) + # Update the copy with user-provided values + updated_config.configure(**config) + default_config = updated_config + elif isinstance(config, Config): + # If it's already a Config object, use it directly + default_config = config + else: + raise ValueError( + "Config must be either a dictionary or a Config object" + ) + + main_config = default_config.config.get("main", {}) + + # Use provided tts_model if specified, otherwise use the one from config + if tts_model is None: + tts_model = main_config.get("default_tts_model", "openai") + + if transcript_file: + return process_links( + [], + transcript_file=transcript_file, + tts_model=tts_model, + generate_audio=not transcript_only, + config=default_config, + conversation_config=conversation_config, + ) + else: + urls_list = urls or [] + if url_file: + with open(url_file, "r") as file: + urls_list.extend([line.strip() for line in file if line.strip()]) + + if not urls_list: + raise ValueError( + "No URLs provided. Please provide either 'urls', 'url_file', or 'transcript_file'." 
+ ) + + return process_links( + urls_list, + tts_model=tts_model, + generate_audio=not transcript_only, + config=default_config, + conversation_config=conversation_config, + ) + + except Exception as e: + logger.error(f"An error occurred: {str(e)}") + raise diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index 81fdd4fc..ea502d6d 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -4,7 +4,7 @@ from pathlib import Path from typing import List, Optional, Dict, Any, Union -from podcastfy.aiengines.llm.legacy_gemini_langchain import DefaultPodcastifyTranscriptEngine +from podcastfy.aiengines.llm.gemini_langchain import DefaultPodcastifyTranscriptEngine from podcastfy.aiengines.tts.tts_backends import OpenAITTS, ElevenLabsTTS, EdgeTTS from podcastfy.core.character import Character from podcastfy.core.podcast import Podcast, SyncTTSBackend, AsyncTTSBackend @@ -18,43 +18,62 @@ app = typer.Typer() + def create_characters(config: Dict[str, Any]) -> List[Character]: host = Character( name="Host", role="Podcast host", tts_configs={ - "openai": TTSConfig(voice=config["text_to_speech"]["openai"]["default_voices"]["question"], backend="openai"), - "elevenlabs": TTSConfig(voice=config["text_to_speech"]["elevenlabs"]["default_voices"]["question"], backend="elevenlabs"), + "openai": TTSConfig( + voice=config["text_to_speech"]["openai"]["default_voices"]["question"], + backend="openai", + ), + "elevenlabs": TTSConfig( + voice=config["text_to_speech"]["elevenlabs"]["default_voices"][ + "question" + ], + backend="elevenlabs", + ), }, - default_description_for_llm="{name} is an enthusiastic podcast host. Speaks clearly and engagingly." + default_description_for_llm="{name} is an enthusiastic podcast host. 
Speaks clearly and engagingly.", ) guest = Character( name="Guest", role="Expert guest", tts_configs={ - "openai": TTSConfig(voice=config["text_to_speech"]["openai"]["default_voices"]["answer"], backend="openai"), - "elevenlabs": TTSConfig(voice=config["text_to_speech"]["elevenlabs"]["default_voices"]["answer"], backend="elevenlabs"), + "openai": TTSConfig( + voice=config["text_to_speech"]["openai"]["default_voices"]["answer"], + backend="openai", + ), + "elevenlabs": TTSConfig( + voice=config["text_to_speech"]["elevenlabs"]["default_voices"][ + "answer" + ], + backend="elevenlabs", + ), }, - default_description_for_llm="{name} is an expert guest. Shares knowledge in a friendly manner." + default_description_for_llm="{name} is an expert guest. Shares knowledge in a friendly manner.", ) return [host, guest] + def create_tts_backends(config: Config) -> List[Union[SyncTTSBackend, AsyncTTSBackend]]: return [ OpenAITTS(api_key=config.OPENAI_API_KEY), ElevenLabsTTS(api_key=config.ELEVENLABS_API_KEY), - EdgeTTS() + EdgeTTS(), ] + def process_links( links: List[str], transcript_file: Optional[str] = None, tts_model: str = "openai", # could be removed now ? 
generate_audio: bool = True, config: Optional[Config] = None, - conversation_config: Optional[Dict[str, Any]] = None + conversation_config: Optional[Dict[str, Any]] = None, ) -> Podcast: if config is None: config = load_config() @@ -62,12 +81,16 @@ def process_links( tts_backends = create_tts_backends(config) if transcript_file: logger.info(f"Using transcript file: {transcript_file}") - transcript = Transcript.load(transcript_file, {char.name: char for char in characters}) + transcript = Transcript.load( + transcript_file, {char.name: char for char in characters} + ) podcast = Podcast.from_transcript(transcript, tts_backends, characters) else: logger.info(f"Processing {len(links)} links") content_extractor = ContentExtractor(config.JINA_API_KEY) - content_generator = DefaultPodcastifyTranscriptEngine(config.GEMINI_API_KEY, conversation_config) + content_generator = DefaultPodcastifyTranscriptEngine( + config.GEMINI_API_KEY, conversation_config + ) contents = [content_extractor.extract_content(link) for link in links] combined_content = "\n\n".join(contents) @@ -92,21 +115,36 @@ def process_links( @app.command() def main( urls: List[str] = typer.Option(None, "--url", "-u", help="URLs to process"), - file: typer.FileText = typer.Option(None, "--file", "-f", help="File containing URLs, one per line"), - transcript: typer.FileText = typer.Option(None, "--transcript", "-t", help="Path to a transcript file"), - tts_model: str = typer.Option(None, "--tts-model", "-tts", help="TTS model to use (openai or elevenlabs)"), - transcript_only: bool = typer.Option(False, "--transcript-only", help="Generate only a transcript without audio"), - conversation_config: str = typer.Option(None, "--conversation-config", "-cc", help="Path to custom conversation configuration YAML file"), - output_dir: str = typer.Option("./output", "--output-dir", "-o", help="Directory to save output files"), + file: typer.FileText = typer.Option( + None, "--file", "-f", help="File containing URLs, one 
per line" + ), + transcript: typer.FileText = typer.Option( + None, "--transcript", "-t", help="Path to a transcript file" + ), + tts_model: str = typer.Option( + None, "--tts-model", "-tts", help="TTS model to use (openai or elevenlabs)" + ), + transcript_only: bool = typer.Option( + False, "--transcript-only", help="Generate only a transcript without audio" + ), + conversation_config: str = typer.Option( + None, + "--conversation-config", + "-cc", + help="Path to custom conversation configuration YAML file", + ), + output_dir: str = typer.Option( + "./output", "--output-dir", "-o", help="Directory to save output files" + ), ): """ Generate a podcast or transcript from a list of URLs, a file containing URLs, or a transcript file. """ try: config = load_config() - main_config = config.config.get('main', {}) + main_config = config.config.get("main", {}) if tts_model is None: - tts_model = main_config.get('default_tts_model', 'openai') + tts_model = main_config.get("default_tts_model", "openai") urls_list = urls or [] if file: @@ -123,7 +161,7 @@ def main( tts_model=tts_model, generate_audio=not transcript_only, config=config, - conversation_config=conversation_config + conversation_config=conversation_config, ) output_dir = Path(output_dir) @@ -138,16 +176,20 @@ def main( podcast.save(str(audio_file)) transcript_file = output_dir / f"transcript_{uuid.uuid4().hex}.txt" podcast.export_transcript(str(transcript_file)) - typer.echo(f"Podcast generated successfully using {tts_model} TTS model: {audio_file}") + typer.echo( + f"Podcast generated successfully using {tts_model} TTS model: {audio_file}" + ) typer.echo(f"Transcript saved to: {transcript_file}") except Exception as e: typer.echo(f"An error occurred: {str(e)}", err=True) raise typer.Exit(code=1) + if __name__ == "__main__": app() + def generate_podcast( urls: Optional[List[str]] = None, url_file: Optional[str] = None, @@ -155,7 +197,7 @@ def generate_podcast( tts_model: Optional[str] = None, transcript_only: 
bool = False, config: Optional[Dict[str, Any]] = None, - conversation_config: Optional[Dict[str, Any]] = None + conversation_config: Optional[Dict[str, Any]] = None, ) -> Podcast: """ Generate a podcast or transcript from a list of URLs, a file containing URLs, or a transcript file. @@ -206,16 +248,18 @@ def generate_podcast( elif isinstance(config, Config): default_config = config else: - raise ValueError("Config must be either a dictionary or a Config object") + raise ValueError( + "Config must be either a dictionary or a Config object" + ) - main_config = default_config.config.get('main', {}) + main_config = default_config.config.get("main", {}) if tts_model is None: - tts_model = main_config.get('default_tts_model', 'openai') + tts_model = main_config.get("default_tts_model", "openai") urls_list = urls or [] if url_file: - with open(url_file, 'r') as file: + with open(url_file, "r") as file: urls_list.extend([line.strip() for line in file if line.strip()]) if not urls_list and not transcript_file: @@ -229,11 +273,11 @@ def generate_podcast( tts_model=tts_model, generate_audio=not transcript_only, config=default_config, - conversation_config=conversation_config + conversation_config=conversation_config, ) return podcast except Exception as e: logger.error(f"An error occurred: {str(e)}") - raise \ No newline at end of file + raise diff --git a/podcastfy/core/audio.py b/podcastfy/core/audio.py index 7d4c383e..9b422faf 100644 --- a/podcastfy/core/audio.py +++ b/podcastfy/core/audio.py @@ -30,7 +30,7 @@ def audio(self) -> PydubAudioSegment: class AudioManager: - def __init__(self, tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]], n_jobs: int = 1) -> None: + def __init__(self, tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]], n_jobs: int = 4) -> None: self.tts_backends = tts_backends self.n_jobs = n_jobs self.audio_segments = [] diff --git a/podcastfy/text_to_speech.py b/podcastfy/text_to_speech.py index 6e109b44..774e80b6 100644 --- 
a/podcastfy/text_to_speech.py +++ b/podcastfy/text_to_speech.py @@ -17,259 +17,280 @@ logger = logging.getLogger(__name__) + class TextToSpeech: - def __init__(self, model: str = 'openai', api_key: Optional[str] = None): - """ - Initialize the TextToSpeech class. - - Args: - model (str): The model to use for text-to-speech conversion. - Options are 'elevenlabs' or 'openai'. Defaults to 'openai'. - api_key (Optional[str]): API key for the selected text-to-speech service. - If not provided, it will be loaded from the config. - """ - self.model = model.lower() - self.config = load_config() - self.tts_config = self.config.get('text_to_speech') - - if self.model == 'elevenlabs': - self.api_key = api_key or self.config.ELEVENLABS_API_KEY - self.client = elevenlabs_client.ElevenLabs(api_key=self.api_key) - elif self.model == 'openai': - self.api_key = api_key or self.config.OPENAI_API_KEY - openai.api_key = self.api_key - else: - raise ValueError("Invalid model. Choose 'elevenlabs' or 'openai'.") - - self.audio_format = self.tts_config['audio_format'] - self.temp_audio_dir = self.tts_config['temp_audio_dir'] - self.ending_message = self.tts_config['ending_message'] - - # Create temp_audio_dir if it doesn't exist - if not os.path.exists(self.temp_audio_dir): - os.makedirs(self.temp_audio_dir) - - def __merge_audio_files(self, input_dir: str, output_file: str) -> None: - """ - Merge all audio files in the input directory sequentially and save the result. - - Args: - input_dir (str): Path to the directory containing audio files. - output_file (str): Path to save the merged audio file. 
- """ - try: - # Function to sort filenames naturally - def natural_sort_key(filename: str) -> List[Union[int, str]]: - return [int(text) if text.isdigit() else text for text in re.split(r'(\d+)', filename)] - - combined = AudioSegment.empty() - audio_files = sorted( - [f for f in os.listdir(input_dir) if f.endswith(f".{self.audio_format}")], - key=natural_sort_key - ) - for file in audio_files: - if file.endswith(f".{self.audio_format}"): - file_path = os.path.join(input_dir, file) - combined += AudioSegment.from_file(file_path, format=self.audio_format) - - combined.export(output_file, format=self.audio_format) - logger.info(f"Merged audio saved to {output_file}") - except Exception as e: - logger.error(f"Error merging audio files: {str(e)}") - raise - - def convert_to_speech(self, text: str, output_file: str) -> None: - """ - Convert input text to speech and save as an audio file. - - Args: - text (str): Input text to convert to speech. - output_file (str): Path to save the output audio file. - - Raises: - Exception: If there's an error in converting text to speech. 
- """ - # Clean TSS markup tags from the input text - cleaned_text = self.clean_tss_markup(text) - - if self.model == 'elevenlabs': - self.__convert_to_speech_elevenlabs(cleaned_text, output_file) - elif self.model == 'openai': - self.__convert_to_speech_openai(cleaned_text, output_file) - - def __convert_to_speech_elevenlabs(self, text: str, output_file: str) -> None: - try: - qa_pairs = self.split_qa(text) - audio_files = [] - counter = 0 - for question, answer in qa_pairs: - question_audio = self.client.generate( - text=question, - voice=self.tts_config['elevenlabs']['default_voices']['question'], - model=self.tts_config['elevenlabs']['model'] - ) - answer_audio = self.client.generate( - text=answer, - voice=self.tts_config['elevenlabs']['default_voices']['answer'], - model=self.tts_config['elevenlabs']['model'] - ) - - # Save question and answer audio chunks - for audio in [question_audio, answer_audio]: - counter += 1 - file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}" - with open(file_name, "wb") as out: - for chunk in audio: - if chunk: - out.write(chunk) - audio_files.append(file_name) - - # Merge all audio files and save the result - self.__merge_audio_files(self.temp_audio_dir, output_file) - - # Clean up individual audio files - for file in audio_files: - os.remove(file) - - logger.info(f"Audio saved to {output_file}") - - except Exception as e: - logger.error(f"Error converting text to speech with ElevenLabs: {str(e)}") - raise - - def __convert_to_speech_openai(self, text: str, output_file: str) -> None: - try: - qa_pairs = self.split_qa(text) - print(qa_pairs) - audio_files = [] - counter = 0 - for question, answer in qa_pairs: - for speaker, content in [ - (self.tts_config['openai']['default_voices']['question'], question), - (self.tts_config['openai']['default_voices']['answer'], answer) - ]: - counter += 1 - file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}" - response = openai.audio.speech.create( - 
model=self.tts_config['openai']['model'], - voice=speaker, - input=content - ) - with open(file_name, "wb") as file: - file.write(response.content) - - audio_files.append(file_name) - - # Merge all audio files and save the result - self.__merge_audio_files(self.temp_audio_dir, output_file) - - # Clean up individual audio files - for file in audio_files: - os.remove(file) - - logger.info(f"Audio saved to {output_file}") - - except Exception as e: - logger.error(f"Error converting text to speech with OpenAI: {str(e)}") - raise - - def split_qa(self, input_text: str) -> List[Tuple[str, str]]: - """ - Split the input text into question-answer pairs. - - Args: - input_text (str): The input text containing Person1 and Person2 dialogues. - - Returns: - List[Tuple[str, str]]: A list of tuples containing (Person1, Person2) dialogues. - """ - # Add ending message to the end of input_text - input_text += f"{self.ending_message}" - - # Regular expression pattern to match Person1 and Person2 dialogues - pattern = r'(.*?)\s*(.*?)' - - # Find all matches in the input text - matches = re.findall(pattern, input_text, re.DOTALL) - - # Process the matches to remove extra whitespace and newlines - processed_matches = [ - ( - ' '.join(person1.split()).strip(), - ' '.join(person2.split()).strip() - ) - for person1, person2 in matches - ] - return processed_matches - - def clean_tss_markup(self, input_text: str, additional_tags: List[str] = ["Person1", "Person2"]) -> str: - """ - Remove unsupported TSS markup tags from the input text while preserving supported SSML tags. - - Args: - input_text (str): The input text containing TSS markup tags. - additional_tags (List[str]): Optional list of additional tags to preserve. Defaults to ["Person1", "Person2"]. - - Returns: - str: Cleaned text with unsupported TSS markup tags removed. 
- """ - # List of SSML tags supported by both OpenAI and ElevenLabs - supported_tags = [ - 'speak', 'break', 'lang', 'p', 'phoneme', - 's', 'say-as', 'sub' - ] - - # Append additional tags to the supported tags list - supported_tags.extend(additional_tags) - - # Create a pattern that matches any tag not in the supported list - pattern = r']+>' - - # Remove unsupported tags - cleaned_text = re.sub(pattern, '', input_text) - - # Remove any leftover empty lines - cleaned_text = re.sub(r'\n\s*\n', '\n', cleaned_text) - - # Ensure closing tags for additional tags are preserved - for tag in additional_tags: - cleaned_text = re.sub(f'<{tag}>(.*?)(?=<(?:{"|".join(additional_tags)})>|$)', - f'<{tag}>\\1', - cleaned_text, - flags=re.DOTALL) - - return cleaned_text.strip() + def __init__(self, model: str = "openai", api_key: Optional[str] = None): + """ + Initialize the TextToSpeech class. + + Args: + model (str): The model to use for text-to-speech conversion. + Options are 'elevenlabs' or 'openai'. Defaults to 'openai'. + api_key (Optional[str]): API key for the selected text-to-speech service. + If not provided, it will be loaded from the config. + """ + self.model = model.lower() + self.config = load_config() + self.tts_config = self.config.get("text_to_speech") + + if self.model == "elevenlabs": + self.api_key = api_key or self.config.ELEVENLABS_API_KEY + self.client = elevenlabs_client.ElevenLabs(api_key=self.api_key) + elif self.model == "openai": + self.api_key = api_key or self.config.OPENAI_API_KEY + openai.api_key = self.api_key + else: + raise ValueError("Invalid model. 
Choose 'elevenlabs' or 'openai'.") + + self.audio_format = self.tts_config["audio_format"] + self.temp_audio_dir = self.tts_config["temp_audio_dir"] + self.ending_message = self.tts_config["ending_message"] + + # Create temp_audio_dir if it doesn't exist + if not os.path.exists(self.temp_audio_dir): + os.makedirs(self.temp_audio_dir) + + def __merge_audio_files(self, input_dir: str, output_file: str) -> None: + """ + Merge all audio files in the input directory sequentially and save the result. + + Args: + input_dir (str): Path to the directory containing audio files. + output_file (str): Path to save the merged audio file. + """ + try: + # Function to sort filenames naturally + def natural_sort_key(filename: str) -> List[Union[int, str]]: + return [ + int(text) if text.isdigit() else text + for text in re.split(r"(\d+)", filename) + ] + + combined = AudioSegment.empty() + audio_files = sorted( + [ + f + for f in os.listdir(input_dir) + if f.endswith(f".{self.audio_format}") + ], + key=natural_sort_key, + ) + for file in audio_files: + if file.endswith(f".{self.audio_format}"): + file_path = os.path.join(input_dir, file) + combined += AudioSegment.from_file( + file_path, format=self.audio_format + ) + + combined.export(output_file, format=self.audio_format) + logger.info(f"Merged audio saved to {output_file}") + except Exception as e: + logger.error(f"Error merging audio files: {str(e)}") + raise + + def convert_to_speech(self, text: str, output_file: str) -> None: + """ + Convert input text to speech and save as an audio file. + + Args: + text (str): Input text to convert to speech. + output_file (str): Path to save the output audio file. + + Raises: + Exception: If there's an error in converting text to speech. 
+ """ + # Clean TSS markup tags from the input text + cleaned_text = self.clean_tss_markup(text) + + if self.model == "elevenlabs": + self.__convert_to_speech_elevenlabs(cleaned_text, output_file) + elif self.model == "openai": + self.__convert_to_speech_openai(cleaned_text, output_file) + + def __convert_to_speech_elevenlabs(self, text: str, output_file: str) -> None: + try: + qa_pairs = self.split_qa(text) + audio_files = [] + counter = 0 + for question, answer in qa_pairs: + question_audio = self.client.generate( + text=question, + voice=self.tts_config["elevenlabs"]["default_voices"]["question"], + model=self.tts_config["elevenlabs"]["model"], + ) + answer_audio = self.client.generate( + text=answer, + voice=self.tts_config["elevenlabs"]["default_voices"]["answer"], + model=self.tts_config["elevenlabs"]["model"], + ) + + # Save question and answer audio chunks + for audio in [question_audio, answer_audio]: + counter += 1 + file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}" + with open(file_name, "wb") as out: + for chunk in audio: + if chunk: + out.write(chunk) + audio_files.append(file_name) + + # Merge all audio files and save the result + self.__merge_audio_files(self.temp_audio_dir, output_file) + + # Clean up individual audio files + for file in audio_files: + os.remove(file) + + logger.info(f"Audio saved to {output_file}") + + except Exception as e: + logger.error(f"Error converting text to speech with ElevenLabs: {str(e)}") + raise + + def __convert_to_speech_openai(self, text: str, output_file: str) -> None: + try: + qa_pairs = self.split_qa(text) + print(qa_pairs) + audio_files = [] + counter = 0 + for question, answer in qa_pairs: + for speaker, content in [ + (self.tts_config["openai"]["default_voices"]["question"], question), + (self.tts_config["openai"]["default_voices"]["answer"], answer), + ]: + counter += 1 + file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}" + response = openai.audio.speech.create( + 
model=self.tts_config["openai"]["model"], + voice=speaker, + input=content, + ) + with open(file_name, "wb") as file: + file.write(response.content) + + audio_files.append(file_name) + + # Merge all audio files and save the result + self.__merge_audio_files(self.temp_audio_dir, output_file) + + # Clean up individual audio files + for file in audio_files: + os.remove(file) + + logger.info(f"Audio saved to {output_file}") + + except Exception as e: + logger.error(f"Error converting text to speech with OpenAI: {str(e)}") + raise + + def split_qa(self, input_text: str) -> List[Tuple[str, str]]: + """ + Split the input text into question-answer pairs. + + Args: + input_text (str): The input text containing Person1 and Person2 dialogues. + + Returns: + List[Tuple[str, str]]: A list of tuples containing (Person1, Person2) dialogues. + """ + # Add ending message to the end of input_text + input_text += f"{self.ending_message}" + + # Regular expression pattern to match Person1 and Person2 dialogues + pattern = r"(.*?)\s*(.*?)" + + # Find all matches in the input text + matches = re.findall(pattern, input_text, re.DOTALL) + + # Process the matches to remove extra whitespace and newlines + processed_matches = [ + (" ".join(person1.split()).strip(), " ".join(person2.split()).strip()) + for person1, person2 in matches + ] + return processed_matches + + def clean_tss_markup( + self, input_text: str, additional_tags: List[str] = ["Person1", "Person2"] + ) -> str: + """ + Remove unsupported TSS markup tags from the input text while preserving supported SSML tags. + + Args: + input_text (str): The input text containing TSS markup tags. + additional_tags (List[str]): Optional list of additional tags to preserve. Defaults to ["Person1", "Person2"]. + + Returns: + str: Cleaned text with unsupported TSS markup tags removed. 
+ """ + # List of SSML tags supported by both OpenAI and ElevenLabs + supported_tags = [ + "speak", + "break", + "lang", + "p", + "phoneme", + "s", + "say-as", + "sub", + ] + + # Append additional tags to the supported tags list + supported_tags.extend(additional_tags) + + # Create a pattern that matches any tag not in the supported list + pattern = r"]+>" + + # Remove unsupported tags + cleaned_text = re.sub(pattern, "", input_text) + + # Remove any leftover empty lines + cleaned_text = re.sub(r"\n\s*\n", "\n", cleaned_text) + + # Ensure closing tags for additional tags are preserved + for tag in additional_tags: + cleaned_text = re.sub( + f'<{tag}>(.*?)(?=<(?:{"|".join(additional_tags)})>|$)', + f"<{tag}>\\1", + cleaned_text, + flags=re.DOTALL, + ) + + return cleaned_text.strip() + def main(seed: int = 42) -> None: - """ - Main function to test the TextToSpeech class. - - Args: - seed (int): Random seed for reproducibility. Defaults to 42. - """ - try: - # Load configuration - config = load_config() - - # Read input text from file - with open('tests/data/response.txt', 'r') as file: - input_text = file.read() - - # Test ElevenLabs - tts_elevenlabs = TextToSpeech(model='elevenlabs') - elevenlabs_output_file = 'tests/data/response_elevenlabs.mp3' - tts_elevenlabs.convert_to_speech(input_text, elevenlabs_output_file) - logger.info(f"ElevenLabs TTS completed. Output saved to {elevenlabs_output_file}") - - # Test OpenAI - tts_openai = TextToSpeech(model='openai') - openai_output_file = 'tests/data/response_openai.mp3' - tts_openai.convert_to_speech(input_text, openai_output_file) - logger.info(f"OpenAI TTS completed. Output saved to {openai_output_file}") - - except Exception as e: - logger.error(f"An error occurred during text-to-speech conversion: {str(e)}") - raise + """ + Main function to test the TextToSpeech class. + + Args: + seed (int): Random seed for reproducibility. Defaults to 42. 
+ """ + try: + # Load configuration + config = load_config() + + # Read input text from file + with open("tests/data/response.txt", "r") as file: + input_text = file.read() + + # Test ElevenLabs + tts_elevenlabs = TextToSpeech(model="elevenlabs") + elevenlabs_output_file = "tests/data/response_elevenlabs.mp3" + tts_elevenlabs.convert_to_speech(input_text, elevenlabs_output_file) + logger.info( + f"ElevenLabs TTS completed. Output saved to {elevenlabs_output_file}" + ) + + # Test OpenAI + tts_openai = TextToSpeech(model="openai") + openai_output_file = "tests/data/response_openai.mp3" + tts_openai.convert_to_speech(input_text, openai_output_file) + logger.info(f"OpenAI TTS completed. Output saved to {openai_output_file}") + + except Exception as e: + logger.error(f"An error occurred during text-to-speech conversion: {str(e)}") + raise + if __name__ == "__main__": - main(seed=42) \ No newline at end of file + main(seed=42) diff --git a/tests/test_content_parser.py b/tests/test_content_parser.py index 19a7ebce..e31bf7a6 100644 --- a/tests/test_content_parser.py +++ b/tests/test_content_parser.py @@ -81,4 +81,4 @@ def test_pdf_extractor(self): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/tests/test_genai_podcast.py b/tests/test_genai_podcast.py index a3ab67b3..deba76c0 100644 --- a/tests/test_genai_podcast.py +++ b/tests/test_genai_podcast.py @@ -7,7 +7,7 @@ from podcastfy.utils.config import Config from podcastfy.utils.config_conversation import ConversationConfig -#TODO: Should be a fixture +# TODO: Should be a fixture def sample_conversation_config(): conversation_config = { "word_count": 2000, @@ -19,10 +19,11 @@ def sample_conversation_config(): "podcast_tagline": "Learning Through Conversation", "output_language": "English", "engagement_techniques": ["examples", "questions", "case studies"], - "creativity": 0 + "creativity": 0, } return conversation_config + class TestGenAIPodcast(unittest.TestCase): def 
setUp(self): """ @@ -31,9 +32,6 @@ def setUp(self): config = Config() self.api_key = config.GEMINI_API_KEY - - - def test_generate_qa_content(self): """ Test the generate_qa_content method of ContentGenerator. @@ -56,21 +54,29 @@ def test_custom_conversation_config(self): conversation_config = sample_conversation_config() content_generator = ContentGenerator(self.api_key, conversation_config) input_text = "Artificial Intelligence in Education" - + result = content_generator.generate_qa_content(input_text) self.assertIsNotNone(result) self.assertNotEqual(result, "") self.assertIsInstance(result, str) - + # Check for elements from the custom config self.assertIn(conversation_config["podcast_name"], result) self.assertIn(conversation_config["podcast_tagline"], result) - self.assertTrue(any(role in result.lower() for role in [conversation_config["roles_person1"], - conversation_config["roles_person2"]])) - + self.assertTrue( + any( + role in result.lower() + for role in [ + conversation_config["roles_person1"], + conversation_config["roles_person2"], + ] + ) + ) + # Check word count (allow some flexibility) word_count = len(result.split()) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_generate_podcast.py b/tests/test_generate_podcast.py index 9f65c749..aa9d152a 100644 --- a/tests/test_generate_podcast.py +++ b/tests/test_generate_podcast.py @@ -5,99 +5,112 @@ from podcastfy.utils.config import load_config from podcastfy.utils.config_conversation import load_conversation_config + @pytest.fixture def sample_config(): - config = load_config() - config.configure( - output_directories={ - 'audio': 'tests/data/audio', - 'transcripts': 'tests/data/transcripts' - } - ) - return config + config = load_config() + config.configure( + output_directories={ + "audio": "tests/data/audio", + "transcripts": "tests/data/transcripts", + } + ) + return config + @pytest.fixture def sample_conversation_config(): - conversation_config = { - "word_count": 300, - 
"conversation_style": ["formal", "educational"], - "roles_person1": "professor", - "roles_person2": "student", - "dialogue_structure": ["Introduction", "Main Points", "Case Studies", "Quiz", "Conclusion"], - "podcast_name": "Teachfy", - "podcast_tagline": "Learning Through Conversation", - "output_language": "English", - "engagement_techniques": ["examples", "questions"], - "creativity": 0 - } - return conversation_config + conversation_config = { + "word_count": 300, + "conversation_style": ["formal", "educational"], + "roles_person1": "professor", + "roles_person2": "student", + "dialogue_structure": [ + "Introduction", + "Main Points", + "Case Studies", + "Quiz", + "Conclusion", + ], + "podcast_name": "Teachfy", + "podcast_tagline": "Learning Through Conversation", + "output_language": "English", + "engagement_techniques": ["examples", "questions"], + "creativity": 0, + } + return conversation_config + def test_generate_podcast_from_urls(sample_config): - """Test generating a podcast from a list of URLs.""" - urls = [ - "https://en.wikipedia.org/wiki/Podcast", - "https://en.wikipedia.org/wiki/Text-to-speech" - ] - - audio_file = generate_podcast( - urls=urls, - config=sample_config - ) - - assert audio_file is not None - assert os.path.exists(audio_file) - assert audio_file.endswith('.mp3') - assert os.path.dirname(audio_file) == sample_config.get('output_directories', {}).get('audio') + """Test generating a podcast from a list of URLs.""" + urls = [ + "https://en.wikipedia.org/wiki/Podcast", + "https://en.wikipedia.org/wiki/Text-to-speech", + ] + + audio_file = generate_podcast(urls=urls, config=sample_config) + + assert audio_file is not None + assert os.path.exists(audio_file) + assert audio_file.endswith(".mp3") + assert os.path.dirname(audio_file) == sample_config.get( + "output_directories", {} + ).get("audio") + def test_generate_transcript_only(sample_config): - """Test generating only a transcript without audio.""" - urls = 
["https://en.wikipedia.org/wiki/Natural_language_processing"] - - result = generate_podcast( - urls=urls, - transcript_only=True, - config=sample_config - ) - - assert result is None + """Test generating only a transcript without audio.""" + urls = ["https://en.wikipedia.org/wiki/Natural_language_processing"] + + result = generate_podcast(urls=urls, transcript_only=True, config=sample_config) + + assert result is None + def test_generate_podcast_from_transcript_file(sample_config): - """Test generating a podcast from an existing transcript file.""" - # First, generate a transcript - transcript_file = os.path.join(sample_config.get('output_directories', {}).get('transcripts'), 'test_transcript.txt') - with open(transcript_file, 'w') as f: - f.write("Joe Biden and the US PoliticsJoe Biden is the current president of the United States of America") - - # Now use this transcript to generate a podcast - audio_file = generate_podcast( - transcript_file=transcript_file, - config=sample_config - ) - - assert audio_file is not None - assert os.path.exists(audio_file) - assert audio_file.endswith('.mp3') - assert os.path.dirname(audio_file) == sample_config.get('output_directories', {}).get('audio') + """Test generating a podcast from an existing transcript file.""" + # First, generate a transcript + transcript_file = os.path.join( + sample_config.get("output_directories", {}).get("transcripts"), + "test_transcript.txt", + ) + with open(transcript_file, "w") as f: + f.write( + "Joe Biden and the US PoliticsJoe Biden is the current president of the United States of America" + ) + + # Now use this transcript to generate a podcast + audio_file = generate_podcast(transcript_file=transcript_file, config=sample_config) + + assert audio_file is not None + assert os.path.exists(audio_file) + assert audio_file.endswith(".mp3") + assert os.path.dirname(audio_file) == sample_config.get( + "output_directories", {} + ).get("audio") + def 
test_generate_podcast_with_custom_config(sample_config, sample_conversation_config): - """Test generating a podcast with a custom conversation config.""" - urls = ["https://en.wikipedia.org/wiki/Artificial_intelligence"] - - audio_file = generate_podcast( - urls=urls, - config=sample_config, - conversation_config=sample_conversation_config - ) - - assert audio_file is not None - assert os.path.exists(audio_file) - assert audio_file.endswith('.mp3') - assert os.path.dirname(audio_file) == sample_config.get('output_directories', {}).get('audio') - + """Test generating a podcast with a custom conversation config.""" + urls = ["https://en.wikipedia.org/wiki/Artificial_intelligence"] + + audio_file = generate_podcast( + urls=urls, config=sample_config, conversation_config=sample_conversation_config + ) + + assert audio_file is not None + assert os.path.exists(audio_file) + assert audio_file.endswith(".mp3") + assert os.path.dirname(audio_file) == sample_config.get( + "output_directories", {} + ).get("audio") + + def test_generate_podcast_no_urls_or_transcript(): - """Test that an error is raised when no URLs or transcript file is provided.""" - with pytest.raises(ValueError): - generate_podcast() + """Test that an error is raised when no URLs or transcript file is provided.""" + with pytest.raises(ValueError): + generate_podcast() + if __name__ == "__main__": - pytest.main() \ No newline at end of file + pytest.main() From c1adb9b5eb6ae884fefbda69c1819ad281c4f95f Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 14:53:44 +0200 Subject: [PATCH 06/49] fix transcript parsing --- podcastfy/aiengines/llm/gemini_langchain.py | 65 +++++++++++++++++---- 1 file changed, 54 insertions(+), 11 deletions(-) diff --git a/podcastfy/aiengines/llm/gemini_langchain.py b/podcastfy/aiengines/llm/gemini_langchain.py index 4e08b0af..9380a4f6 100644 --- a/podcastfy/aiengines/llm/gemini_langchain.py +++ b/podcastfy/aiengines/llm/gemini_langchain.py @@ -7,6 +7,7 @@ """ import os 
+import re from typing import Optional, Dict, Any, List, Tuple from langchain_google_genai import ChatGoogleGenerativeAI @@ -41,12 +42,12 @@ def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = self.llm = ChatGoogleGenerativeAI( model=self.content_generator_config.get('gemini_model', 'gemini-1.5-pro-latest'), temperature=self.config_conversation.get('creativity', 0), - max_output_tokens=self.content_generator_config.get('max_output_tokens', 8192) + max_output_tokens=self.content_generator_config.get('max_output_tokens', 8192), ) #pick podcastfy prompt from langchain hub self.prompt_template = hub.pull(self.config.get('content_generator', {}).get('prompt_template', 'souzatharsis/podcastfy_')) - self.prompt_template + self.ending_message = self.config.get('text_to_speech')['ending_message'] self.parser = StrOutputParser() @@ -109,20 +110,62 @@ def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = """ self.content_generator = ContentGenerator(api_key, conversation_config) + def split_qa(self, input_text: str) -> List[Tuple[str, str]]: + """ + Split the input text into question-answer pairs. + + Args: + input_text (str): The input text containing Person1 and Person2 dialogues. + + Returns: + List[Tuple[str, str]]: A list of tuples containing (Person1, Person2) dialogues. 
+ """ + # Add ending message to the end of input_text + input_text += f"{self.content_generator.ending_message}" + + # Regular expression pattern to match Person1 and Person2 dialogues + pattern = r'(.*?)\s*(.*?)' + + # Find all matches in the input text + matches = re.findall(pattern, input_text, re.DOTALL) + + # Process the matches to remove extra whitespace and newlines + processed_matches = [ + ( + ' '.join(person1.split()).strip(), + ' '.join(person2.split()).strip() + ) + for person1, person2 in matches + ] + return processed_matches + def generate_transcript(self, prompt: str, characters: List[Character]) -> List[Tuple[Character, str]]: content = self.content_generator.generate_qa_content(prompt, output_filepath=None, characters=characters) - - # Parse the generated content into the required format + + q_a_pairs = self.split_qa(content) transcript = [] - for line in content.split('\n'): - if ':' in line: - speaker_name, text = line.split(':', 1) - speaker = next((char for char in characters if char.name == speaker_name.strip()), None) - if speaker: - transcript.append((speaker, text.strip())) - + for q_a_pair in q_a_pairs: + # Assign the speakers based on the order of the characters + speaker1, speaker2 = characters + speaker_1_text, speaker_2_text = q_a_pair + transcript.append((speaker1, speaker_1_text)) + transcript.append((speaker2, speaker_2_text)) return transcript + # def generate_transcript(self, prompt: str, characters: List[Character]) -> List[Tuple[Character, str]]: + # content = self.content_generator.generate_qa_content(prompt, output_filepath=None, characters=characters) + # + # # Parse the generated content into the required format + # transcript = [] + # for line in content.split('\n'): + # if ':' in line: + # speaker_name, text = line.split(':', 1) + # speaker = next((char for char in characters if char.name == speaker_name.strip()), None) + # if speaker: + # transcript.append((speaker, text.strip())) + # + # return transcript + def 
main(seed: int = 42) -> None: From d06b93c3be3adc0c73a53d3e8196400f1355cdb9 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 14:54:22 +0200 Subject: [PATCH 07/49] fix eleven labs issues --- podcastfy/aiengines/tts/tts_backends.py | 30 ++++++++++++++++--------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/podcastfy/aiengines/tts/tts_backends.py b/podcastfy/aiengines/tts/tts_backends.py index 0b2d389c..1e4d4125 100644 --- a/podcastfy/aiengines/tts/tts_backends.py +++ b/podcastfy/aiengines/tts/tts_backends.py @@ -15,12 +15,12 @@ from podcastfy.core.character import Character -class ElevenLabsTTS(SyncTTSBackend, TTSConfigMixin): +class ElevenLabsTTS(SyncTTSBackend, AsyncTTSBackend, TTSConfigMixin): name: str = "elevenlabs" def __init__(self, api_key: str = None, config_file: str = 'podcastfy/config.yaml'): # TODO: not the right path for final client - TTSConfigMixin.__init__(self, config_file) + TTSConfigMixin.__init__(self, config_file, name=self.name) self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY") def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: @@ -37,6 +37,19 @@ def text_to_speech(self, text: str, character: Character, output_path: Path) -> out.write(chunk) return output_path + async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + config = self.tts_config_for_character(character) + client = elevenlabs_client.AsyncElevenLabs(api_key=self.api_key) + content = await client.generate( + text=text, + voice=config.voice, + model=config.extra_args.get('model', self.get_default_config().get('model', 'default')) + ) + with open(output_path, "wb") as out: + for chunk in content: + if chunk: + out.write(chunk) + class OpenAITTS(SyncTTSBackend, TTSConfigMixin): name: str = "openai" @@ -45,7 +58,7 @@ def __init__(self, api_key: str = None, config_file: str = 'podcastfy/config.yam TTSConfigMixin.__init__(self, config_file, name=self.name) self.api_key = 
api_key or os.getenv("OPENAI_API_KEY") - def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + def text_to_speech(self, text: str, character: Character, output_path: Path) -> None: config = self.tts_config_for_character(character) print(f"OpenAI TTS: Converting text to speech for character {character.name} with voice {config.voice} \n text: {text}") @@ -57,23 +70,20 @@ def text_to_speech(self, text: str, character: Character, output_path: Path) -> ) with open(output_path, "wb") as file: file.write(response.content) - return output_path + class EdgeTTS(AsyncTTSBackend, TTSConfigMixin): - name: str = "edge-tts" + name: str = "edge" def __init__(self, config_file: str = 'podcastfy/config.yaml'): - TTSConfigMixin.__init__(self, config_file) + TTSConfigMixin.__init__(self, config_file, name=self.name) - async def text_to_speech(self, text: str, character: Character, output_path: Path) -> None: + async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> None: config = self.tts_config_for_character(character) communicate = edge_tts.Communicate(text, config.voice) await communicate.save(output_path) - return output_path - async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: - return await self.text_to_speech(text, character, output_path) From 1e158513f057990e66f398d7df4a39fa711ce136 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 14:54:47 +0200 Subject: [PATCH 08/49] fix person names --- podcastfy/client_v2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index ea502d6d..adf2f1b5 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -21,7 +21,7 @@ def create_characters(config: Dict[str, Any]) -> List[Character]: host = Character( - name="Host", + name="Person1", role="Podcast host", tts_configs={ "openai": TTSConfig( @@ -39,7 +39,7 @@ def create_characters(config: 
Dict[str, Any]) -> List[Character]: ) guest = Character( - name="Guest", + name="Person2", role="Expert guest", tts_configs={ "openai": TTSConfig( From 114172419463647a69dae0c78bc00088572ca401 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 14:55:05 +0200 Subject: [PATCH 09/49] add edge default values --- podcastfy/config.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/podcastfy/config.yaml b/podcastfy/config.yaml index 624c8955..00bff294 100644 --- a/podcastfy/config.yaml +++ b/podcastfy/config.yaml @@ -13,6 +13,10 @@ text_to_speech: question: "echo" answer: "shimmer" model: "tts-1-hd" + edge: + default_voices: + question: "en-US-JennyNeural" + answer: "en-US-EricNeural" audio_format: "mp3" temp_audio_dir: "data/audio/tmp/" ending_message: "Bye Bye!" From c44139b2a4df6368010ce2dde112006689218d77 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 14:56:07 +0200 Subject: [PATCH 10/49] fix multiple issues with audio --- podcastfy/core/audio.py | 82 ++++++++++++++++++++------------------- podcastfy/core/podcast.py | 16 +++++--- 2 files changed, 53 insertions(+), 45 deletions(-) diff --git a/podcastfy/core/audio.py b/podcastfy/core/audio.py index 9b422faf..2e57f394 100644 --- a/podcastfy/core/audio.py +++ b/podcastfy/core/audio.py @@ -1,91 +1,95 @@ import asyncio from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from typing import Optional, Dict, Union, List, cast +from typing import Optional, Dict, Union, List, cast, Tuple -from pydub import AudioSegment as PydubAudioSegment +from pydub import AudioSegment from podcastfy.core.podcast import SyncTTSBackend, AsyncTTSBackend from podcastfy.core.transcript import TranscriptSegment, Transcript -class AudioSegment: +class PodcastsAudioSegment: """Represents an audio segment of the podcast.""" - def __init__(self, filepath: Path, length_ms: int, transcript_segment: Optional[TranscriptSegment] = None) -> None: + def __init__(self, filepath: Path, transcript_segment: 
Optional[TranscriptSegment] = None) -> None: self.filepath = filepath - self.length_ms = length_ms self.transcript_segment = transcript_segment - self._audio: Optional[PydubAudioSegment] = None + self._audio: Optional[AudioSegment] = None @property - def audio(self) -> PydubAudioSegment: + def audio(self) -> AudioSegment: """Lazy-load the audio segment.""" if self._audio is None: - self._audio = PydubAudioSegment.from_file(self.filepath) - if len(self._audio) != self.length_ms: - raise ValueError( - f"Audio file length ({len(self._audio)}ms) does not match specified length ({self.length_ms}ms)") + self._audio = AudioSegment.from_file(self.filepath) return self._audio class AudioManager: - def __init__(self, tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]], n_jobs: int = 4) -> None: + def __init__(self, tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]], n_jobs: int = 4, file_prefix: str = "") -> None: self.tts_backends = tts_backends self.n_jobs = n_jobs + self.has_async_backend = any(isinstance(backend, AsyncTTSBackend) for backend in self.tts_backends.values()) + self.file_prefix = file_prefix self.audio_segments = [] - self.final_audio: Optional[PydubAudioSegment] = None + self.final_audio: Optional[AudioSegment] = None self.temp_dir: Optional[Union[str, Path]] = None - async def _async_build_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: - async def process_segment(segment: TranscriptSegment): + async def _async_build_audio_segments(self, transcript: Transcript) -> List[PodcastsAudioSegment]: + async def process_segment(segment_tuple: Tuple[TranscriptSegment, int]): + segment, index = segment_tuple tts_backend = self.get_tts_backend(segment) - audio_file = await cast(AsyncTTSBackend, tts_backend).async_text_to_speech( - segment.text, - segment.speaker, - Path(self.temp_dir) / f"{segment.speaker.name}_{len(self.audio_segments)}.mp3" - ) - return AudioSegment(audio_file, 
len(PydubAudioSegment.from_file(str(audio_file))), segment) + audio_path = Path(self.temp_dir) / f"{self.file_prefix}{index:04d}.mp3" + if isinstance(tts_backend, AsyncTTSBackend): + await tts_backend.async_text_to_speech( + segment.text, + segment.speaker, + audio_path + ) + else: + tts_backend.text_to_speech( + segment.text, + segment.speaker, + audio_path + ) + return PodcastsAudioSegment(audio_path, segment) semaphore = asyncio.Semaphore(self.n_jobs) - async def bounded_process_segment(segment): + async def bounded_process_segment(segment_tuple): async with semaphore: - return await process_segment(segment) + return await process_segment(segment_tuple) - tasks = [asyncio.create_task(bounded_process_segment(segment)) for segment in transcript.segments] + tasks = [asyncio.create_task(bounded_process_segment((segment, i))) for i, segment in enumerate(transcript.segments)] return list(await asyncio.gather(*tasks)) def get_tts_backend(self, segment): - if segment.speaker.preferred_tts is None: - # take the first available TTS backend + tts_backend = self.tts_backends.get(segment.speaker.preferred_tts) + if tts_backend is None: + # Take the first available TTS backend tts_backend = next(iter(self.tts_backends.values())) - else: - tts_backend = self.tts_backends[segment.speaker.preferred_tts] - # ensure the preferred TTS backend is available - if tts_backend is None: - raise ValueError(f"Preferred TTS backend '{segment.speaker.preferred_tts}' is not available for character '{segment.speaker.name}'") return tts_backend - def _sync_build_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: - def process_segment(segment: TranscriptSegment): + def _sync_build_audio_segments(self, transcript: Transcript) -> List[PodcastsAudioSegment]: + def process_segment(segment_tuple: Tuple[TranscriptSegment, int]): + segment, index = segment_tuple tts_backend = self.get_tts_backend(segment) audio_file = cast(SyncTTSBackend, tts_backend).text_to_speech( segment.text, 
segment.speaker, - Path(str(self.temp_dir)) / f"{segment.speaker.name}_{len(self.audio_segments)}.mp3" + Path(str(self.temp_dir)) / f"{self.file_prefix}{index:04d}.mp3" ) - return AudioSegment(audio_file, len(PydubAudioSegment.from_file(str(audio_file))), segment) + return PodcastsAudioSegment(audio_file, segment) with ThreadPoolExecutor(max_workers=self.n_jobs) as executor: return list(executor.map(process_segment, transcript.segments)) - def create_audio_segments(self, transcript: Transcript) -> List[AudioSegment]: - if all(isinstance(backend, AsyncTTSBackend) for backend in self.tts_backends.values()): + def create_audio_segments(self, transcript: Transcript) -> List[PodcastsAudioSegment]: + if self.has_async_backend: return asyncio.run(self._async_build_audio_segments(transcript)) else: return self._sync_build_audio_segments(transcript) - def stitch_audio_segments(self) -> None: - self.final_audio = sum([segment.audio for segment in self.audio_segments]) + # def stitch_audio_segments(self) -> None: + # self.final_audio = sum((segment.audio for segment in self.audio_segments), AudioSegment.empty()) diff --git a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py index 06a5e47c..e9797b06 100644 --- a/podcastfy/core/podcast.py +++ b/podcastfy/core/podcast.py @@ -3,13 +3,13 @@ from typing import List, Optional, Dict, Any, Callable, Tuple, Union, Sequence, cast from tempfile import TemporaryDirectory import atexit -from pydub import AudioSegment as PydubAudioSegment +from pydub import AudioSegment from functools import wraps from contextlib import contextmanager from podcastfy.aiengines.llm.base import LLMBackend from podcastfy.aiengines.tts.base import SyncTTSBackend, AsyncTTSBackend -from podcastfy.core.audio import AudioSegment, AudioManager +from podcastfy.core.audio import PodcastsAudioSegment, AudioManager from podcastfy.core.character import Character from podcastfy.core.transcript import TranscriptSegment, Transcript from podcastfy.core.tts_configs import 
TTSConfig @@ -93,7 +93,7 @@ def __init__(self, content: str, llm_backend: LLMBackend, # Initialize attributes with null values self.transcript: Optional[Transcript] = None - self.audio_segments: List[AudioSegment] = [] + self.audio_segments: List[PodcastsAudioSegment] = [] self.audio: Optional[PydubAudioSegment] = None # Define the sequence of methods to be called for each stage @@ -195,10 +195,14 @@ def build_audio_segments(self) -> None: @podcast_stage def stitch_audio_segments(self) -> None: """Stitch all audio segments together to form the final podcast audio.""" - self.audio = sum([segment.audio for segment in self.audio_segments]) + # order segments by filename + segments_to_stitch = sorted(self.audio_segments, key=lambda segment: segment.filepath) + + self.audio = sum((segment.audio for segment in segments_to_stitch), AudioSegment.empty()) def _build_next_stage(self) -> bool: """Build the next stage of the podcast.""" + print("state: ", self.state) if self.state == PodcastState.STITCHED: return False @@ -338,8 +342,8 @@ def text_to_speech(self, text: str, character: Character, output_path: Path) -> PydubAudioSegment.silent(duration=500).export(temp_file.name, format="mp3") with podcast.rework(PodcastState.AUDIO_SEGMENTS_BUILT): - new_segment = AudioSegment(Path(temp_file.name), 500, - TranscriptSegment("New audio segment", podcast.characters["Host"])) + new_segment = PodcastsAudioSegment(Path(temp_file.name), 500, + TranscriptSegment("New audio segment", podcast.characters["Host"])) podcast.audio_segments.insert(0, new_segment) # Save the final podcast From 8d689306266afae8f676a65e8dfda6be112714aa Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 17:10:19 +0200 Subject: [PATCH 11/49] commit before merge --- podcastfy/aiengines/llm/gemini_langchain.py | 199 +++++++++++++++++++- 1 file changed, 189 insertions(+), 10 deletions(-) diff --git a/podcastfy/aiengines/llm/gemini_langchain.py b/podcastfy/aiengines/llm/gemini_langchain.py index 
9380a4f6..23de41fb 100644 --- a/podcastfy/aiengines/llm/gemini_langchain.py +++ b/podcastfy/aiengines/llm/gemini_langchain.py @@ -10,6 +10,8 @@ import re from typing import Optional, Dict, Any, List, Tuple +from langchain_community.llms.llamafile import Llamafile +from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate from langchain_google_genai import ChatGoogleGenerativeAI from langchain_core.output_parsers import StrOutputParser from langchain import hub @@ -22,7 +24,10 @@ logger = logging.getLogger(__name__) -class ContentGenerator: + + +class OldContentGenerator: + # note: to be deleted but stays around few days for reference and troubleshooting def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None): """ Initialize the ContentGenerator. @@ -34,7 +39,7 @@ def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = os.environ["GOOGLE_API_KEY"] = api_key self.config = load_config() self.content_generator_config = self.config.get('content_generator', {}) - + # Load default conversation config and update with custom config if provided self.config_conversation = load_conversation_config(conversation_config) @@ -44,13 +49,13 @@ def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = temperature=self.config_conversation.get('creativity', 0), max_output_tokens=self.content_generator_config.get('max_output_tokens', 8192), ) - + #pick podcastfy prompt from langchain hub self.prompt_template = hub.pull(self.config.get('content_generator', {}).get('prompt_template', 'souzatharsis/podcastfy_')) self.ending_message = self.config.get('text_to_speech')['ending_message'] self.parser = StrOutputParser() - + self.chain = (self.prompt_template | self.llm | self.parser) def generate_qa_content(self, input_texts: str, output_filepath: Optional[str] = None, characters: List[Character] = None) -> str: @@ -69,8 +74,8 @@ def generate_qa_content(self, input_texts: str, 
output_filepath: Optional[str] = """ assert len(characters) == 2, "The number of characters should be 2 for this implementation" try: - - + + prompt_params = { "input_text": input_texts, "word_count": self.config_conversation.get('word_count'), @@ -85,19 +90,192 @@ def generate_qa_content(self, input_texts: str, output_filepath: Optional[str] = } self.response = self.chain.invoke(prompt_params) - + logger.info(f"Content generated successfully") - + if output_filepath: with open(output_filepath, 'w') as file: file.write(self.response) logger.info(f"Response content saved to {output_filepath}") - + return self.response except Exception as e: logger.error(f"Error generating content: {str(e)}") raise +class LLMBackend: + def __init__( + self, + is_local: bool, + temperature: float, + max_output_tokens: int, + model_name: str, + ): + """ + Initialize the LLMBackend. + + Args: + is_local (bool): Whether to use a local LLM or not. + temperature (float): The temperature for text generation. + max_output_tokens (int): The maximum number of output tokens. + model_name (str): The name of the model to use. + """ + self.is_local = is_local + self.temperature = temperature + self.max_output_tokens = max_output_tokens + self.model_name = model_name + self.is_multimodal = not is_local # Does not assume local LLM is multimodal + + if is_local: + self.llm = Llamafile() + else: + self.llm = ChatGoogleGenerativeAI( + model=model_name, + temperature=temperature, + max_output_tokens=max_output_tokens, + ) + + +class ContentGenerator: + def __init__( + self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None + ): + """ + Initialize the ContentGenerator. + + Args: + api_key (str): API key for Google's Generative AI. + conversation_config (Optional[Dict[str, Any]]): Custom conversation configuration. 
+ """ + os.environ["GOOGLE_API_KEY"] = api_key + self.config = load_config() + self.content_generator_config = self.config.get("content_generator", {}) + + self.config_conversation = load_conversation_config(conversation_config) + + def __compose_prompt(self, num_images: int): + """ + Compose the prompt for the LLM based on the content list. + """ + prompt_template = hub.pull( + self.config.get("content_generator", {}).get( + "prompt_template", "souzatharsis/podcastfy_multimodal" + ) + ) + + image_path_keys = [] + messages = [] + text_content = {"type": "text", "text": "{input_text}"} + messages.append(text_content) + for i in range(num_images): + key = f"image_path_{i}" + image_content = { + "image_url": {"path": f"{{{key}}}", "detail": "high"}, + "type": "image_url", + } + image_path_keys.append(key) + messages.append(image_content) + + user_prompt_template = ChatPromptTemplate.from_messages( + messages=[HumanMessagePromptTemplate.from_template(messages)] + ) + + # Compose messages from podcastfy_prompt_template and user_prompt_template + combined_messages = prompt_template.messages + user_prompt_template.messages + + # Create a new ChatPromptTemplate object with the combined messages + composed_prompt_template = ChatPromptTemplate.from_messages(combined_messages) + + return composed_prompt_template, image_path_keys + + def __compose_prompt_params( + self, image_file_paths: List[str], image_path_keys: List[str], input_texts: str + ): + prompt_params = { + "input_text": input_texts, + "word_count": self.config_conversation.get("word_count"), + "conversation_style": ", ".join( + self.config_conversation.get("conversation_style", []) + ), + "roles_person1": self.config_conversation.get("roles_person1"), + "roles_person2": self.config_conversation.get("roles_person2"), + "dialogue_structure": ", ".join( + self.config_conversation.get("dialogue_structure", []) + ), + "podcast_name": self.config_conversation.get("podcast_name"), + "podcast_tagline": 
self.config_conversation.get("podcast_tagline"), + "output_language": self.config_conversation.get("output_language"), + "engagement_techniques": ", ".join( + self.config_conversation.get("engagement_techniques", []) + ), + } + + # for each image_path_key, add the corresponding image_file_path to the prompt_params + for key, path in zip(image_path_keys, image_file_paths): + prompt_params[key] = path + + return prompt_params + + def generate_qa_content( + self, + input_texts: str = "", + image_file_paths: List[str] = [], + output_filepath: Optional[str] = None, + is_local: bool = False, + ) -> str: + """ + Generate Q&A content based on input texts. + + Args: + input_texts (str): Input texts to generate content from. + image_file_paths (List[str]): List of image file paths. + output_filepath (Optional[str]): Filepath to save the response content. Defaults to None. + is_local (bool): Whether to use a local LLM or not. Defaults to False. + + Returns: + str: Formatted Q&A content. + + Raises: + Exception: If there's an error in generating content. 
+ """ + try: + llmbackend = LLMBackend( + is_local=is_local, + temperature=self.config_conversation.get("creativity", 0), + max_output_tokens=self.content_generator_config.get( + "max_output_tokens", 8192 + ), + model_name=( + self.content_generator_config.get( + "gemini_model", "gemini-1.5-pro-latest" + ) + if not is_local + else "User provided model" + ), + ) + + num_images = 0 if is_local else len(image_file_paths) + self.prompt_template, image_path_keys = self.__compose_prompt(num_images) + self.parser = StrOutputParser() + self.chain = self.prompt_template | llmbackend.llm | self.parser + + prompt_params = self.__compose_prompt_params( + image_file_paths, image_path_keys, input_texts + ) + + self.response = self.chain.invoke(prompt_params) + + logger.info(f"Content generated successfully") + + if output_filepath: + with open(output_filepath, "w") as file: + file.write(self.response) + logger.info(f"Response content saved to {output_filepath}") + + return self.response + except Exception as e: + logger.error(f"Error generating content: {str(e)}") + raise class DefaultPodcastifyTranscriptEngine(LLMBackend): def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None): @@ -140,7 +318,8 @@ def split_qa(self, input_text: str) -> List[Tuple[str, str]]: return processed_matches def generate_transcript(self, prompt: str, characters: List[Character]) -> List[Tuple[Character, str]]: - content = self.content_generator.generate_qa_content(prompt, output_filepath=None, characters=characters) + content = self.content_generator.generate_qa_content(prompt, output_filepath=None) + # content = self.content_generator.generate_qa_content(prompt, output_filepath=None, characters=characters) # ideally in the future. 
q_a_pairs = self.split_qa(content) transcript = [] From fa83fc11967eb6b0a1b8aa6ced50e9205a07a5fa Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 18:06:10 +0200 Subject: [PATCH 12/49] support for local and ad other compat elements --- podcastfy/aiengines/llm/gemini_langchain.py | 5 +- podcastfy/client_v2.py | 253 ++++---------------- 2 files changed, 56 insertions(+), 202 deletions(-) diff --git a/podcastfy/aiengines/llm/gemini_langchain.py b/podcastfy/aiengines/llm/gemini_langchain.py index dd0d20ad..bb92bd22 100644 --- a/podcastfy/aiengines/llm/gemini_langchain.py +++ b/podcastfy/aiengines/llm/gemini_langchain.py @@ -140,7 +140,7 @@ def __init__( class DefaultPodcastifyTranscriptEngine(LLMBackend): - def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None): + def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None, is_local: bool = False): """ Initialize the DefaultPodcastifyTranscriptEngine. @@ -149,6 +149,7 @@ def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = conversation_config (Optional[Dict[str, Any]]): Custom conversation configuration. 
""" self.content_generator = ContentGenerator(api_key, conversation_config) + self.is_local = is_local def split_qa(self, input_text: str) -> List[Tuple[str, str]]: """ @@ -182,7 +183,7 @@ def split_qa(self, input_text: str) -> List[Tuple[str, str]]: def generate_transcript(self, content: List[LLMContent], characters: List[Character]) -> List[Tuple[Character, str]]: image_file_paths = [c.value for c in content if c.type == 'image_path'] text_content = "\n\n".join(c.value for c in content if c.type == 'text') - content = self.content_generator.generate_qa_content(text_content, image_file_paths) # ideally in the future we pass characters here + content = self.content_generator.generate_qa_content(text_content, image_file_paths, is_local=self.is_local) # ideally in the future we pass characters here q_a_pairs = self.split_qa(content) transcript = [] diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index 0c79c5da..d8021439 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -13,6 +13,7 @@ from podcastfy.content_parser.content_extractor import ContentExtractor from podcastfy.core.tts_configs import TTSConfig from podcastfy.utils.config import Config, load_config +from podcastfy.utils.config_conversation import load_conversation_config from podcastfy.utils.logger import setup_logger logger = setup_logger(__name__) @@ -21,6 +22,7 @@ def create_characters(config: Dict[str, Any]) -> List[Character]: + # in the future, we should load this from the config file host = Character( name="Person1", role="Podcast host", @@ -72,225 +74,76 @@ def create_tts_backends(config: Config) -> List[Union[SyncTTSBackend, AsyncTTSBa def process_content_v2( urls: Optional[List[str]] = None, transcript_file: Optional[str] = None, - tts_model: str = "openai", + tts_model: str = "openai", # to be fixed, in case of characters, it should be a list of models generate_audio: bool = True, config: Optional[Config] = None, conversation_config: Optional[Dict[str, Any]] = None, 
image_paths: Optional[List[str]] = None, is_local: bool = False, ) -> Tuple[Optional[str], Podcast]: - if config is None: - config = load_config() - if urls is None: - urls = [] - characters = create_characters(config.config) - tts_backends = create_tts_backends(config) - if transcript_file: - logger.info(f"Using transcript file: {transcript_file}") - transcript = Transcript.load( - transcript_file, {char.name: char for char in characters} - ) - podcast = Podcast.from_transcript(transcript, tts_backends, characters) - else: - logger.info(f"Processing {len(urls)} links") - content_extractor = ContentExtractor(config.JINA_API_KEY) - content_generator = DefaultPodcastifyTranscriptEngine( - config.GEMINI_API_KEY, conversation_config, is_local=is_local - ) - - contents = [content_extractor.extract_content(url) for url in urls] - llm_contents = [] - if contents: - llm_contents.append(LLMContent(value="\n\n".join(contents), type="text")) - if image_paths: - llm_contents.extend( - [LLMContent(value=image_path, type="image_path") for image_path in image_paths] - ) - - - - podcast = Podcast( - content=llm_contents, - llm_backend=content_generator, - tts_backends=tts_backends, - characters=characters, - ) - - - if generate_audio: - podcast.finalize() - else: - podcast.build_transcript() - - return podcast - - -@app.command() -def main( - urls: List[str] = typer.Option(None, "--url", "-u", help="URLs to process"), - file: typer.FileText = typer.Option( - None, "--file", "-f", help="File containing URLs, one per line" - ), - transcript: typer.FileText = typer.Option( - None, "--transcript", "-t", help="Path to a transcript file" - ), - tts_model: str = typer.Option( - None, "--tts-model", "-tts", help="TTS model to use (openai or elevenlabs)" - ), - transcript_only: bool = typer.Option( - False, "--transcript-only", help="Generate only a transcript without audio" - ), - conversation_config: str = typer.Option( - None, - "--conversation-config", - "-cc", - help="Path to custom 
conversation configuration YAML file", - ), - output_dir: str = typer.Option( - "./output", "--output-dir", "-o", help="Directory to save output files" - ), -): - """ - Generate a podcast or transcript from a list of URLs, a file containing URLs, or a transcript file. - """ try: - config = load_config() - main_config = config.config.get("main", {}) - if tts_model is None: - tts_model = main_config.get("default_tts_model", "openai") - - urls_list = urls or [] - if file: - urls_list.extend([line.strip() for line in file if line.strip()]) - - if not urls_list and not transcript: - raise typer.BadParameter( - "No URLs or transcript provided. Use --url to specify URLs, --file to specify a file containing URLs, or --transcript for a transcript file." + if config is None: + config = load_config() + if urls is None: + urls = [] + if config is None: + config = load_config() + # Load default conversation config + conv_config = load_conversation_config() + + # Update with provided config if any + if conversation_config: + conv_config.configure(conversation_config) + characters = create_characters(conv_config.config_conversation) + tts_backends = create_tts_backends(config) + # filter out the tts backends that are not in the tts_model, temporary solution + tts_backends = [tts for tts in tts_backends if tts.name != tts_model] + if transcript_file: + logger.info(f"Using transcript file: {transcript_file}") + transcript = Transcript.load( + transcript_file, {char.name: char for char in characters} ) - - podcast = process_links( - urls_list, - transcript_file=transcript.name if transcript else None, - tts_model=tts_model, - generate_audio=not transcript_only, - config=config, - conversation_config=conversation_config, - ) - - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - if transcript_only: - transcript_file = output_dir / f"transcript_{uuid.uuid4().hex}.txt" - podcast.export_transcript(str(transcript_file)) - typer.echo(f"Transcript generated 
successfully: {transcript_file}") + podcast = Podcast.from_transcript(transcript, tts_backends, characters) else: - audio_file = output_dir / f"podcast_{uuid.uuid4().hex}.mp3" - podcast.save(str(audio_file)) - transcript_file = output_dir / f"transcript_{uuid.uuid4().hex}.txt" - podcast.export_transcript(str(transcript_file)) - typer.echo( - f"Podcast generated successfully using {tts_model} TTS model: {audio_file}" + logger.info(f"Processing {len(urls)} links") + content_extractor = ContentExtractor() + content_generator = DefaultPodcastifyTranscriptEngine( + config.GEMINI_API_KEY, conversation_config, is_local=is_local ) - typer.echo(f"Transcript saved to: {transcript_file}") - - except Exception as e: - typer.echo(f"An error occurred: {str(e)}", err=True) - raise typer.Exit(code=1) + contents = [content_extractor.extract_content(url) for url in urls] + llm_contents = [] + if contents: + llm_contents.append(LLMContent(value="\n\n".join(contents), type="text")) + if image_paths: + llm_contents.extend( + [LLMContent(value=image_path, type="image_path") for image_path in image_paths] + ) -if __name__ == "__main__": - app() - - -def generate_podcast( - urls: Optional[List[str]] = None, - url_file: Optional[str] = None, - transcript_file: Optional[str] = None, - tts_model: Optional[str] = None, - transcript_only: bool = False, - config: Optional[Dict[str, Any]] = None, - conversation_config: Optional[Dict[str, Any]] = None, -) -> Podcast: - """ - Generate a podcast or transcript from a list of URLs, a file containing URLs, or a transcript file. - - Args: - urls (Optional[List[str]]): List of URLs to process. - url_file (Optional[str]): Path to a file containing URLs, one per line. - transcript_file (Optional[str]): Path to a transcript file. - tts_model (Optional[str]): TTS model to use ('openai' or 'elevenlabs'). - transcript_only (bool): Generate only a transcript without audio. Defaults to False. 
- config (Optional[Dict[str, Any]]): User-provided configuration dictionary. - conversation_config (Optional[Dict[str, Any]]): User-provided conversation configuration dictionary. - - Returns: - Podcast: An instance of the Podcast class representing the generated podcast. - - Example: - >>> from podcastfy.client_v2 import generate_podcast - >>> podcast = generate_podcast( - ... urls=['https://example.com/article1', 'https://example.com/article2'], - ... tts_model='elevenlabs', - ... config={ - ... 'main': { - ... 'default_tts_model': 'elevenlabs' - ... }, - ... 'output_directories': { - ... 'audio': '/custom/path/to/audio', - ... 'transcripts': '/custom/path/to/transcripts' - ... } - ... }, - ... conversation_config={ - ... 'word_count': 150, - ... 'conversation_style': ['informal', 'friendly'], - ... 'podcast_name': 'My Custom Podcast' - ... } - ... ) - >>> podcast.save('/path/to/output.mp3') - >>> podcast.export_transcript('/path/to/transcript.txt') - """ - try: - default_config = load_config() - if config: - if isinstance(config, dict): - updated_config = Config() - updated_config.configure(**config) - default_config = updated_config - elif isinstance(config, Config): - default_config = config - else: - raise ValueError( - "Config must be either a dictionary or a Config object" - ) - main_config = default_config.config.get("main", {}) + podcast = Podcast( + content=llm_contents, + llm_backend=content_generator, + tts_backends=tts_backends, + characters=characters, + ) - if tts_model is None: - tts_model = main_config.get("default_tts_model", "openai") - urls_list = urls or [] - if url_file: - with open(url_file, "r") as file: - urls_list.extend([line.strip() for line in file if line.strip()]) + if generate_audio: + podcast.finalize() - if not urls_list and not transcript_file: - raise ValueError( - "No URLs or transcript provided. Please provide either 'urls', 'url_file', or 'transcript_file'." 
+ # for the sake of the tests currently in place, but in the future, we should remove this and return the podcast object + random_filename = f"podcast_{uuid.uuid4().hex}.mp3" + audio_file = os.path.join( + config.get("output_directories")["audio"], random_filename ) - - podcast = process_links( - urls_list, - transcript_file=transcript_file, - tts_model=tts_model, - generate_audio=not transcript_only, - config=default_config, - conversation_config=conversation_config, - ) + podcast.save(filepath=audio_file) + return audio_file + else: + podcast.build_transcript() return podcast - except Exception as e: - logger.error(f"An error occurred: {str(e)}") + logger.error(f"An error occurred in the process_content function: {str(e)}") raise From 08cccc1647b6e34274f5c479c5143422a1357bea Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 18:08:38 +0200 Subject: [PATCH 13/49] ending message --- podcastfy/content_generator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/podcastfy/content_generator.py b/podcastfy/content_generator.py index 01502aa2..9ff9f0af 100644 --- a/podcastfy/content_generator.py +++ b/podcastfy/content_generator.py @@ -71,6 +71,7 @@ def __init__( self.content_generator_config = self.config.get("content_generator", {}) self.config_conversation = load_conversation_config(conversation_config) + self.ending_message = self.config_conversation.get('text_to_speech')['ending_message'] def __compose_prompt(self, num_images: int): """ From 0eed1d4f440c17cf09816e2ef8a998ff8fd66a8a Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 18:17:00 +0200 Subject: [PATCH 14/49] two fixes --- podcastfy/aiengines/tts/tts_backends.py | 2 +- podcastfy/client_v2.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/podcastfy/aiengines/tts/tts_backends.py b/podcastfy/aiengines/tts/tts_backends.py index 1e4d4125..930b7224 100644 --- a/podcastfy/aiengines/tts/tts_backends.py +++ b/podcastfy/aiengines/tts/tts_backends.py @@ -46,7 +46,7 @@ async 
def async_text_to_speech(self, text: str, character: Character, output_pat model=config.extra_args.get('model', self.get_default_config().get('model', 'default')) ) with open(output_path, "wb") as out: - for chunk in content: + async for chunk in content: if chunk: out.write(chunk) diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index d8021439..a9c58733 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -97,7 +97,7 @@ def process_content_v2( characters = create_characters(conv_config.config_conversation) tts_backends = create_tts_backends(config) # filter out the tts backends that are not in the tts_model, temporary solution - tts_backends = [tts for tts in tts_backends if tts.name != tts_model] + tts_backends = [tts for tts in tts_backends if tts.name == tts_model] if transcript_file: logger.info(f"Using transcript file: {transcript_file}") transcript = Transcript.load( From cd1141c565526c4da82f0f97286693159436ba62 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 18:19:57 +0200 Subject: [PATCH 15/49] fix threads --- podcastfy/core/audio.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/podcastfy/core/audio.py b/podcastfy/core/audio.py index 2e57f394..f619bc61 100644 --- a/podcastfy/core/audio.py +++ b/podcastfy/core/audio.py @@ -83,7 +83,8 @@ def process_segment(segment_tuple: Tuple[TranscriptSegment, int]): with ThreadPoolExecutor(max_workers=self.n_jobs) as executor: - return list(executor.map(process_segment, transcript.segments)) + return list(executor.map(process_segment, + ((segment, i) for i, segment in enumerate(transcript.segments)))) def create_audio_segments(self, transcript: Transcript) -> List[PodcastsAudioSegment]: if self.has_async_backend: From 38db311e104135a7646be9d3423aa27172b74647 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 18:59:07 +0200 Subject: [PATCH 16/49] fix incorrect default path for configs --- podcastfy/aiengines/tts/base.py | 2 +- 
podcastfy/aiengines/tts/tts_backends.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/podcastfy/aiengines/tts/base.py b/podcastfy/aiengines/tts/base.py index 7b88c290..8a8ded3c 100644 --- a/podcastfy/aiengines/tts/base.py +++ b/podcastfy/aiengines/tts/base.py @@ -51,7 +51,7 @@ async def async_text_to_speech(self, text: str, character: Character, output_pat class TTSConfigMixin: """Mixin class to manage TTS external configurations.""" - def __init__(self, config_file: str = 'podcastfy/config.yaml', name: str = "") -> None: + def __init__(self, config_file: str = 'podcastfy/conversation_config.yaml', name: str = "") -> None: # TODO: probably bad config files for final client self.name = name self.config_file = config_file diff --git a/podcastfy/aiengines/tts/tts_backends.py b/podcastfy/aiengines/tts/tts_backends.py index 930b7224..1d0f7700 100644 --- a/podcastfy/aiengines/tts/tts_backends.py +++ b/podcastfy/aiengines/tts/tts_backends.py @@ -18,7 +18,7 @@ class ElevenLabsTTS(SyncTTSBackend, AsyncTTSBackend, TTSConfigMixin): name: str = "elevenlabs" - def __init__(self, api_key: str = None, config_file: str = 'podcastfy/config.yaml'): + def __init__(self, api_key: str = None, config_file: str = 'podcastfy/conversation_config.yaml'): # TODO: not the right path for final client TTSConfigMixin.__init__(self, config_file, name=self.name) self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY") @@ -76,7 +76,7 @@ def text_to_speech(self, text: str, character: Character, output_path: Path) -> class EdgeTTS(AsyncTTSBackend, TTSConfigMixin): name: str = "edge" - def __init__(self, config_file: str = 'podcastfy/config.yaml'): + def __init__(self, config_file: str = 'podcastfy/conversation_config.yaml'): TTSConfigMixin.__init__(self, config_file, name=self.name) async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> None: From 54e046b2062fb2fe64c3d0980f7004a107394f8c Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 
Oct 2024 22:14:52 +0200 Subject: [PATCH 17/49] better naming and fix an import --- podcastfy/aiengines/llm/base.py | 4 ++-- podcastfy/aiengines/llm/gemini_langchain.py | 4 ++-- podcastfy/aiengines/tts/base.py | 4 +++- podcastfy/client_v2.py | 21 ++++++++++++------- podcastfy/core/{llm_content.py => content.py} | 2 +- podcastfy/core/podcast.py | 18 ++++++++-------- 6 files changed, 31 insertions(+), 22 deletions(-) rename podcastfy/core/{llm_content.py => content.py} (83%) diff --git a/podcastfy/aiengines/llm/base.py b/podcastfy/aiengines/llm/base.py index 7223bb2d..071f79fe 100644 --- a/podcastfy/aiengines/llm/base.py +++ b/podcastfy/aiengines/llm/base.py @@ -2,7 +2,7 @@ from typing import List, Tuple from podcastfy.core.character import Character -from podcastfy.core.llm_content import LLMContent +from podcastfy.core.content import Content class LLMBackend(ABC): @@ -10,7 +10,7 @@ class LLMBackend(ABC): # TODO a nice mixin/helper could be made to load prompt templates from conf file (both podcast settings and character settings) @abstractmethod - def generate_transcript(self, content: List[LLMContent], characters: List[Character]) -> List[Tuple[Character, str]]: + def generate_transcript(self, content: List[Content], characters: List[Character]) -> List[Tuple[Character, str]]: """ Generate text based on a given prompt. 
diff --git a/podcastfy/aiengines/llm/gemini_langchain.py b/podcastfy/aiengines/llm/gemini_langchain.py index bb92bd22..ebd09e1f 100644 --- a/podcastfy/aiengines/llm/gemini_langchain.py +++ b/podcastfy/aiengines/llm/gemini_langchain.py @@ -19,7 +19,7 @@ from podcastfy.content_generator import ContentGenerator from podcastfy.core.character import Character from podcastfy.aiengines.llm.base import LLMBackend -from podcastfy.core.llm_content import LLMContent +from podcastfy.core.content import Content from podcastfy.utils.config_conversation import load_conversation_config from podcastfy.utils.config import load_config import logging @@ -180,7 +180,7 @@ def split_qa(self, input_text: str) -> List[Tuple[str, str]]: ] return processed_matches - def generate_transcript(self, content: List[LLMContent], characters: List[Character]) -> List[Tuple[Character, str]]: + def generate_transcript(self, content: List[Content], characters: List[Character]) -> List[Tuple[Character, str]]: image_file_paths = [c.value for c in content if c.type == 'image_path'] text_content = "\n\n".join(c.value for c in content if c.type == 'text') content = self.content_generator.generate_qa_content(text_content, image_file_paths, is_local=self.is_local) # ideally in the future we pass characters here diff --git a/podcastfy/aiengines/tts/base.py b/podcastfy/aiengines/tts/base.py index 8a8ded3c..bcda17a3 100644 --- a/podcastfy/aiengines/tts/base.py +++ b/podcastfy/aiengines/tts/base.py @@ -1,12 +1,14 @@ from abc import ABC, abstractmethod from pathlib import Path -from typing import Dict, Any, List +from typing import Dict, Any, List, Union import yaml from podcastfy.core.character import Character from podcastfy.core.tts_configs import TTSConfig +TTSBackend = Union["SyncTTSBackend", "AsyncTTSBackend"] + class SyncTTSBackend(ABC): """Protocol for synchronous Text-to-Speech backends.""" diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index a9c58733..c5cb62d6 100644 --- 
a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -5,9 +5,10 @@ from typing import List, Optional, Dict, Any, Union, Tuple from podcastfy.aiengines.llm.gemini_langchain import DefaultPodcastifyTranscriptEngine +from podcastfy.aiengines.tts.base import TTSBackend from podcastfy.aiengines.tts.tts_backends import OpenAITTS, ElevenLabsTTS, EdgeTTS from podcastfy.core.character import Character -from podcastfy.core.llm_content import LLMContent +from podcastfy.core.content import Content from podcastfy.core.podcast import Podcast, SyncTTSBackend, AsyncTTSBackend from podcastfy.core.transcript import Transcript from podcastfy.content_parser.content_extractor import ContentExtractor @@ -62,7 +63,7 @@ def create_characters(config: Dict[str, Any]) -> List[Character]: return [host, guest] -def create_tts_backends(config: Config) -> List[Union[SyncTTSBackend, AsyncTTSBackend]]: +def create_tts_backends(config: Config) -> List[TTSBackend]: return [ OpenAITTS(api_key=config.OPENAI_API_KEY), ElevenLabsTTS(api_key=config.ELEVENLABS_API_KEY), @@ -95,9 +96,7 @@ def process_content_v2( if conversation_config: conv_config.configure(conversation_config) characters = create_characters(conv_config.config_conversation) - tts_backends = create_tts_backends(config) - # filter out the tts backends that are not in the tts_model, temporary solution - tts_backends = [tts for tts in tts_backends if tts.name == tts_model] + tts_backends = obtain_tts_backend(config, tts_model) if transcript_file: logger.info(f"Using transcript file: {transcript_file}") transcript = Transcript.load( @@ -114,10 +113,10 @@ def process_content_v2( contents = [content_extractor.extract_content(url) for url in urls] llm_contents = [] if contents: - llm_contents.append(LLMContent(value="\n\n".join(contents), type="text")) + llm_contents.append(Content(value="\n\n".join(contents), type="text")) if image_paths: llm_contents.extend( - [LLMContent(value=image_path, type="image_path") for image_path in image_paths] + 
[Content(value=image_path, type="image_path") for image_path in image_paths] ) @@ -147,3 +146,11 @@ def process_content_v2( except Exception as e: logger.error(f"An error occurred in the process_content function: {str(e)}") raise + + +def obtain_tts_backend(config, tts_model): + # temporary solution + tts_backends = create_tts_backends(config) + # filter out the tts backends that are not in the tts_model, temporary solution + tts_backends = [tts for tts in tts_backends if tts.name == tts_model] + return tts_backends diff --git a/podcastfy/core/llm_content.py b/podcastfy/core/content.py similarity index 83% rename from podcastfy/core/llm_content.py rename to podcastfy/core/content.py index d9ecfe54..3fc6d704 100644 --- a/podcastfy/core/llm_content.py +++ b/podcastfy/core/content.py @@ -4,6 +4,6 @@ # we can do much better here, but for now, let's keep it simple -class LLMContent(BaseModel): +class Content(BaseModel): value: Any type: str \ No newline at end of file diff --git a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py index f6300421..7a60e8fb 100644 --- a/podcastfy/core/podcast.py +++ b/podcastfy/core/podcast.py @@ -8,10 +8,10 @@ from contextlib import contextmanager from podcastfy.aiengines.llm.base import LLMBackend -from podcastfy.aiengines.tts.base import SyncTTSBackend, AsyncTTSBackend +from podcastfy.aiengines.tts.base import SyncTTSBackend, AsyncTTSBackend, TTSBackend from podcastfy.core.audio import PodcastsAudioSegment, AudioManager from podcastfy.core.character import Character -from podcastfy.core.llm_content import LLMContent +from podcastfy.core.content import Content from podcastfy.core.transcript import TranscriptSegment, Transcript from podcastfy.core.tts_configs import TTSConfig @@ -55,8 +55,8 @@ def wrapper(self, *args, **kwargs): class Podcast: """Main class for podcast creation and management.""" - def __init__(self, content: List[LLMContent], llm_backend: LLMBackend, - tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], 
audio_temp_dir: Optional[Union[str, Path]] = None, + def __init__(self, content: List[Content], llm_backend: LLMBackend, + tts_backends: List[TTSBackend], audio_temp_dir: Optional[Union[str, Path]] = None, characters: Optional[List[Character]] = None, default_tts_n_jobs: int = 1) -> None: """ @@ -65,7 +65,7 @@ def __init__(self, content: List[LLMContent], llm_backend: LLMBackend, Args: content (str): The raw content to be processed into a podcast. llm_backend (LLMBackend): The language model backend for generating the transcript. - tts_backends (List[Union[SyncTTSBackend, AsyncTTSBackend]]): List of available TTS backends. + tts_backends (List[TTSBackend]): List of available TTS backends. audio_temp_dir (Optional[str]): Path to a temporary directory for audio files. If None, a temporary directory will be created. characters (List[Character]): List of characters participating in the podcast. @@ -77,7 +77,7 @@ def __init__(self, content: List[LLMContent], llm_backend: LLMBackend, """ self.content = content self.llm_backend = llm_backend - self.tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]] = {backend.name: backend for backend in tts_backends} + self.tts_backends: Dict[str, TTSBackend] = {backend.name: backend for backend in tts_backends} self.characters: Dict[str, Character] = {char.name: char for char in (characters or [Character("Host", "Podcast host", {}), Character("Guest", "Expert guest", {})])} self.default_tts_n_jobs = default_tts_n_jobs self.state = PodcastState.INITIALIZED @@ -95,7 +95,7 @@ def __init__(self, content: List[LLMContent], llm_backend: LLMBackend, # Initialize attributes with null values self.transcript: Optional[Transcript] = None self.audio_segments: List[PodcastsAudioSegment] = [] - self.audio: Optional[PydubAudioSegment] = None + self.audio: Optional[AudioSegment] = None # Define the sequence of methods to be called for each stage self._next_stage_methods: Dict[PodcastState, Callable[[], None]] = { @@ -268,7 +268,7 @@ def 
__init__(self, name: str): self.name = name def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: - audio = PydubAudioSegment.silent(duration=1000) + audio = AudioSegment.silent(duration=1000) audio.export(str(output_path), format="mp3") return output_path @@ -338,7 +338,7 @@ def text_to_speech(self, text: str, character: Character, output_path: Path) -> # Add a new audio segment (auto_finalize is True by default) with NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file: - PydubAudioSegment.silent(duration=500).export(temp_file.name, format="mp3") + AudioSegment.silent(duration=500).export(temp_file.name, format="mp3") with podcast.rework(PodcastState.AUDIO_SEGMENTS_BUILT): new_segment = PodcastsAudioSegment(Path(temp_file.name), 500, From a33e2f879c0aaa53d7f6b8afdc75df0a88387c88 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 22:15:09 +0200 Subject: [PATCH 18/49] fix argument type --- podcastfy/aiengines/tts/tts_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/podcastfy/aiengines/tts/tts_backends.py b/podcastfy/aiengines/tts/tts_backends.py index 1d0f7700..37cbf65c 100644 --- a/podcastfy/aiengines/tts/tts_backends.py +++ b/podcastfy/aiengines/tts/tts_backends.py @@ -82,7 +82,7 @@ def __init__(self, config_file: str = 'podcastfy/conversation_config.yaml'): async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> None: config = self.tts_config_for_character(character) communicate = edge_tts.Communicate(text, config.voice) - await communicate.save(output_path) + await communicate.save(str(output_path)) From afbe769e8198d619e2cdef52708526a45ab1be53 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 22:15:30 +0200 Subject: [PATCH 19/49] more compat --- podcastfy/client_v2.py | 4 ++-- podcastfy/content_generator.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index 
c5cb62d6..088b8d20 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -138,11 +138,11 @@ def process_content_v2( config.get("output_directories")["audio"], random_filename ) podcast.save(filepath=audio_file) - return audio_file + return audio_file # note: should return the podcast object instead, but for the sake of the tests, we return the audio file else: podcast.build_transcript() - return podcast + return None # note: should return the podcast object instead, but for the sake of the tests, we return None except Exception as e: logger.error(f"An error occurred in the process_content function: {str(e)}") raise diff --git a/podcastfy/content_generator.py b/podcastfy/content_generator.py index 9ff9f0af..5f3c190f 100644 --- a/podcastfy/content_generator.py +++ b/podcastfy/content_generator.py @@ -71,7 +71,7 @@ def __init__( self.content_generator_config = self.config.get("content_generator", {}) self.config_conversation = load_conversation_config(conversation_config) - self.ending_message = self.config_conversation.get('text_to_speech')['ending_message'] + self.ending_message = self.config_conversation.get('text_to_speech').get('ending_message','') def __compose_prompt(self, num_images: int): """ From 267a3590f81f5a26cf6835c57d258f3d15b31bb1 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 22:15:43 +0200 Subject: [PATCH 20/49] add interogation --- podcastfy/core/transcript.py | 1 + 1 file changed, 1 insertion(+) diff --git a/podcastfy/core/transcript.py b/podcastfy/core/transcript.py index 952fa2be..e2baec2d 100644 --- a/podcastfy/core/transcript.py +++ b/podcastfy/core/transcript.py @@ -56,6 +56,7 @@ def _parse_legacy_transcript(content: str) -> List[Tuple[str, str]]: @classmethod def load(cls, filepath: str, characters: Dict[str, Character]) -> 'Transcript': """Load a transcript from a JSON file.""" + # There are a loss of characters informations when loading a transcript, is it acceptable? 
with open(filepath, 'r') as f: content = f.read() From 6084e41f6e6b3672567c36128824722be41eeddf Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 22:15:54 +0200 Subject: [PATCH 21/49] fix test --- tests/test_generate_podcast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_generate_podcast.py b/tests/test_generate_podcast.py index b9699e6f..811d848f 100644 --- a/tests/test_generate_podcast.py +++ b/tests/test_generate_podcast.py @@ -77,7 +77,7 @@ def test_generate_podcast_from_transcript_file(sample_config): # First, generate a transcript transcript_file = os.path.join(sample_config.get('output_directories', {}).get('transcripts'), 'test_transcript.txt') with open(transcript_file, 'w') as f: - f.write("Joe Biden and the US PoliticsJoe Biden is the current president of the United States of America") + f.write("Joe Biden and the US PoliticsJoe Biden is the current president of the United States of America") # Now use this transcript to generate a podcast audio_file = generate_podcast( From 91b726b92c96d02a437e8851dcc22b6772d088bd Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 23:14:34 +0200 Subject: [PATCH 22/49] add todo temp --- must_do_before_merge.txt | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 must_do_before_merge.txt diff --git a/must_do_before_merge.txt b/must_do_before_merge.txt new file mode 100644 index 00000000..b7f12b1f --- /dev/null +++ b/must_do_before_merge.txt @@ -0,0 +1,5 @@ +- one test or two on the Podcast Class +- delete client_v2 and merge it with client +- check that all config options are taken +- remove the excessive prints +- ... ? 
From 5e633aa9a2cd1893bf924e45c902ff823e48aec5 Mon Sep 17 00:00:00 2001 From: bruno Date: Wed, 16 Oct 2024 23:15:16 +0200 Subject: [PATCH 23/49] add todo temp --- must_do_before_merge.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/must_do_before_merge.txt b/must_do_before_merge.txt index b7f12b1f..f845252a 100644 --- a/must_do_before_merge.txt +++ b/must_do_before_merge.txt @@ -2,4 +2,5 @@ - delete client_v2 and merge it with client - check that all config options are taken - remove the excessive prints +- check that all tts work - ... ? From 96e7db4d58b2afea7f9eb1e64141485d0cac57e3 Mon Sep 17 00:00:00 2001 From: Tharsis Souza Date: Wed, 16 Oct 2024 18:30:24 -0300 Subject: [PATCH 24/49] Update must_do_before_merge.txt --- must_do_before_merge.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/must_do_before_merge.txt b/must_do_before_merge.txt index f845252a..24614f0b 100644 --- a/must_do_before_merge.txt +++ b/must_do_before_merge.txt @@ -4,3 +4,5 @@ - remove the excessive prints - check that all tts work - ... ? +- 100% of current pytest unit tests pass +- 100% of of CLI case scenarios from usage/cli.md From 9703997f0c18ade69fd095f6bbed472b12446940 Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 07:35:17 +0200 Subject: [PATCH 25/49] tests the podcast class --- must_do_before_merge.txt | 2 +- podcastfy/core/podcast.py | 4 +- podcastfy/core/transcript.py | 2 +- tests/test_core_api.py | 168 +++++++++++++++++++++++++++++++++++ 4 files changed, 172 insertions(+), 4 deletions(-) create mode 100644 tests/test_core_api.py diff --git a/must_do_before_merge.txt b/must_do_before_merge.txt index 24614f0b..38e14f3f 100644 --- a/must_do_before_merge.txt +++ b/must_do_before_merge.txt @@ -4,5 +4,5 @@ - remove the excessive prints - check that all tts work - ... ? 
-- 100% of current pytest unit tests pass +- 100% of current pytest unit tests pass [x] (except for test_generate_podcast_with_custom_config, exhausted credits) - 100% of of CLI case scenarios from usage/cli.md diff --git a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py index 7a60e8fb..6946e138 100644 --- a/podcastfy/core/podcast.py +++ b/podcastfy/core/podcast.py @@ -226,7 +226,7 @@ def save(self, filepath: str) -> None: def export_transcript(self, filepath: str, format_: str = "plaintext") -> None: """Save the podcast transcript to a file.""" - if self.state < PodcastState.TRANSCRIPT_BUILT: + if self.state.value < PodcastState.TRANSCRIPT_BUILT.value: raise ValueError("Transcript can only be saved after it is built") if self.transcript: @@ -236,7 +236,7 @@ def export_transcript(self, filepath: str, format_: str = "plaintext") -> None: def dump_transcript(self, filepath: str) -> None: """Dump the podcast transcript to a JSON file.""" - if self.state < PodcastState.TRANSCRIPT_BUILT: + if self.state.value < PodcastState.TRANSCRIPT_BUILT.value: raise ValueError("Transcript can only be dumped after it is built") if self.transcript: diff --git a/podcastfy/core/transcript.py b/podcastfy/core/transcript.py index e2baec2d..7cbac381 100644 --- a/podcastfy/core/transcript.py +++ b/podcastfy/core/transcript.py @@ -28,7 +28,7 @@ def from_dict(cls, data: Dict[str, Any], characters: Dict[str, Character]) -> 'T class Transcript: - def __init__(self, segments: List[TranscriptSegment], metadata: Dict[str, Any]) -> None: + def __init__(self, segments: List[TranscriptSegment], metadata: Dict[str, Any] = {}) -> None: self.segments = segments self.metadata = metadata diff --git a/tests/test_core_api.py b/tests/test_core_api.py new file mode 100644 index 00000000..fba450d1 --- /dev/null +++ b/tests/test_core_api.py @@ -0,0 +1,168 @@ +"""Tests for the core API of the podcastfy package. 
Not e2e tests as DummyTTSBackend is used to simulate the TTS backend and DummyLLMBackend is used to simulate the LLM backend.""" +import pytest +from pathlib import Path +from pydub import AudioSegment + +from podcastfy.core.content import Content +from podcastfy.core.podcast import Podcast, PodcastState +from podcastfy.aiengines.llm.base import LLMBackend +from podcastfy.aiengines.tts.base import SyncTTSBackend +from podcastfy.core.character import Character +from podcastfy.core.tts_configs import TTSConfig +from podcastfy.core.transcript import TranscriptSegment, Transcript + + +class DummyLLMBackend(LLMBackend): + def generate_transcript(self, content, characters): + return [ + (characters[0], "Welcome to our podcast!"), + (characters[1], "Thanks for having me!") + ] + + +class DummyTTSBackend(SyncTTSBackend): + def __init__(self, name: str): + self.name = name + + def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + audio = AudioSegment.silent(duration=1000) + audio.export(str(output_path), format="mp3") + return output_path + + +@pytest.fixture +def tts_backends(): + return [DummyTTSBackend("openai"), DummyTTSBackend("elevenlabs")] + + +@pytest.fixture +def characters(): + host = Character( + name="Host", + role="Podcast host", + tts_configs={ + "openai": TTSConfig(voice="en-US-Neural2-F", backend="openai", extra_args={"speaking_rate": 1.0}), + "elevenlabs": TTSConfig(voice="Rachel", backend="elevenlabs", extra_args={"stability": 0.5}) + }, + default_description_for_llm="{name} is an enthusiastic podcast host. Speaks clearly and engagingly." + ) + + guest = Character( + name="Guest", + role="Expert guest", + tts_configs={ + "openai": TTSConfig(voice="en-US-Neural2-D", backend="openai", extra_args={"pitch": -2.0}), + "elevenlabs": TTSConfig(voice="Antoni", backend="elevenlabs", extra_args={"stability": 0.8}) + }, + default_description_for_llm="{name} is an expert guest. Shares knowledge in a friendly manner." 
+ ) + + return [host, guest] + + +@pytest.fixture +def podcast(tts_backends, characters): + return Podcast( + content=[Content(value="This is a sample content for our podcast.", type="text")], + llm_backend=DummyLLMBackend(), + tts_backends=tts_backends, + characters=characters, + ) + + +def test_podcast_initialization(podcast): + assert podcast.state == PodcastState.INITIALIZED + assert podcast.transcript is None + assert podcast.audio_segments == [] + assert podcast.audio is None + + +def test_build_transcript(podcast): + podcast.build_transcript() + assert podcast.state == PodcastState.TRANSCRIPT_BUILT + assert isinstance(podcast.transcript, Transcript) + assert len(podcast.transcript.segments) == 2 + + +def test_build_audio_segments(podcast): + podcast.build_transcript() + podcast.build_audio_segments() + assert podcast.state == PodcastState.AUDIO_SEGMENTS_BUILT + assert len(podcast.audio_segments) == 2 + + +def test_stitch_audio_segments(podcast): + podcast.build_transcript() + podcast.build_audio_segments() + podcast.stitch_audio_segments() + assert podcast.state == PodcastState.STITCHED + assert isinstance(podcast.audio, AudioSegment) + + +def test_finalize(podcast): + podcast.finalize() + assert podcast.state == PodcastState.STITCHED + assert isinstance(podcast.transcript, Transcript) + assert len(podcast.audio_segments) > 0 + assert isinstance(podcast.audio, AudioSegment) + + +def test_save(podcast, tmp_path): + podcast.finalize() + output_file = tmp_path / "test_podcast.mp3" + podcast.save(str(output_file)) + assert output_file.exists() + + +def test_export_transcript(podcast, tmp_path): + podcast.finalize() + output_file = tmp_path / "test_transcript.txt" + podcast.export_transcript(str(output_file), format_="plaintext") + assert output_file.exists() + + +def test_rework(podcast): + podcast.finalize() + + with podcast.rework(PodcastState.TRANSCRIPT_BUILT): + assert podcast.state == PodcastState.TRANSCRIPT_BUILT + podcast.transcript.segments.append( + 
TranscriptSegment("This is a new segment", podcast.characters["Host"])) + + assert podcast.state == PodcastState.STITCHED + assert len(podcast.transcript.segments) == 3 + + +def test_from_transcript(tts_backends, characters): + pre_existing_transcript = [ + ("Host", "Welcome to our podcast created from a pre-existing transcript!"), + ("Guest", "Thank you for having me. I'm excited to be here.") + ] + + podcast = Podcast.from_transcript( + transcript=Transcript([ + TranscriptSegment(text, characters[0] if speaker == "Host" else characters[1]) + for speaker, text in pre_existing_transcript + ]), + tts_backends=tts_backends, + characters=characters + ) + + assert podcast.state == PodcastState.TRANSCRIPT_BUILT + assert len(podcast.transcript.segments) == 2 + + podcast.finalize() + assert podcast.state == PodcastState.STITCHED + + +def test_load_transcript(tts_backends, characters, tmp_path): + # Create a dummy transcript file + transcript_file = tmp_path / "test_transcript.json" + Transcript([ + TranscriptSegment("Welcome to our podcast!", characters[0]), + TranscriptSegment("Thank you for having me!", characters[1]) + ]).dump(str(transcript_file)) + + podcast = Podcast.load_transcript(str(transcript_file), tts_backends, characters) + assert podcast.state == PodcastState.TRANSCRIPT_BUILT + assert len(podcast.transcript.segments) == 2 \ No newline at end of file From 317c7311edbe967cefba7a9092c47e8b6fe2b8aa Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 16:29:39 +0200 Subject: [PATCH 26/49] add compat with transcript saving --- podcastfy/client_v2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index 088b8d20..d6c28fae 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -134,9 +134,11 @@ def process_content_v2( # for the sake of the tests currently in place, but in the future, we should remove this and return the podcast object random_filename = 
f"podcast_{uuid.uuid4().hex}.mp3" + directories = config.get("output_directories") audio_file = os.path.join( - config.get("output_directories")["audio"], random_filename + directories["audio"], random_filename ) + podcast.transcript.export(directories["transcripts"]) podcast.save(filepath=audio_file) return audio_file # note: should return the podcast object instead, but for the sake of the tests, we return the audio file else: From 8fb7aa3e1f4673a9729be49c7dfddfdf20d54498 Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 19:15:20 +0200 Subject: [PATCH 27/49] fix bug and signature of TTS --- podcastfy/aiengines/tts/base.py | 4 ++-- podcastfy/core/audio.py | 7 ++++--- tests/test_transcript.py | 0 3 files changed, 6 insertions(+), 5 deletions(-) create mode 100644 tests/test_transcript.py diff --git a/podcastfy/aiengines/tts/base.py b/podcastfy/aiengines/tts/base.py index bcda17a3..f251a8b2 100644 --- a/podcastfy/aiengines/tts/base.py +++ b/podcastfy/aiengines/tts/base.py @@ -16,7 +16,7 @@ class SyncTTSBackend(ABC): name: str @abstractmethod - def text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + def text_to_speech(self, text: str, character: Character, output_path: Path) -> None: """ Convert text to speech synchronously. @@ -37,7 +37,7 @@ class AsyncTTSBackend(ABC): name: str @abstractmethod - async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> Path: + async def async_text_to_speech(self, text: str, character: Character, output_path: Path) -> None: """ Convert text to speech asynchronously. 
diff --git a/podcastfy/core/audio.py b/podcastfy/core/audio.py index f619bc61..663f9d59 100644 --- a/podcastfy/core/audio.py +++ b/podcastfy/core/audio.py @@ -74,12 +74,13 @@ def _sync_build_audio_segments(self, transcript: Transcript) -> List[PodcastsAud def process_segment(segment_tuple: Tuple[TranscriptSegment, int]): segment, index = segment_tuple tts_backend = self.get_tts_backend(segment) - audio_file = cast(SyncTTSBackend, tts_backend).text_to_speech( + filepath = Path(str(self.temp_dir)) / f"{self.file_prefix}{index:04d}.mp3" + cast(SyncTTSBackend, tts_backend).text_to_speech( segment.text, segment.speaker, - Path(str(self.temp_dir)) / f"{self.file_prefix}{index:04d}.mp3" + filepath ) - return PodcastsAudioSegment(audio_file, segment) + return PodcastsAudioSegment(filepath, segment) with ThreadPoolExecutor(max_workers=self.n_jobs) as executor: diff --git a/tests/test_transcript.py b/tests/test_transcript.py new file mode 100644 index 00000000..e69de29b From 9dcfeda565185ae350933e4370458a170f713b3f Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 19:15:57 +0200 Subject: [PATCH 28/49] clean markup at TranscriptSegment place --- podcastfy/core/transcript.py | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/podcastfy/core/transcript.py b/podcastfy/core/transcript.py index 7cbac381..eec29bf6 100644 --- a/podcastfy/core/transcript.py +++ b/podcastfy/core/transcript.py @@ -5,9 +5,39 @@ from podcastfy.core.character import Character +def clean_markups(input_text: str) -> str: + """ + Remove unsupported TSS markup tags from the input text while preserving supported SSML tags. + + Args: + input_text (str): The input text containing TSS markup tags. + + Returns: + str: Cleaned text with unsupported TSS markup tags removed. 
+ """ + # List of SSML tags supported by both OpenAI and ElevenLabs + supported_tags = [ + 'speak', 'lang', 'p', 'phoneme', + 's', 'say-as', 'sub' + ] + # Append additional tags to the supported tags list + # Create a pattern that matches any tag not in the supported list + pattern = r']+>' + + # Remove unsupported tags + cleaned_text = re.sub(pattern, '', input_text) + + # Remove any leftover empty lines + cleaned_text = re.sub(r'\n\s*\n', '\n', cleaned_text) + cleaned_text = cleaned_text.replace('(scratchpad)', '') + return cleaned_text + + class TranscriptSegment: - def __init__(self, text: str, speaker: Character, tts_args: Optional[Dict[str, Any]] = None) -> None: - self.text = text + def __init__(self, text: str, speaker: Character, + tts_args: Optional[Dict[str, Any]] = None, + auto_clean_markup=True) -> None: + self.text = clean_markups(text) if auto_clean_markup else text self.speaker = speaker self.tts_args = tts_args or {} From 5573adcbd5882aed282d1a14f7f674657780aad2 Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 19:17:13 +0200 Subject: [PATCH 29/49] save transcript automatically for compat sake --- podcastfy/client_v2.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index d6c28fae..7f598ebc 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -133,12 +133,14 @@ def process_content_v2( podcast.finalize() # for the sake of the tests currently in place, but in the future, we should remove this and return the podcast object - random_filename = f"podcast_{uuid.uuid4().hex}.mp3" + random_filename_no_suffix = f"podcast_{uuid.uuid4().hex}" + random_filename_mp3 = f"{random_filename_no_suffix}.mp3" + random_filename_transcript = f"{random_filename_no_suffix}.txt" directories = config.get("output_directories") audio_file = os.path.join( - directories["audio"], random_filename + directories["audio"], random_filename_mp3 ) - 
podcast.transcript.export(directories["transcripts"]) + podcast.transcript.export(os.path.join(directories["transcripts"], random_filename_transcript)) podcast.save(filepath=audio_file) return audio_file # note: should return the podcast object instead, but for the sake of the tests, we return the audio file else: From 7454ea33465e34f0a9090a3a5691a255f1339905 Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 19:17:36 +0200 Subject: [PATCH 30/49] better print --- podcastfy/core/podcast.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py index 6946e138..2e0c40ff 100644 --- a/podcastfy/core/podcast.py +++ b/podcastfy/core/podcast.py @@ -34,7 +34,8 @@ def probably_same_func(method, func): @wraps(func) def wrapper(self, *args, **kwargs): current_method = self._next_stage_methods[self.state] - print(f"Executing {func.__name__} in state {self.state.name}") + print(f"Current state: {self.state.name}") + print(f"Executing: {func.__name__}") if not probably_same_func(current_method, func) and not self._reworking: print(f"Cannot execute {func.__name__} in current state {self.state.name}. Skipping.") raise Exception(f"Cannot execute {func.__name__} in current state {self.state.name}") @@ -43,7 +44,7 @@ def wrapper(self, *args, **kwargs): result = func(self, *args, **kwargs) next_state = PodcastState(self.state.value + 1) self.state = next_state or self.state - print(f"Transitioned to state {self.state.name}") + print(f"Done! 
Current State: {self.state.name}") return result except Exception as e: print(f"Error in {func.__name__}: {str(e)}") @@ -178,6 +179,9 @@ def build_transcript(self) -> None: if speaker.name in self.characters: tts_config = cast(Dict[str, Any], self.characters[speaker.name].tts_configs.get(self.characters[speaker.name].preferred_tts, {})) segments.append(TranscriptSegment(text, self.characters[speaker.name], tts_config)) + else: + print(f"Invalid segment: {segment}") + continue # If the segment doesn't match the expected format, we'll skip it self.transcript = Transcript(segments, {"source": "Generated content"}) From 034b19343fe7b246ceaf8d8da9d2b6c63f04fdff Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 19:38:46 +0200 Subject: [PATCH 31/49] tests, but one fails --- tests/test_transcript.py | 87 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/tests/test_transcript.py b/tests/test_transcript.py index e69de29b..cb01e395 100644 --- a/tests/test_transcript.py +++ b/tests/test_transcript.py @@ -0,0 +1,87 @@ +import pytest +from podcastfy.core.transcript import clean_markups, TranscriptSegment, Transcript, Character +from unittest.mock import patch, mock_open + +@pytest.fixture +def characters(): + character1 = Character("Person1", "John Doe", {}) + character2 = Character("Person2", "Jane Smith", {}) + return {"Person1": character1, "Person2": character2} + +def test_clean_markups(): + input_text = "Hello World. This is a test" + expected_output = "Hello World. 
This is a test" + assert clean_markups(input_text) == expected_output + +def test_clean_markups_with_scratchpad(): + input_text = "Hello (scratchpad)World" + expected_output = "Hello World" + assert clean_markups(input_text) == expected_output + +def test_transcript_segment_init(characters): + segment = TranscriptSegment("Hello World Test", characters["Person1"]) + assert segment.text == "Hello World Test" + assert segment.speaker == characters["Person1"] + +def test_transcript_segment_to_dict(characters): + segment = TranscriptSegment("Hello World", characters["Person1"], {"voice_id": "test_voice"}) + expected_dict = { + "text": "Hello World", + "speaker": "Person1", + "tts_args": {"voice_id": "test_voice"} + } + assert segment.to_dict() == expected_dict + +def test_transcript_segment_from_dict(characters): + data = { + "text": "Hello World", + "speaker": "Person1", + "tts_args": {"voice_id": "test_voice"} + } + segment = TranscriptSegment.from_dict(data, characters) + assert segment.text == "Hello World" + assert segment.speaker == characters["Person1"] + assert segment.tts_args == {"voice_id": "test_voice"} + +def test_transcript_init(characters): + segments = [ + TranscriptSegment("Hello", characters["Person1"]), + TranscriptSegment("Hi there", characters["Person2"]) + ] + transcript = Transcript(segments, {"title": "Test Transcript"}) + assert len(transcript.segments) == 2 + assert transcript.metadata == {"title": "Test Transcript"} + +def test_transcript_to_dict(characters): + segments = [ + TranscriptSegment("Hello", characters["Person1"]), + TranscriptSegment("Hi there", characters["Person2"]) + ] + transcript = Transcript(segments, {"title": "Test Transcript"}) + expected_dict = { + "segments": [ + {"text": "Hello", "speaker": "Person1", "tts_args": {}}, + {"text": "Hi there", "speaker": "Person2", "tts_args": {}} + ], + "metadata": {"title": "Test Transcript"} + } + assert transcript.to_dict() == expected_dict + 
+@pytest.mark.parametrize("file_content,expected_segments", [ + ('{"segments": [{"text": "Hello", "speaker": "Person1", "tts_args": {}}], "metadata": {}}', 1), + ('Hello\nHi there', 2) +]) +def test_transcript_load(file_content, expected_segments, characters): + with patch('builtins.open', new_callable=mock_open, read_data=file_content): + transcript = Transcript.load("fake_path.json", characters) + assert len(transcript.segments) == expected_segments + assert transcript.segments[0].speaker == characters["Person1"] + +def test_transcript_str(characters): + segments = [ + TranscriptSegment("Hello", characters["Person1"]), + TranscriptSegment("Hi there", characters["Person2"]) + ] + transcript = Transcript(segments, {"title": "Test Transcript"}) + expected_str = "Metadata:\ntitle: Test Transcript\n\nTranscript:\nPerson1: Hello\nPerson2: Hi there" + assert str(transcript) == expected_str \ No newline at end of file From 0aa7070ec27d9b1d13d39b91907de8b7105a1c06 Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 20:42:50 +0200 Subject: [PATCH 32/49] fix regex ? --- podcastfy/core/transcript.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/podcastfy/core/transcript.py b/podcastfy/core/transcript.py index eec29bf6..cf1c4c66 100644 --- a/podcastfy/core/transcript.py +++ b/podcastfy/core/transcript.py @@ -22,7 +22,7 @@ def clean_markups(input_text: str) -> str: ] # Append additional tags to the supported tags list # Create a pattern that matches any tag not in the supported list - pattern = r']+>' + pattern = r'<(?!(?:/?' 
+ '|'.join(supported_tags) + r')\b)[^>]+>' # Remove unsupported tags cleaned_text = re.sub(pattern, '', input_text) From b7fe017c622931d6ee98ce328de737c1d8cac34d Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 20:51:22 +0200 Subject: [PATCH 33/49] private static method --- podcastfy/core/transcript.py | 57 ++++++++++++++++++------------------ tests/test_transcript.py | 6 ++-- 2 files changed, 32 insertions(+), 31 deletions(-) diff --git a/podcastfy/core/transcript.py b/podcastfy/core/transcript.py index cf1c4c66..b8860a22 100644 --- a/podcastfy/core/transcript.py +++ b/podcastfy/core/transcript.py @@ -5,42 +5,43 @@ from podcastfy.core.character import Character -def clean_markups(input_text: str) -> str: - """ - Remove unsupported TSS markup tags from the input text while preserving supported SSML tags. - - Args: - input_text (str): The input text containing TSS markup tags. - - Returns: - str: Cleaned text with unsupported TSS markup tags removed. - """ - # List of SSML tags supported by both OpenAI and ElevenLabs - supported_tags = [ - 'speak', 'lang', 'p', 'phoneme', - 's', 'say-as', 'sub' - ] - # Append additional tags to the supported tags list - # Create a pattern that matches any tag not in the supported list - pattern = r'<(?!(?:/?' 
+ '|'.join(supported_tags) + r')\b)[^>]+>' - - # Remove unsupported tags - cleaned_text = re.sub(pattern, '', input_text) - - # Remove any leftover empty lines - cleaned_text = re.sub(r'\n\s*\n', '\n', cleaned_text) - cleaned_text = cleaned_text.replace('(scratchpad)', '') - return cleaned_text - class TranscriptSegment: def __init__(self, text: str, speaker: Character, tts_args: Optional[Dict[str, Any]] = None, auto_clean_markup=True) -> None: - self.text = clean_markups(text) if auto_clean_markup else text + self.text = self._clean_markups(text) if auto_clean_markup else text self.speaker = speaker self.tts_args = tts_args or {} + @staticmethod + def _clean_markups(input_text: str) -> str: + """ + Remove unsupported TSS markup tags from the input text while preserving supported SSML tags. + + Args: + input_text (str): The input text containing TSS markup tags. + + Returns: + str: Cleaned text with unsupported TSS markup tags removed. + """ + # List of SSML tags supported by both OpenAI and ElevenLabs + supported_tags = [ + 'speak', 'speak', 'lang', 'p', 'phoneme', + 's', 'say-as', 'sub' + ] + # Append additional tags to the supported tags list + # Create a pattern that matches any tag not in the supported list + pattern = r'<(?!(?:/?' 
+ '|'.join(supported_tags) + r')\b)[^>]+>' + + # Remove unsupported tags + cleaned_text = re.sub(pattern, '', input_text) + + # Remove any leftover empty lines + cleaned_text = re.sub(r'\n\s*\n', '\n', cleaned_text) + cleaned_text = cleaned_text.replace('(scratchpad)', '') + return cleaned_text + def to_dict(self) -> Dict[str, Any]: return { "text": self.text, diff --git a/tests/test_transcript.py b/tests/test_transcript.py index cb01e395..1af5696e 100644 --- a/tests/test_transcript.py +++ b/tests/test_transcript.py @@ -1,5 +1,5 @@ import pytest -from podcastfy.core.transcript import clean_markups, TranscriptSegment, Transcript, Character +from podcastfy.core.transcript import TranscriptSegment, Transcript, Character from unittest.mock import patch, mock_open @pytest.fixture @@ -11,12 +11,12 @@ def characters(): def test_clean_markups(): input_text = "Hello World. This is a test" expected_output = "Hello World. This is a test" - assert clean_markups(input_text) == expected_output + assert TranscriptSegment._clean_markups(input_text) == expected_output def test_clean_markups_with_scratchpad(): input_text = "Hello (scratchpad)World" expected_output = "Hello World" - assert clean_markups(input_text) == expected_output + assert TranscriptSegment._clean_markups(input_text) == expected_output def test_transcript_segment_init(characters): segment = TranscriptSegment("Hello World Test", characters["Person1"]) From bcda52be809cf1aea2c58aa1122bbcff80297a07 Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 21:47:21 +0200 Subject: [PATCH 34/49] add comment --- podcastfy/core/transcript.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/podcastfy/core/transcript.py b/podcastfy/core/transcript.py index b8860a22..785bd55d 100644 --- a/podcastfy/core/transcript.py +++ b/podcastfy/core/transcript.py @@ -80,6 +80,8 @@ def dump(self, filepath: str) -> None: @staticmethod def _parse_legacy_transcript(content: str) -> List[Tuple[str, str]]: + # in 
the future, Person should be replaced by any character name, but for now, it's Person + # this is tricky because we don't want to take a random tag as a character name, but maybe it's ok to assume that the first tag of each line is the character name pattern = r'\s*(.*?)\s*' matches = re.findall(pattern, content, re.DOTALL) return [('Person' + person_num, text) for person_num, text in matches] @@ -117,11 +119,9 @@ def to_dict(self) -> Dict[str, Any]: } def __str__(self) -> str: - """Convert the transcript to a string representation.""" + """Convert the transcript to a xml representation.""" lines = [] for segment in self.segments: - lines.append(f"{segment.speaker.name}: {segment.text}") + lines.append(f'<{segment.speaker.name}>{segment.text}') + return '\n'.join(lines) - metadata_str = "\n".join([f"{key}: {value}" for key, value in self.metadata.items()]) - - return f"Metadata:\n{metadata_str}\n\nTranscript:\n" + "\n".join(lines) From b44a1b798f515c5e9150b55697fb16c1b45a1f6f Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 22:02:55 +0200 Subject: [PATCH 35/49] its currently expected that transcript are automatically saved --- podcastfy/client_v2.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index 7f598ebc..aa39457a 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -128,15 +128,14 @@ def process_content_v2( characters=characters, ) - + directories = config.get("output_directories") + random_filename_no_suffix = f"podcast_{uuid.uuid4().hex}" + random_filename_mp3 = f"{random_filename_no_suffix}.mp3" + random_filename_transcript = f"{random_filename_no_suffix}.txt" if generate_audio: podcast.finalize() # for the sake of the tests currently in place, but in the future, we should remove this and return the podcast object - random_filename_no_suffix = f"podcast_{uuid.uuid4().hex}" - random_filename_mp3 = f"{random_filename_no_suffix}.mp3" - 
random_filename_transcript = f"{random_filename_no_suffix}.txt" - directories = config.get("output_directories") audio_file = os.path.join( directories["audio"], random_filename_mp3 ) @@ -145,6 +144,7 @@ def process_content_v2( return audio_file # note: should return the podcast object instead, but for the sake of the tests, we return the audio file else: podcast.build_transcript() + podcast.transcript.export(os.path.join(directories["transcripts"], random_filename_transcript)) return None # note: should return the podcast object instead, but for the sake of the tests, we return None except Exception as e: From 8ca5fafe5a68e8445ff4677afd6262ece163b197 Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 22:04:13 +0200 Subject: [PATCH 36/49] less noise --- podcastfy/core/podcast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py index 2e0c40ff..7e660463 100644 --- a/podcastfy/core/podcast.py +++ b/podcastfy/core/podcast.py @@ -44,7 +44,7 @@ def wrapper(self, *args, **kwargs): result = func(self, *args, **kwargs) next_state = PodcastState(self.state.value + 1) self.state = next_state or self.state - print(f"Done! 
Current State: {self.state.name}") + print(f"Done!") return result except Exception as e: print(f"Error in {func.__name__}: {str(e)}") From fe55253915cfaaaa3084382199f86df3e2c7da34 Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 22:39:03 +0200 Subject: [PATCH 37/49] fix transcript --- must_do_before_merge.txt | 12 ++++++------ tests/test_transcript.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/must_do_before_merge.txt b/must_do_before_merge.txt index 38e14f3f..b2526da1 100644 --- a/must_do_before_merge.txt +++ b/must_do_before_merge.txt @@ -1,8 +1,8 @@ -- one test or two on the Podcast Class -- delete client_v2 and merge it with client -- check that all config options are taken -- remove the excessive prints -- check that all tts work +- one test or two on the Podcast Class [x] +- delete client_v2 and merge it with client [] will be done during PR +- check that all config options are taken [x] +- remove the excessive prints [x] +- check that all tts work [x] - ... ? 
- 100% of current pytest unit tests pass [x] (except for test_generate_podcast_with_custom_config, exhausted credits) -- 100% of of CLI case scenarios from usage/cli.md +- 100% of of CLI case scenarios from usage/cli.md [x] except local diff --git a/tests/test_transcript.py b/tests/test_transcript.py index 1af5696e..c60ac128 100644 --- a/tests/test_transcript.py +++ b/tests/test_transcript.py @@ -83,5 +83,5 @@ def test_transcript_str(characters): TranscriptSegment("Hi there", characters["Person2"]) ] transcript = Transcript(segments, {"title": "Test Transcript"}) - expected_str = "Metadata:\ntitle: Test Transcript\n\nTranscript:\nPerson1: Hello\nPerson2: Hi there" + expected_str = "Hello\nHi there" assert str(transcript) == expected_str \ No newline at end of file From 977f78e28b9dd528d2b4083cf2103c715bbabf7f Mon Sep 17 00:00:00 2001 From: bruno Date: Thu, 17 Oct 2024 22:40:14 +0200 Subject: [PATCH 38/49] remove obsolete todos, and reformulate a todo --- podcastfy/aiengines/tts/base.py | 3 +-- podcastfy/aiengines/tts/tts_backends.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/podcastfy/aiengines/tts/base.py b/podcastfy/aiengines/tts/base.py index f251a8b2..a776bd08 100644 --- a/podcastfy/aiengines/tts/base.py +++ b/podcastfy/aiengines/tts/base.py @@ -54,7 +54,6 @@ class TTSConfigMixin: """Mixin class to manage TTS external configurations.""" def __init__(self, config_file: str = 'podcastfy/conversation_config.yaml', name: str = "") -> None: - # TODO: probably bad config files for final client self.name = name self.config_file = config_file self.default_configs = self._load_default_configs() @@ -74,7 +73,7 @@ def update_default_config(self, new_config: Dict[str, Any]) -> None: self.default_configs.update(new_config) def tts_config_for_character(self, character: Character) -> TTSConfig: - # todo a bit constrained by the fact that the config has just the question and answer fields + # note: a bit constrained by the fact that the config has just 
the question and answer fields if character.name in self.character_tts_mapping: return self.character_tts_mapping[character.name] diff --git a/podcastfy/aiengines/tts/tts_backends.py b/podcastfy/aiengines/tts/tts_backends.py index 37cbf65c..58be2dc6 100644 --- a/podcastfy/aiengines/tts/tts_backends.py +++ b/podcastfy/aiengines/tts/tts_backends.py @@ -19,7 +19,6 @@ class ElevenLabsTTS(SyncTTSBackend, AsyncTTSBackend, TTSConfigMixin): name: str = "elevenlabs" def __init__(self, api_key: str = None, config_file: str = 'podcastfy/conversation_config.yaml'): - # TODO: not the right path for final client TTSConfigMixin.__init__(self, config_file, name=self.name) self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY") From c361a0e1520353bee9ddaaf212606f42cd8c1820 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 00:02:47 +0200 Subject: [PATCH 39/49] update the API to put a more prominent place --- podcastfy/client_v2.py | 15 ++++++------ podcastfy/core/audio.py | 25 +++++++++++++------- podcastfy/core/podcast.py | 28 ++++++---------------- tests/test_core_api.py | 49 ++++++++++++++------------------------- 4 files changed, 48 insertions(+), 69 deletions(-) diff --git a/podcastfy/client_v2.py b/podcastfy/client_v2.py index aa39457a..6e455449 100644 --- a/podcastfy/client_v2.py +++ b/podcastfy/client_v2.py @@ -7,6 +7,7 @@ from podcastfy.aiengines.llm.gemini_langchain import DefaultPodcastifyTranscriptEngine from podcastfy.aiengines.tts.base import TTSBackend from podcastfy.aiengines.tts.tts_backends import OpenAITTS, ElevenLabsTTS, EdgeTTS +from podcastfy.core.audio import AudioManager from podcastfy.core.character import Character from podcastfy.core.content import Content from podcastfy.core.podcast import Podcast, SyncTTSBackend, AsyncTTSBackend @@ -97,12 +98,15 @@ def process_content_v2( conv_config.configure(conversation_config) characters = create_characters(conv_config.config_conversation) tts_backends = obtain_tts_backend(config, tts_model) + 
audio_format = conv_config.config_conversation.get('text_to_speech')['audio_format'] + temp_dir = conv_config.config_conversation.get('text_to_speech').get('temp_audio_dir') + audio_manager = AudioManager(tts_backends, audio_format=audio_format, audio_temp_dir=temp_dir, n_jobs=4) if transcript_file: logger.info(f"Using transcript file: {transcript_file}") transcript = Transcript.load( transcript_file, {char.name: char for char in characters} ) - podcast = Podcast.from_transcript(transcript, tts_backends, characters) + podcast = Podcast.from_transcript(transcript, audio_manager, characters) else: logger.info(f"Processing {len(urls)} links") content_extractor = ContentExtractor() @@ -118,13 +122,10 @@ def process_content_v2( llm_contents.extend( [Content(value=image_path, type="image_path") for image_path in image_paths] ) - - - podcast = Podcast( content=llm_contents, llm_backend=content_generator, - tts_backends=tts_backends, + audio_manager=audio_manager, characters=characters, ) @@ -152,9 +153,9 @@ def process_content_v2( raise -def obtain_tts_backend(config, tts_model): +def obtain_tts_backend(config, tts_model) -> Dict[str, TTSBackend]: # temporary solution tts_backends = create_tts_backends(config) # filter out the tts backends that are not in the tts_model, temporary solution - tts_backends = [tts for tts in tts_backends if tts.name == tts_model] + tts_backends = {tts.name: tts for tts in tts_backends if tts.name == tts_model} return tts_backends diff --git a/podcastfy/core/audio.py b/podcastfy/core/audio.py index 663f9d59..ab6fab77 100644 --- a/podcastfy/core/audio.py +++ b/podcastfy/core/audio.py @@ -1,11 +1,13 @@ import asyncio +import atexit from concurrent.futures import ThreadPoolExecutor from pathlib import Path +from tempfile import TemporaryDirectory from typing import Optional, Dict, Union, List, cast, Tuple from pydub import AudioSegment -from podcastfy.core.podcast import SyncTTSBackend, AsyncTTSBackend +from podcastfy.aiengines.tts.base import 
TTSBackend, SyncTTSBackend, AsyncTTSBackend from podcastfy.core.transcript import TranscriptSegment, Transcript @@ -26,20 +28,25 @@ def audio(self) -> AudioSegment: class AudioManager: - def __init__(self, tts_backends: Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]], n_jobs: int = 4, file_prefix: str = "") -> None: + def __init__(self, tts_backends: Dict[str, TTSBackend], audio_format, n_jobs: int = 4, file_prefix: str = "", audio_temp_dir: str = None) -> None: + self.audio_format = audio_format self.tts_backends = tts_backends self.n_jobs = n_jobs self.has_async_backend = any(isinstance(backend, AsyncTTSBackend) for backend in self.tts_backends.values()) self.file_prefix = file_prefix - self.audio_segments = [] self.final_audio: Optional[AudioSegment] = None - self.temp_dir: Optional[Union[str, Path]] = None + if audio_temp_dir: + self.temp_dir = Path(audio_temp_dir) + else: + self._temp_dir = TemporaryDirectory() + self.temp_dir = Path(self._temp_dir.name) + atexit.register(self._temp_dir.cleanup) async def _async_build_audio_segments(self, transcript: Transcript) -> List[PodcastsAudioSegment]: async def process_segment(segment_tuple: Tuple[TranscriptSegment, int]): segment, index = segment_tuple - tts_backend = self.get_tts_backend(segment) - audio_path = Path(self.temp_dir) / f"{self.file_prefix}{index:04d}.mp3" + tts_backend = self._get_tts_backend(segment) + audio_path = Path(self.temp_dir) / f"{self.file_prefix}{index:04d}.{self.audio_format}" if isinstance(tts_backend, AsyncTTSBackend): await tts_backend.async_text_to_speech( segment.text, @@ -63,7 +70,7 @@ async def bounded_process_segment(segment_tuple): tasks = [asyncio.create_task(bounded_process_segment((segment, i))) for i, segment in enumerate(transcript.segments)] return list(await asyncio.gather(*tasks)) - def get_tts_backend(self, segment): + def _get_tts_backend(self, segment): tts_backend = self.tts_backends.get(segment.speaker.preferred_tts) if tts_backend is None: # Take the first available 
TTS backend @@ -73,8 +80,8 @@ def get_tts_backend(self, segment): def _sync_build_audio_segments(self, transcript: Transcript) -> List[PodcastsAudioSegment]: def process_segment(segment_tuple: Tuple[TranscriptSegment, int]): segment, index = segment_tuple - tts_backend = self.get_tts_backend(segment) - filepath = Path(str(self.temp_dir)) / f"{self.file_prefix}{index:04d}.mp3" + tts_backend = self._get_tts_backend(segment) + filepath = Path(str(self.temp_dir)) / f"{self.file_prefix}{index:04d}.{self.audio_format}" cast(SyncTTSBackend, tts_backend).text_to_speech( segment.text, segment.speaker, diff --git a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py index 7e660463..3a93f951 100644 --- a/podcastfy/core/podcast.py +++ b/podcastfy/core/podcast.py @@ -57,9 +57,8 @@ class Podcast: """Main class for podcast creation and management.""" def __init__(self, content: List[Content], llm_backend: LLMBackend, - tts_backends: List[TTSBackend], audio_temp_dir: Optional[Union[str, Path]] = None, - characters: Optional[List[Character]] = None, - default_tts_n_jobs: int = 1) -> None: + audio_manager: AudioManager, + characters: Optional[List[Character]] = None): """ Initialize a new Podcast instance. 
@@ -78,20 +77,10 @@ def __init__(self, content: List[Content], llm_backend: LLMBackend, """ self.content = content self.llm_backend = llm_backend - self.tts_backends: Dict[str, TTSBackend] = {backend.name: backend for backend in tts_backends} self.characters: Dict[str, Character] = {char.name: char for char in (characters or [Character("Host", "Podcast host", {}), Character("Guest", "Expert guest", {})])} - self.default_tts_n_jobs = default_tts_n_jobs self.state = PodcastState.INITIALIZED self._reworking = False - - if audio_temp_dir: - self.temp_dir = Path(audio_temp_dir) - else: - self._temp_dir = TemporaryDirectory() - self.temp_dir = Path(self._temp_dir.name) - atexit.register(self._temp_dir.cleanup) - self.audio_manager = AudioManager(self.tts_backends, self.default_tts_n_jobs) - self.audio_manager.temp_dir = self.temp_dir + self.audio_manager = audio_manager # Initialize attributes with null values self.transcript: Optional[Transcript] = None @@ -111,23 +100,20 @@ def __del__(self) -> None: @classmethod def from_transcript(cls, transcript: Union[Sequence[Tuple[str, str]], Transcript], - tts_backends: List[Union[SyncTTSBackend, AsyncTTSBackend]], - characters: List[Character], default_tts_n_jobs: int = 1) -> 'Podcast': + audio_manager: AudioManager, + characters: List[Character]) -> 'Podcast': """ Create a Podcast instance from a pre-existing transcript. Args: transcript (Union[Sequence[Tuple[str, str]], Transcript]): Pre-existing transcript. - tts_backends (Dict[str, Union[SyncTTSBackend, AsyncTTSBackend]]): Dictionary of available TTS backends. + audio_manager (AudioManager): The audio manager instance for creating audio segments. characters (List[Character]): List of characters participating in the podcast. - default_tts_n_jobs (int, optional): The default number of concurrent jobs for TTS processing. - Defaults to 1. - Returns: Podcast: A new Podcast instance with the transcript built and ready for audio generation. 
""" if isinstance(transcript, Transcript): - podcast = cls("", cast(LLMBackend, None), tts_backends, characters=characters, default_tts_n_jobs=default_tts_n_jobs) + podcast = cls("", cast(LLMBackend, None), audio_manager=audio_manager, characters=characters) podcast.transcript = transcript else: raise ValueError("Transcript must be a Transcript instance") # unimplemented diff --git a/tests/test_core_api.py b/tests/test_core_api.py index fba450d1..33cf4571 100644 --- a/tests/test_core_api.py +++ b/tests/test_core_api.py @@ -6,11 +6,10 @@ from podcastfy.core.content import Content from podcastfy.core.podcast import Podcast, PodcastState from podcastfy.aiengines.llm.base import LLMBackend -from podcastfy.aiengines.tts.base import SyncTTSBackend from podcastfy.core.character import Character from podcastfy.core.tts_configs import TTSConfig from podcastfy.core.transcript import TranscriptSegment, Transcript - +from podcastfy.core.audio import AudioManager class DummyLLMBackend(LLMBackend): def generate_transcript(self, content, characters): @@ -19,8 +18,7 @@ def generate_transcript(self, content, characters): (characters[1], "Thanks for having me!") ] - -class DummyTTSBackend(SyncTTSBackend): +class DummyTTSBackend: def __init__(self, name: str): self.name = name @@ -29,16 +27,15 @@ def text_to_speech(self, text: str, character: Character, output_path: Path) -> audio.export(str(output_path), format="mp3") return output_path - @pytest.fixture -def tts_backends(): - return [DummyTTSBackend("openai"), DummyTTSBackend("elevenlabs")] - +def audio_manager(tmp_path): + tts_backends = {"openai": DummyTTSBackend("openai"), "elevenlabs": DummyTTSBackend("elevenlabs")} + return AudioManager(tts_backends, audio_format="mp3", audio_temp_dir=tmp_path, n_jobs=1) @pytest.fixture def characters(): host = Character( - name="Host", + name="Person1", role="Podcast host", tts_configs={ "openai": TTSConfig(voice="en-US-Neural2-F", backend="openai", extra_args={"speaking_rate": 1.0}), @@ 
-48,7 +45,7 @@ def characters(): ) guest = Character( - name="Guest", + name="Person2", role="Expert guest", tts_configs={ "openai": TTSConfig(voice="en-US-Neural2-D", backend="openai", extra_args={"pitch": -2.0}), @@ -59,38 +56,32 @@ def characters(): return [host, guest] - @pytest.fixture -def podcast(tts_backends, characters): +def podcast(audio_manager, characters): return Podcast( content=[Content(value="This is a sample content for our podcast.", type="text")], llm_backend=DummyLLMBackend(), - tts_backends=tts_backends, + audio_manager=audio_manager, characters=characters, ) - def test_podcast_initialization(podcast): assert podcast.state == PodcastState.INITIALIZED assert podcast.transcript is None - assert podcast.audio_segments == [] assert podcast.audio is None - def test_build_transcript(podcast): podcast.build_transcript() assert podcast.state == PodcastState.TRANSCRIPT_BUILT assert isinstance(podcast.transcript, Transcript) assert len(podcast.transcript.segments) == 2 - def test_build_audio_segments(podcast): podcast.build_transcript() podcast.build_audio_segments() assert podcast.state == PodcastState.AUDIO_SEGMENTS_BUILT assert len(podcast.audio_segments) == 2 - def test_stitch_audio_segments(podcast): podcast.build_transcript() podcast.build_audio_segments() @@ -98,7 +89,6 @@ def test_stitch_audio_segments(podcast): assert podcast.state == PodcastState.STITCHED assert isinstance(podcast.audio, AudioSegment) - def test_finalize(podcast): podcast.finalize() assert podcast.state == PodcastState.STITCHED @@ -106,45 +96,41 @@ def test_finalize(podcast): assert len(podcast.audio_segments) > 0 assert isinstance(podcast.audio, AudioSegment) - def test_save(podcast, tmp_path): podcast.finalize() output_file = tmp_path / "test_podcast.mp3" podcast.save(str(output_file)) assert output_file.exists() - def test_export_transcript(podcast, tmp_path): podcast.finalize() output_file = tmp_path / "test_transcript.txt" podcast.export_transcript(str(output_file), 
format_="plaintext") assert output_file.exists() - def test_rework(podcast): podcast.finalize() with podcast.rework(PodcastState.TRANSCRIPT_BUILT): assert podcast.state == PodcastState.TRANSCRIPT_BUILT podcast.transcript.segments.append( - TranscriptSegment("This is a new segment", podcast.characters["Host"])) + TranscriptSegment("This is a new segment", podcast.characters["Person1"])) assert podcast.state == PodcastState.STITCHED assert len(podcast.transcript.segments) == 3 - -def test_from_transcript(tts_backends, characters): +def test_from_transcript(audio_manager, characters): pre_existing_transcript = [ - ("Host", "Welcome to our podcast created from a pre-existing transcript!"), - ("Guest", "Thank you for having me. I'm excited to be here.") + ("Person1", "Welcome to our podcast created from a pre-existing transcript!"), + ("Person2", "Thank you for having me. I'm excited to be here.") ] podcast = Podcast.from_transcript( transcript=Transcript([ - TranscriptSegment(text, characters[0] if speaker == "Host" else characters[1]) + TranscriptSegment(text, characters[0] if speaker == "Person1" else characters[1]) for speaker, text in pre_existing_transcript ]), - tts_backends=tts_backends, + audio_manager=audio_manager, characters=characters ) @@ -154,8 +140,7 @@ def test_from_transcript(tts_backends, characters): podcast.finalize() assert podcast.state == PodcastState.STITCHED - -def test_load_transcript(tts_backends, characters, tmp_path): +def test_load_transcript(audio_manager, characters, tmp_path): # Create a dummy transcript file transcript_file = tmp_path / "test_transcript.json" Transcript([ @@ -163,6 +148,6 @@ def test_load_transcript(tts_backends, characters, tmp_path): TranscriptSegment("Thank you for having me!", characters[1]) ]).dump(str(transcript_file)) - podcast = Podcast.load_transcript(str(transcript_file), tts_backends, characters) + podcast = Podcast.load_transcript(str(transcript_file), audio_manager, characters) assert podcast.state == 
PodcastState.TRANSCRIPT_BUILT assert len(podcast.transcript.segments) == 2 \ No newline at end of file From 61c42af6ad913eb7218c911e0412f6d61548d9c8 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 00:32:00 +0200 Subject: [PATCH 40/49] remove temp file --- must_do_before_merge.txt | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 must_do_before_merge.txt diff --git a/must_do_before_merge.txt b/must_do_before_merge.txt deleted file mode 100644 index b2526da1..00000000 --- a/must_do_before_merge.txt +++ /dev/null @@ -1,8 +0,0 @@ -- one test or two on the Podcast Class [x] -- delete client_v2 and merge it with client [] will be done during PR -- check that all config options are taken [x] -- remove the excessive prints [x] -- check that all tts work [x] -- ... ? -- 100% of current pytest unit tests pass [x] (except for test_generate_podcast_with_custom_config, exhausted credits) -- 100% of of CLI case scenarios from usage/cli.md [x] except local From 17c14720d969147f3fc7d87e6db0ef3841166d82 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 01:55:27 +0200 Subject: [PATCH 41/49] rework audio tests and add pytest-asyncio in the dependencies --- pyproject.toml | 2 + requirements.txt | 1 + tests/test_audio.py | 98 +++++++++++++++++++++++---------------------- 3 files changed, 53 insertions(+), 48 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9fb07aa4..4758f2eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,10 +44,12 @@ types-pyyaml = "^6.0.12.20240917" nest-asyncio = "^1.6.0" ffmpeg = "^1.4" pytest = "^8.3.3" +pytest-asyncio = "^0.24.0" [tool.poetry.group.dev.dependencies] pytest = "^8.3.3" +pytest-asyncio = "^0.24.0" black = "^24.8.0" sphinx = ">=8.0.2" nbsphinx = "0.9.5" diff --git a/requirements.txt b/requirements.txt index e24bccf3..645987c7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -108,6 +108,7 @@ pygments==2.18.0 ; python_version >= "3.11" and python_version < "4.0" pymupdf==1.24.11 ; 
python_version >= "3.11" and python_version < "4.0" pyparsing==3.2.0 ; python_version >= "3.11" and python_version < "4.0" pytest==8.3.3 ; python_version >= "3.11" and python_version < "4.0" +pytest-asyncio==0.24.0 ; python_version >= "3.11" and python_version < "4.0" python-dateutil==2.9.0.post0 ; python_version >= "3.11" and python_version < "4.0" python-dotenv==1.0.1 ; python_version >= "3.11" and python_version < "4.0" python-levenshtein==0.26.0 ; python_version >= "3.11" and python_version < "4.0" diff --git a/tests/test_audio.py b/tests/test_audio.py index 9e72d044..77fe5047 100644 --- a/tests/test_audio.py +++ b/tests/test_audio.py @@ -1,50 +1,52 @@ -import unittest import pytest import os -from podcastfy.text_to_speech import TextToSpeech - - -class TestAudio(unittest.TestCase): - def setUp(self): - self.test_text = "Hello, how are you?I'm doing great, thanks for asking!" - self.output_dir = "tests/data/audio" - os.makedirs(self.output_dir, exist_ok=True) - - @pytest.mark.skip(reason="Testing edge only on Github Action as it's free") - def test_text_to_speech_openai(self): - tts = TextToSpeech(model="openai") - output_file = os.path.join(self.output_dir, "test_openai.mp3") - tts.convert_to_speech(self.test_text, output_file) - - self.assertTrue(os.path.exists(output_file)) - self.assertGreater(os.path.getsize(output_file), 0) - - # Clean up - os.remove(output_file) - - @pytest.mark.skip(reason="Testing edge only on Github Action as it's free") - def test_text_to_speech_elevenlabs(self): - tts = TextToSpeech(model="elevenlabs") - output_file = os.path.join(self.output_dir, "test_elevenlabs.mp3") - tts.convert_to_speech(self.test_text, output_file) - - self.assertTrue(os.path.exists(output_file)) - self.assertGreater(os.path.getsize(output_file), 0) - - # Clean up - os.remove(output_file) - - def test_text_to_speech_edge(self): - tts = TextToSpeech(model="edge") - output_file = os.path.join(self.output_dir, "test_edge.mp3") - 
tts.convert_to_speech(self.test_text, output_file) - - self.assertTrue(os.path.exists(output_file)) - self.assertGreater(os.path.getsize(output_file), 0) - - # Clean up - os.remove(output_file) - - -if __name__ == "__main__": - unittest.main() +from pathlib import Path +from podcastfy.core.character import Character +from podcastfy.aiengines.tts.tts_backends import ElevenLabsTTS, OpenAITTS, EdgeTTS + +@pytest.fixture +def test_setup(): + test_text = "Hello, how are you?I'm doing great, thanks for asking!" + output_dir = Path("tests/data/audio") + output_dir.mkdir(parents=True, exist_ok=True) + dummy_character = Character("test_character", "host", {}, "A test character") + return test_text, output_dir, dummy_character + +@pytest.mark.skip(reason="Testing Eleven Labs only on Github Action as it requires API key") +def test_text_to_speech_elevenlabs(test_setup): + test_text, output_dir, dummy_character = test_setup + tts = ElevenLabsTTS() + output_file = output_dir / "test_elevenlabs.mp3" + tts.text_to_speech(test_text, dummy_character, output_file) + + assert output_file.exists() + assert output_file.stat().st_size > 0 + + # Clean up + output_file.unlink() + +@pytest.mark.skip(reason="Testing OpenAI only on Github Action as it requires API key") +def test_text_to_speech_openai(test_setup): + test_text, output_dir, dummy_character = test_setup + tts = OpenAITTS() + output_file = output_dir / "test_openai.mp3" + tts.text_to_speech(test_text, dummy_character, output_file) + + assert output_file.exists() + assert output_file.stat().st_size > 0 + + # Clean up + output_file.unlink() + +@pytest.mark.asyncio +async def test_text_to_speech_edge(test_setup): + test_text, output_dir, dummy_character = test_setup + tts = EdgeTTS() + output_file = output_dir / "test_edge.mp3" + await tts.async_text_to_speech(test_text, dummy_character, output_file) + + assert output_file.exists() + assert output_file.stat().st_size > 0 + + # Clean up + output_file.unlink() \ No newline at end of 
file From a2f9c1e41e84b0cbac229f7b034f49d2e3d53dec Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 07:28:47 +0200 Subject: [PATCH 42/49] clean unused module, merge back into client.py --- podcastfy/client.py | 241 ++++++++++++++---------- podcastfy/client_v2.py | 161 ---------------- podcastfy/text_to_speech.py | 353 ------------------------------------ 3 files changed, 146 insertions(+), 609 deletions(-) delete mode 100644 podcastfy/client_v2.py delete mode 100644 podcastfy/text_to_speech.py diff --git a/podcastfy/client.py b/podcastfy/client.py index 86917a43..d6a4a467 100644 --- a/podcastfy/client.py +++ b/podcastfy/client.py @@ -5,120 +5,171 @@ from URLs or existing transcript files. It orchestrates the content extraction, generation, and text-to-speech conversion processes. """ +import copy import os import uuid import typer import yaml + +from podcastfy.aiengines.llm.gemini_langchain import DefaultPodcastifyTranscriptEngine +from podcastfy.aiengines.tts.base import TTSBackend +from podcastfy.aiengines.tts.tts_backends import OpenAITTS, ElevenLabsTTS, EdgeTTS +from podcastfy.core.audio import AudioManager +from podcastfy.core.character import Character +from podcastfy.core.content import Content +from podcastfy.core.podcast import Podcast +from podcastfy.core.transcript import Transcript from podcastfy.content_parser.content_extractor import ContentExtractor -from podcastfy.content_generator import ContentGenerator -from podcastfy.text_to_speech import TextToSpeech +from podcastfy.core.tts_configs import TTSConfig from podcastfy.utils.config import Config, load_config from podcastfy.utils.config_conversation import ( - ConversationConfig, load_conversation_config, ) from podcastfy.utils.logger import setup_logger from typing import List, Optional, Dict, Any -import copy from podcastfy.client_v2 import process_content_v2 as process_content - logger = setup_logger(__name__) app = typer.Typer() +def create_characters(config: Dict[str, Any]) -> 
List[Character]: + # in the future, we should load this from the config file + host = Character( + name="Person1", + role="Podcast host", + tts_configs={ + "openai": TTSConfig( + voice=config["text_to_speech"]["openai"]["default_voices"]["question"], + backend="openai", + ), + "elevenlabs": TTSConfig( + voice=config["text_to_speech"]["elevenlabs"]["default_voices"][ + "question" + ], + backend="elevenlabs", + ), + }, + default_description_for_llm="{name} is an enthusiastic podcast host. Speaks clearly and engagingly.", + ) + + guest = Character( + name="Person2", + role="Expert guest", + tts_configs={ + "openai": TTSConfig( + voice=config["text_to_speech"]["openai"]["default_voices"]["answer"], + backend="openai", + ), + "elevenlabs": TTSConfig( + voice=config["text_to_speech"]["elevenlabs"]["default_voices"][ + "answer" + ], + backend="elevenlabs", + ), + }, + default_description_for_llm="{name} is an expert guest. Shares knowledge in a friendly manner.", + ) + + return [host, guest] + + +def create_tts_backends(config: Config) -> List[TTSBackend]: + return [ + OpenAITTS(api_key=config.OPENAI_API_KEY), + ElevenLabsTTS(api_key=config.ELEVENLABS_API_KEY), + EdgeTTS(), + ] + + + +def process_content( + urls: Optional[List[str]] = None, + transcript_file: Optional[str] = None, + tts_model: str = "openai", # to be fixed, in case of characters, it should be a list of models + generate_audio: bool = True, + config: Optional[Config] = None, + conversation_config: Optional[Dict[str, Any]] = None, + image_paths: Optional[List[str]] = None, + is_local: bool = False, +) -> str: + try: + if config is None: + config = load_config() + if urls is None: + urls = [] + if config is None: + config = load_config() + # Load default conversation config + conv_config = load_conversation_config() + + # Update with provided config if any + if conversation_config: + conv_config.configure(conversation_config) + characters = create_characters(conv_config.config_conversation) + tts_backends = 
obtain_tts_backend(config, tts_model) + audio_format = conv_config.config_conversation.get('text_to_speech')['audio_format'] + temp_dir = conv_config.config_conversation.get('text_to_speech').get('temp_audio_dir') + audio_manager = AudioManager(tts_backends, audio_format=audio_format, audio_temp_dir=temp_dir, n_jobs=4) + if transcript_file: + logger.info(f"Using transcript file: {transcript_file}") + transcript = Transcript.load( + transcript_file, {char.name: char for char in characters} + ) + podcast = Podcast.from_transcript(transcript, audio_manager, characters) + else: + logger.info(f"Processing {len(urls)} links") + content_extractor = ContentExtractor() + content_generator = DefaultPodcastifyTranscriptEngine( + config.GEMINI_API_KEY, conversation_config, is_local=is_local + ) + + contents = [content_extractor.extract_content(url) for url in urls] + llm_contents = [] + if contents: + llm_contents.append(Content(value="\n\n".join(contents), type="text")) + if image_paths: + llm_contents.extend( + [Content(value=image_path, type="image_path") for image_path in image_paths] + ) + podcast = Podcast( + content=llm_contents, + llm_backend=content_generator, + audio_manager=audio_manager, + characters=characters, + ) + + directories = config.get("output_directories") + random_filename_no_suffix = f"podcast_{uuid.uuid4().hex}" + random_filename_mp3 = f"{random_filename_no_suffix}.mp3" + random_filename_transcript = f"{random_filename_no_suffix}.txt" + if generate_audio: + podcast.finalize() + + # for the sake of the tests currently in place, but in the future, we should remove this and return the podcast object + audio_file = os.path.join( + directories["audio"], random_filename_mp3 + ) + podcast.transcript.export(os.path.join(directories["transcripts"], random_filename_transcript)) + podcast.save(filepath=audio_file) + return audio_file # note: should return the podcast object instead, but for the sake of the tests, we return the audio file + else: + 
podcast.build_transcript() + podcast.transcript.export(os.path.join(directories["transcripts"], random_filename_transcript)) + + return None # note: should return the podcast object instead, but for the sake of the tests, we return None + except Exception as e: + logger.error(f"An error occurred in the process_content function: {str(e)}") + raise + -# def process_content( -# urls=None, -# transcript_file=None, -# tts_model="openai", -# generate_audio=True, -# config=None, -# conversation_config: Optional[Dict[str, Any]] = None, -# image_paths: Optional[List[str]] = None, -# is_local: bool = False, -# ): -# """ -# Process URLs, a transcript file, or image paths to generate a podcast or transcript. -# -# Args: -# urls (Optional[List[str]]): A list of URLs to process. -# transcript_file (Optional[str]): Path to a transcript file. -# tts_model (str): The TTS model to use ('openai', 'elevenlabs' or 'edge'). Defaults to 'openai'. -# generate_audio (bool): Whether to generate audio or just a transcript. Defaults to True. -# config (Config): Configuration object to use. If None, default config will be loaded. -# conversation_config (Optional[Dict[str, Any]]): Custom conversation configuration. -# image_paths (Optional[List[str]]): List of image file paths to process. -# is_local (bool): Whether to use a local LLM. Defaults to False. -# -# Returns: -# Optional[str]: Path to the final podcast audio file, or None if only generating a transcript. 
-# """ -# try: -# if config is None: -# config = load_config() -# -# # Load default conversation config -# conv_config = load_conversation_config() -# -# # Update with provided config if any -# if conversation_config: -# conv_config.configure(conversation_config) -# -# if transcript_file: -# logger.info(f"Using transcript file: {transcript_file}") -# with open(transcript_file, "r") as file: -# qa_content = file.read() -# else: -# content_generator = ContentGenerator( -# api_key=config.GEMINI_API_KEY, conversation_config=conv_config.to_dict() -# ) -# -# if urls: -# logger.info(f"Processing {len(urls)} links") -# content_extractor = ContentExtractor() -# # Extract content from links -# contents = [content_extractor.extract_content(link) for link in urls] -# # Combine all extracted content -# combined_content = "\n\n".join(contents) -# else: -# combined_content = "" # Empty string if no URLs provided -# -# # Generate Q&A content -# random_filename = f"transcript_{uuid.uuid4().hex}.txt" -# transcript_filepath = os.path.join( -# config.get("output_directories")["transcripts"], random_filename -# ) -# qa_content = content_generator.generate_qa_content( -# combined_content, -# image_file_paths=image_paths or [], -# output_filepath=transcript_filepath, -# is_local=is_local, -# ) -# -# if generate_audio: -# api_key = None -# # edge does not require an API key -# if tts_model != "edge": -# api_key = getattr(config, f"{tts_model.upper()}_API_KEY") -# -# text_to_speech = TextToSpeech(model=tts_model, api_key=api_key) -# # Convert text to speech using the specified model -# random_filename = f"podcast_{uuid.uuid4().hex}.mp3" -# audio_file = os.path.join( -# config.get("output_directories")["audio"], random_filename -# ) -# text_to_speech.convert_to_speech(qa_content, audio_file) -# logger.info(f"Podcast generated successfully using {tts_model} TTS model") -# return audio_file -# else: -# logger.info(f"Transcript generated successfully") -# return None -# -# except Exception as 
def obtain_tts_backend(config, tts_model) -> Dict[str, TTSBackend]:
    """Return the configured TTS backends keyed by name, restricted to *tts_model*.

    Temporary solution: every backend is instantiated via ``create_tts_backends``
    and only the one whose ``name`` equals ``tts_model`` is kept.
    """
    selected: Dict[str, TTSBackend] = {}
    # Keep only the backend the caller asked for (temporary filtering approach).
    for backend in create_tts_backends(config):
        if backend.name == tts_model:
            selected[backend.name] = backend
    return selected
def create_tts_backends(config: Config) -> List[TTSBackend]:
    """Instantiate every supported TTS backend (OpenAI, ElevenLabs, Edge) from *config*."""
    return [
        OpenAITTS(api_key=config.OPENAI_API_KEY),
        ElevenLabsTTS(api_key=config.ELEVENLABS_API_KEY),
        EdgeTTS(),
    ]


def process_content_v2(
    urls: Optional[List[str]] = None,
    transcript_file: Optional[str] = None,
    tts_model: str = "openai",  # to be fixed: with per-character voices this should be a list of models
    generate_audio: bool = True,
    config: Optional[Config] = None,
    conversation_config: Optional[Dict[str, Any]] = None,
    image_paths: Optional[List[str]] = None,
    is_local: bool = False,
) -> Optional[str]:
    """Generate a podcast from URLs/images or from an existing transcript file.

    Args:
        urls: Links to extract content from; treated as empty when omitted.
        transcript_file: Path to a pre-existing transcript; when given, content
            extraction and LLM generation are skipped entirely.
        tts_model: Name of the TTS backend to use ('openai', 'elevenlabs', 'edge').
        generate_audio: When False, only the transcript is produced.
        config: Application config; loaded from disk when omitted.
        conversation_config: Overrides merged into the default conversation config.
        image_paths: Image files to feed to the multimodal LLM.
        is_local: Use a local LLM instead of the remote Gemini backend.

    Returns:
        Path of the generated audio file when ``generate_audio`` is True,
        otherwise ``None``.  (Fix: the original annotation claimed
        ``Tuple[Optional[str], Podcast]``, which never matched the actual
        return values.)

    Raises:
        Exception: Any error is logged and re-raised unchanged.
    """
    try:
        # Fix: the original performed this None-check twice in a row.
        if config is None:
            config = load_config()
        if urls is None:
            urls = []

        # Load default conversation config and overlay user-provided overrides.
        conv_config = load_conversation_config()
        if conversation_config:
            conv_config.configure(conversation_config)

        characters = create_characters(conv_config.config_conversation)
        tts_backends = obtain_tts_backend(config, tts_model)
        tts_settings = conv_config.config_conversation.get("text_to_speech")
        audio_manager = AudioManager(
            tts_backends,
            audio_format=tts_settings["audio_format"],
            audio_temp_dir=tts_settings.get("temp_audio_dir"),
            n_jobs=4,
        )

        if transcript_file:
            logger.info(f"Using transcript file: {transcript_file}")
            transcript = Transcript.load(
                transcript_file, {char.name: char for char in characters}
            )
            podcast = Podcast.from_transcript(transcript, audio_manager, characters)
        else:
            logger.info(f"Processing {len(urls)} links")
            content_extractor = ContentExtractor()
            # NOTE(review): the raw ``conversation_config`` dict is passed here,
            # not the merged ``conv_config`` — confirm this is intentional.
            content_generator = DefaultPodcastifyTranscriptEngine(
                config.GEMINI_API_KEY, conversation_config, is_local=is_local
            )

            contents = [content_extractor.extract_content(url) for url in urls]
            llm_contents = []
            if contents:
                llm_contents.append(Content(value="\n\n".join(contents), type="text"))
            if image_paths:
                llm_contents.extend(
                    Content(value=image_path, type="image_path")
                    for image_path in image_paths
                )
            podcast = Podcast(
                content=llm_contents,
                llm_backend=content_generator,
                audio_manager=audio_manager,
                characters=characters,
            )

        directories = config.get("output_directories")
        base_name = f"podcast_{uuid.uuid4().hex}"
        transcript_path = os.path.join(directories["transcripts"], f"{base_name}.txt")

        if generate_audio:
            podcast.finalize()
            # For the sake of the current tests we return the audio path; in the
            # future this should return the Podcast object instead.
            audio_file = os.path.join(directories["audio"], f"{base_name}.mp3")
            podcast.transcript.export(transcript_path)
            podcast.save(filepath=audio_file)
            return audio_file

        podcast.build_transcript()
        podcast.transcript.export(transcript_path)
        return None  # should eventually return the Podcast object as well
    except Exception as e:
        logger.error(f"An error occurred in the process_content function: {str(e)}")
        raise
class TextToSpeech:
    """Convert a Person1/Person2 transcript into speech via ElevenLabs, OpenAI or Edge TTS."""

    def __init__(self, model: str = 'openai', api_key: Optional[str] = None):
        """
        Initialize the TextToSpeech class.

        Args:
            model (str): The model to use for text-to-speech conversion.
                Options are 'elevenlabs', 'openai' or 'edge'. Defaults to 'openai'.
            api_key (Optional[str]): API key for the selected text-to-speech service.
                If not provided, it will be loaded from the config.
        """
        self.model = model.lower()
        self.config = load_config()
        self.conversation_config = load_conversation_config()
        # Sub-config with per-backend voices plus audio_format / temp dir / ending message.
        self.tts_config = self.conversation_config.get('text_to_speech')

        if self.model == 'elevenlabs':
            self.api_key = api_key or self.config.ELEVENLABS_API_KEY
            self.client = elevenlabs_client.ElevenLabs(api_key=self.api_key)
        elif self.model == 'openai':
            self.api_key = api_key or self.config.OPENAI_API_KEY
            # Sets the module-level key: affects every openai call in this process.
            openai.api_key = self.api_key
        elif self.model == 'edge':
            pass  # Edge TTS requires no API key
        else:
            raise ValueError("Invalid model. Choose 'elevenlabs', 'openai' or 'edge'.")

        self.audio_format = self.tts_config['audio_format']
        self.temp_audio_dir = self.tts_config['temp_audio_dir']
        self.ending_message = self.tts_config['ending_message']

        # Create temp_audio_dir if it doesn't exist
        if not os.path.exists(self.temp_audio_dir):
            os.makedirs(self.temp_audio_dir)

    def __merge_audio_files(self, input_dir: str, output_file: str) -> None:
        """
        Merge all audio files in the input directory sequentially and save the result.

        Args:
            input_dir (str): Path to the directory containing audio files.
            output_file (str): Path to save the merged audio file.
        """
        try:
            # Function to sort filenames naturally ("2" before "10").
            def natural_sort_key(filename: str) -> List[Union[int, str]]:
                return [int(text) if text.isdigit() else text for text in re.split(r'(\d+)', filename)]

            combined = AudioSegment.empty()
            audio_files = sorted(
                [f for f in os.listdir(input_dir) if f.endswith(f".{self.audio_format}")],
                key=natural_sort_key
            )
            for file in audio_files:
                # NOTE(review): this endswith() check is redundant — the list
                # comprehension above already filtered by suffix.
                if file.endswith(f".{self.audio_format}"):
                    file_path = os.path.join(input_dir, file)
                    combined += AudioSegment.from_file(file_path, format=self.audio_format)

            combined.export(output_file, format=self.audio_format)
            logger.info(f"Merged audio saved to {output_file}")
        except Exception as e:
            logger.error(f"Error merging audio files: {str(e)}")
            raise

    def convert_to_speech(self, text: str, output_file: str) -> None:
        """
        Convert input text to speech and save as an audio file.

        Args:
            text (str): Input text to convert to speech.
            output_file (str): Path to save the output audio file.

        Raises:
            Exception: If there's an error in converting text to speech.
        """
        # Clean TSS markup tags from the input text
        cleaned_text = self.clean_tss_markup(text)

        # Dispatch to the backend selected at construction time.
        if self.model == 'elevenlabs':
            self.__convert_to_speech_elevenlabs(cleaned_text, output_file)
        elif self.model == 'openai':
            self.__convert_to_speech_openai(cleaned_text, output_file)
        elif self.model == 'edge':
            self.__convert_to_speech_edge(cleaned_text, output_file)

    def __convert_to_speech_elevenlabs(self, text: str, output_file: str) -> None:
        """Render each Q/A pair with ElevenLabs, then merge chunks into *output_file*."""
        try:
            qa_pairs = self.split_qa(text)
            audio_files = []
            counter = 0
            for question, answer in qa_pairs:
                question_audio = self.client.generate(
                    text=question,
                    voice=self.tts_config['elevenlabs']['default_voices']['question'],
                    model=self.tts_config['elevenlabs']['model']
                )
                answer_audio = self.client.generate(
                    text=answer,
                    voice=self.tts_config['elevenlabs']['default_voices']['answer'],
                    model=self.tts_config['elevenlabs']['model']
                )

                # Save question and answer audio chunks
                for audio in [question_audio, answer_audio]:
                    counter += 1
                    # NOTE(review): assumes temp_audio_dir ends with a path
                    # separator — verify the configured value, or use os.path.join.
                    file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}"
                    with open(file_name, "wb") as out:
                        for chunk in audio:
                            if chunk:
                                out.write(chunk)
                    audio_files.append(file_name)

            # Merge all audio files and save the result
            self.__merge_audio_files(self.temp_audio_dir, output_file)

            # Clean up individual audio files
            for file in audio_files:
                os.remove(file)

            logger.info(f"Audio saved to {output_file}")

        except Exception as e:
            logger.error(f"Error converting text to speech with ElevenLabs: {str(e)}")
            raise

    def __convert_to_speech_openai(self, text: str, output_file: str) -> None:
        """Render each Q/A pair with OpenAI TTS, then merge chunks into *output_file*."""
        try:
            qa_pairs = self.split_qa(text)
            print(qa_pairs)  # NOTE(review): debug leftover — consider removing
            audio_files = []
            counter = 0
            for question, answer in qa_pairs:
                for speaker, content in [
                    (self.tts_config['openai']['default_voices']['question'], question),
                    (self.tts_config['openai']['default_voices']['answer'], answer)
                ]:
                    counter += 1
                    # NOTE(review): assumes temp_audio_dir ends with a path separator.
                    file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}"
                    response = openai.audio.speech.create(
                        model=self.tts_config['openai']['model'],
                        voice=speaker,
                        input=content
                    )
                    with open(file_name, "wb") as file:
                        file.write(response.content)

                    audio_files.append(file_name)

            # Merge all audio files and save the result
            self.__merge_audio_files(self.temp_audio_dir, output_file)

            # Clean up individual audio files
            for file in audio_files:
                os.remove(file)

            logger.info(f"Audio saved to {output_file}")

        except Exception as e:
            logger.error(f"Error converting text to speech with OpenAI: {str(e)}")
            raise

    # NOTE(review): this function takes no `self` and the three statements after
    # it execute while the class body itself is being evaluated (i.e. once, at
    # import time). It reads like module-level setup that was pasted into the
    # class body — confirm the placement is intentional before relying on it.
    def get_or_create_eventloop():
        try:
            return asyncio.get_event_loop()
        except RuntimeError as ex:
            if "There is no current event loop in thread" in str(ex):
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)
                return asyncio.get_event_loop()

    import nest_asyncio  # type: ignore
    get_or_create_eventloop()
    nest_asyncio.apply()

    def __convert_to_speech_edge(self, text: str, output_file: str) -> None:
        """
        Convert text to speech using Edge TTS.

        Args:
            text (str): The input text to convert to speech.
            output_file (str): The path to save the output audio file.
        """
        try:
            qa_pairs = self.split_qa(text)
            audio_files = []
            counter = 0

            async def edge_tts_conversion(text_chunk: str, output_path: str, voice: str):
                tts = edge_tts.Communicate(text_chunk, voice)
                await tts.save(output_path)
                return

            async def process_qa_pairs(qa_pairs):
                nonlocal counter
                tasks = []
                for question, answer in qa_pairs:
                    for speaker, content in [
                        (self.tts_config['edge']['default_voices']['question'], question),
                        (self.tts_config['edge']['default_voices']['answer'], answer)
                    ]:
                        counter += 1
                        file_name = f"{self.temp_audio_dir}{counter}.{self.audio_format}"
                        tasks.append(asyncio.ensure_future(edge_tts_conversion(content, file_name, speaker)))
                        audio_files.append(file_name)

                # All conversions run concurrently; file names keep ordering.
                await asyncio.gather(*tasks)

            asyncio.run(process_qa_pairs(qa_pairs))

            # Merge all audio files
            self.__merge_audio_files(self.temp_audio_dir, output_file)

            # Clean up individual audio files
            for file in audio_files:
                os.remove(file)
            logger.info(f"Audio saved to {output_file}")

        except Exception as e:
            logger.error(f"Error converting text to speech with Edge: {str(e)}")
            raise

    def split_qa(self, input_text: str) -> List[Tuple[str, str]]:
        """
        Split the input text into question-answer pairs.

        Args:
            input_text (str): The input text containing Person1 and Person2 dialogues.

        Returns:
            List[Tuple[str, str]]: A list of tuples containing (Person1, Person2) dialogues.
        """
        # Add ending message to the end of input_text
        input_text += f"{self.ending_message}"

        # Regular expression pattern to match Person1 and Person2 dialogues.
        # NOTE(review): this pattern appears garbled in this copy — it presumably
        # matched explicit speaker tags (e.g. <Person1>…</Person1>) that were
        # stripped during extraction; as written it cannot pair speakers. Verify
        # against the original source before reuse.
        pattern = r'(.*?)\s*(.*?)'

        # Find all matches in the input text
        matches = re.findall(pattern, input_text, re.DOTALL)

        # Process the matches to remove extra whitespace and newlines
        processed_matches = [
            (
                ' '.join(person1.split()).strip(),
                ' '.join(person2.split()).strip()
            )
            for person1, person2 in matches
        ]
        return processed_matches

    # to be done: Add support for additional tags dynamically given TTS model.
    # Right now it's the intersection of OpenAI/MS Edge and ElevenLabs supported tags.
    def clean_tss_markup(self, input_text: str, additional_tags: List[str] = ["Person1", "Person2"]) -> str:
        """
        Remove unsupported TSS markup tags from the input text while preserving supported SSML tags.

        Args:
            input_text (str): The input text containing TSS markup tags.
            additional_tags (List[str]): Optional list of additional tags to preserve. Defaults to ["Person1", "Person2"].
                NOTE(review): a mutable default argument — shared across calls; safe
                only because it is never mutated here.

        Returns:
            str: Cleaned text with unsupported TSS markup tags removed.
        """
        # List of SSML tags supported by both OpenAI and ElevenLabs
        supported_tags = [
            'speak', 'lang', 'p', 'phoneme',
            's', 'say-as', 'sub'
        ]

        # Append additional tags to the supported tags list
        supported_tags.extend(additional_tags)

        # Create a pattern that matches any tag not in the supported list.
        # NOTE(review): this literal also looks garbled (the tag-matching portion
        # before ']+>' was lost in extraction) — reconstruct from the original
        # file before reuse.
        pattern = r']+>'

        # Remove unsupported tags
        cleaned_text = re.sub(pattern, '', input_text)

        # Remove any leftover empty lines
        cleaned_text = re.sub(r'\n\s*\n', '\n', cleaned_text)

        # Ensure closing tags for additional tags are preserved.
        # NOTE(review): the replacement presumably re-appended a closing
        # </tag>; the closing-tag text seems lost in this copy — verify.
        for tag in additional_tags:
            cleaned_text = re.sub(f'<{tag}>(.*?)(?=<(?:{"|".join(additional_tags)})>|$)',
                                  f'<{tag}>\\1',
                                  cleaned_text,
                                  flags=re.DOTALL)
        # Remove '(scratchpad)' from cleaned_text
        cleaned_text = cleaned_text.replace('(scratchpad)', '')

        return cleaned_text.strip()
Output saved to {edge_output_file}") - - except Exception as e: - logger.error(f"An error occurred during text-to-speech conversion: {str(e)}") - raise - -if __name__ == "__main__": - main(seed=42) \ No newline at end of file From 83854a06a6c5df5332b2104f383f37647f90d1b1 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 07:36:01 +0200 Subject: [PATCH 43/49] fix inccorect merge --- podcastfy/client.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/podcastfy/client.py b/podcastfy/client.py index 8ca9c7c9..6a0e14b5 100644 --- a/podcastfy/client.py +++ b/podcastfy/client.py @@ -158,9 +158,7 @@ def process_content( podcast.build_transcript() podcast.transcript.export(os.path.join(directories["transcripts"], random_filename_transcript)) logger.info(f"Transcript generated successfully: {random_filename_transcript}") - return transcript_filepath - - return None # note: should return the podcast object instead, but for the sake of the tests, we return None + return random_filename_transcript except Exception as e: logger.error(f"An error occurred in the process_content function: {str(e)}") raise From d6679d2159f4ef752aacb5fe2088944f618278b7 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 07:43:27 +0200 Subject: [PATCH 44/49] fix incorrect merge --- podcastfy/client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/podcastfy/client.py b/podcastfy/client.py index 6a0e14b5..cda6bb52 100644 --- a/podcastfy/client.py +++ b/podcastfy/client.py @@ -28,7 +28,6 @@ ) from podcastfy.utils.logger import setup_logger from typing import List, Optional, Dict, Any -from podcastfy.client_v2 import process_content_v2 as process_content logger = setup_logger(__name__) From c6b78760c56f7bd0abde4f9cf82a201855ebd7d1 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 16:52:32 +0200 Subject: [PATCH 45/49] fix attempt --- podcastfy/core/audio.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/podcastfy/core/audio.py 
b/podcastfy/core/audio.py index ab6fab77..2591e5d3 100644 --- a/podcastfy/core/audio.py +++ b/podcastfy/core/audio.py @@ -1,5 +1,6 @@ import asyncio import atexit +import os from concurrent.futures import ThreadPoolExecutor from pathlib import Path from tempfile import TemporaryDirectory @@ -36,6 +37,7 @@ def __init__(self, tts_backends: Dict[str, TTSBackend], audio_format, n_jobs: in self.file_prefix = file_prefix self.final_audio: Optional[AudioSegment] = None if audio_temp_dir: + os.makedirs(audio_temp_dir, exist_ok=True) self.temp_dir = Path(audio_temp_dir) else: self._temp_dir = TemporaryDirectory() From 1640f32bf4141bc169c846bf6e5df397f13c57d3 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 17:06:15 +0200 Subject: [PATCH 46/49] correct filepaths --- podcastfy/client.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/podcastfy/client.py b/podcastfy/client.py index cda6bb52..48ff400a 100644 --- a/podcastfy/client.py +++ b/podcastfy/client.py @@ -143,6 +143,7 @@ def process_content( random_filename_no_suffix = f"podcast_{uuid.uuid4().hex}" random_filename_mp3 = f"{random_filename_no_suffix}.mp3" random_filename_transcript = f"{random_filename_no_suffix}.txt" + transcript_file_path = os.path.join(directories["transcripts"], random_filename_transcript) if generate_audio: podcast.finalize() @@ -150,14 +151,14 @@ def process_content( audio_file = os.path.join( directories["audio"], random_filename_mp3 ) - podcast.transcript.export(os.path.join(directories["transcripts"], random_filename_transcript)) + podcast.transcript.export(transcript_file_path) podcast.save(filepath=audio_file) return audio_file # note: should return the podcast object instead, but for the sake of the tests, we return the audio file else: podcast.build_transcript() - podcast.transcript.export(os.path.join(directories["transcripts"], random_filename_transcript)) + podcast.transcript.export(transcript_file_path) logger.info(f"Transcript generated successfully: 
class OldContentGenerator:
    # note: to be deleted but stays around few days for reference and troubleshooting
    def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None):
        """
        Initialize the ContentGenerator.

        Args:
            api_key (str): API key for Google's Generative AI.
            conversation_config (Optional[Dict[str, Any]]): Custom conversation configuration.
        """
        # Side effect: exports the key for the whole process, not just this object.
        os.environ["GOOGLE_API_KEY"] = api_key
        self.config = load_config()
        self.content_generator_config = self.config.get('content_generator', {})

        # Load default conversation config and update with custom config if provided

        self.config_conversation = load_conversation_config(conversation_config)

        self.llm = ChatGoogleGenerativeAI(
            model=self.content_generator_config.get('gemini_model', 'gemini-1.5-pro-latest'),
            temperature=self.config_conversation.get('creativity', 0),
            max_output_tokens=self.content_generator_config.get('max_output_tokens', 8192),
        )

        # pick podcastfy prompt from langchain hub
        # NOTE(review): the fallback hub id 'souzatharsis/podcastfy_' ends in an
        # underscore and looks truncated in this copy — verify against the hub.
        self.prompt_template = hub.pull(self.config.get('content_generator', {}).get('prompt_template', 'souzatharsis/podcastfy_'))
        self.ending_message = self.config.get('text_to_speech')['ending_message']

        self.parser = StrOutputParser()

        # LCEL pipeline: prompt -> Gemini -> plain-string output.
        self.chain = (self.prompt_template | self.llm | self.parser)

    def generate_qa_content(self, input_texts: str, output_filepath: Optional[str] = None, characters: Optional[List[Character]] = None) -> str:
        """
        Generate Q&A content based on input texts.

        Args:
            input_texts (str): Input texts to generate content from.
            output_filepath (Optional[str]): Filepath to save the response content. Defaults to None.
            characters (Optional[List[Character]]): Exactly two characters (host, guest)
                whose roles are interpolated into the prompt. Despite the Optional
                annotation, passing None raises (len() of None below).

        Returns:
            str: Formatted Q&A content.

        Raises:
            Exception: If there's an error in generating content.
        """
        assert len(characters) == 2, "The number of characters should be 2 for this implementation"
        try:


            prompt_params = {
                "input_text": input_texts,
                "word_count": self.config_conversation.get('word_count'),
                "conversation_style": ", ".join(self.config_conversation.get('conversation_style', [])),
                "roles_person1": characters[0].role,
                "roles_person2": characters[1].role,
                "dialogue_structure": ", ".join(self.config_conversation.get('dialogue_structure', [])),
                "podcast_name": self.config_conversation.get('podcast_name'),
                "podcast_tagline": self.config_conversation.get('podcast_tagline'),
                "output_language": self.config_conversation.get('output_language'),
                "engagement_techniques": ", ".join(self.config_conversation.get('engagement_techniques', []))
            }

            self.response = self.chain.invoke(prompt_params)

            logger.info(f"Content generated successfully")

            if output_filepath:
                with open(output_filepath, 'w') as file:
                    file.write(self.response)
                logger.info(f"Response content saved to {output_filepath}")

            return self.response
        except Exception as e:
            logger.error(f"Error generating content: {str(e)}")
            raise
- """ - self.is_local = is_local - self.temperature = temperature - self.max_output_tokens = max_output_tokens - self.model_name = model_name - self.is_multimodal = not is_local # Does not assume local LLM is multimodal - - if is_local: - self.llm = Llamafile() - else: - self.llm = ChatGoogleGenerativeAI( - model=model_name, - temperature=temperature, - max_output_tokens=max_output_tokens, - ) - - - class DefaultPodcastifyTranscriptEngine(LLMBackend): def __init__(self, api_key: str, conversation_config: Optional[Dict[str, Any]] = None, is_local: bool = False): """ From c5ab289810aafd4a528b52368e06f26837d1e276 Mon Sep 17 00:00:00 2001 From: bruno Date: Fri, 18 Oct 2024 17:44:03 +0200 Subject: [PATCH 48/49] fix empty segments --- podcastfy/core/podcast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/podcastfy/core/podcast.py b/podcastfy/core/podcast.py index 3a93f951..2b112679 100644 --- a/podcastfy/core/podcast.py +++ b/podcastfy/core/podcast.py @@ -162,7 +162,7 @@ def build_transcript(self) -> None: for segment in generated_segments: if isinstance(segment, tuple) and len(segment) == 2: speaker, text = segment - if speaker.name in self.characters: + if speaker.name in self.characters and text.strip(): tts_config = cast(Dict[str, Any], self.characters[speaker.name].tts_configs.get(self.characters[speaker.name].preferred_tts, {})) segments.append(TranscriptSegment(text, self.characters[speaker.name], tts_config)) else: From 0b7882a135d65d01c8e96bc11ba717a8eaf1286e Mon Sep 17 00:00:00 2001 From: bruno Date: Sat, 19 Oct 2024 16:56:39 +0200 Subject: [PATCH 49/49] a fix and one improvement --- podcastfy/aiengines/tts/tts_backends.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/podcastfy/aiengines/tts/tts_backends.py b/podcastfy/aiengines/tts/tts_backends.py index 58be2dc6..83e59b30 100644 --- a/podcastfy/aiengines/tts/tts_backends.py +++ b/podcastfy/aiengines/tts/tts_backends.py @@ -53,7 +53,7 @@ async def 
# register — declare the concrete backends as virtual subclasses of the TTS ABCs
# so isinstance()/issubclass() checks against SyncTTSBackend / AsyncTTSBackend pass.
# NOTE(review): the registrations are asymmetric — ElevenLabsTTS is both sync and
# async, OpenAITTS only sync, EdgeTTS only async; confirm this is intentional.
SyncTTSBackend.register(ElevenLabsTTS)
AsyncTTSBackend.register(ElevenLabsTTS)
SyncTTSBackend.register(OpenAITTS)
AsyncTTSBackend.register(EdgeTTS)