Skip to content

Commit b14b4a2

Browse files
committed
Capture LLM cost of TTS
1 parent 9cfe256 commit b14b4a2

File tree

5 files changed

+129
-48
lines changed

5 files changed

+129
-48
lines changed

services/llm_clients.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,66 @@ def create_chat_completion(
167167
except Exception:
168168
raise
169169

170+
def text_to_speech(
    self,
    text: str,
    voice: str,
    feature: str,
    video_id: Optional[str] = None,
    metadata: Optional[Dict] = None,
    model: str = "tts-1",
) -> Any:
    """
    Generate speech from text with automatic usage tracking.

    Args:
        text: Text to convert to speech
        voice: Voice to use (alloy, echo, fable, onyx, nova, shimmer)
        feature: Feature name for tracking (e.g., "weekly_summary_tts")
        video_id: Associated video ID (optional)
        metadata: Additional metadata (optional); never mutated by this call
        model: TTS model to use (tts-1 or tts-1-hd)

    Returns:
        OpenAI response object with audio data

    Raises:
        Exception: Any error from the OpenAI API is re-raised to the caller.
    """
    try:
        # Make API call
        response = self.client.audio.speech.create(
            model=model,
            voice=voice,
            input=text,
            response_format="mp3",
        )

        # Track usage.
        # TTS is priced per character, not tokens, so the character count
        # is recorded in the prompt_tokens field.
        try:
            # Bug fix: copy the caller's metadata before adding tracking
            # keys. The original used `metadata or {}` and then mutated it,
            # which leaked character_count/voice/audio_format back into the
            # dict the caller passed in.
            tracking_metadata = dict(metadata) if metadata else {}
            tracking_metadata["character_count"] = len(text)
            tracking_metadata["voice"] = voice
            tracking_metadata["audio_format"] = "mp3"

            log_llm_usage(
                provider="openai",
                model=model,
                feature=feature,
                prompt_tokens=len(text),  # Store character count in prompt_tokens
                response_tokens=0,  # TTS doesn't have response tokens
                video_id=video_id,
                metadata=tracking_metadata,
            )
            logger.info(
                f"OpenAI TTS {model} call tracked for {feature} ({len(text)} chars)"
            )
        except Exception as e:
            # Tracking is best-effort: never fail the TTS call because
            # usage logging failed.
            logger.warning(f"Failed to track OpenAI TTS usage: {e}")

        return response

    except Exception:
        # Re-raise API errors unchanged; mirrors create_chat_completion's
        # error-handling style in this class.
        raise
229+
170230

171231
class TrackedGeminiClient:
172232
"""

services/tts.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77
import re
88
from typing import Optional, Literal
99
from elevenlabs.client import ElevenLabs
10-
from openai import OpenAI
1110
from bs4 import BeautifulSoup
1211

1312
from services.path_utils import expand_path
13+
from services.llm_clients import get_tracked_openai_client
1414

1515
TTSProvider = Literal["openai", "elevenlabs"]
1616

@@ -94,16 +94,21 @@ def extract_summary_text_for_tts(html_content: str) -> str:
9494

9595

9696
def _generate_audio_openai(
97-
text: str, voice: str, api_key: str, model: str = "tts-1"
97+
text: str,
98+
voice: str,
99+
model: str = "tts-1",
100+
feature: str = "tts",
101+
video_id: Optional[str] = None,
98102
) -> bytes:
99103
"""
100-
Generate audio using OpenAI TTS API.
104+
Generate audio using OpenAI TTS API with usage tracking.
101105
102106
Args:
103107
text: Text to convert to speech
104108
voice: OpenAI voice (alloy, echo, fable, onyx, nova, shimmer)
105-
api_key: OpenAI API key
106109
model: Model to use (tts-1 or tts-1-hd)
110+
feature: Feature name for tracking (default: "tts")
111+
video_id: Associated video ID for tracking (optional)
107112
108113
Returns:
109114
MP3 audio data as bytes
@@ -122,10 +127,15 @@ def _generate_audio_openai(
122127
text = text[: max_chars - 3] + "..."
123128

124129
try:
125-
client = OpenAI(api_key=api_key)
130+
# Use tracked client for automatic usage tracking
131+
client = get_tracked_openai_client()
126132

127-
response = client.audio.speech.create(
128-
model=model, voice=voice, input=text, response_format="mp3"
133+
response = client.text_to_speech(
134+
text=text,
135+
voice=voice,
136+
model=model,
137+
feature=feature,
138+
video_id=video_id,
129139
)
130140

131141
# Read the audio data from the response
@@ -214,42 +224,50 @@ def _generate_audio_elevenlabs(
214224

215225
def generate_audio(
216226
text: str,
217-
api_key: str,
227+
api_key: Optional[str] = None,
218228
provider: TTSProvider = "openai",
219229
voice: Optional[str] = None,
220230
model: Optional[str] = None,
231+
feature: str = "tts",
232+
video_id: Optional[str] = None,
221233
) -> bytes:
222234
"""
223235
Generate audio from text using the specified TTS provider.
224236
225237
Args:
226238
text: Text to convert to speech
227-
api_key: API key for the provider
239+
api_key: API key for the provider (required for ElevenLabs, optional for OpenAI)
228240
provider: TTS provider to use ("openai" or "elevenlabs")
229241
voice: Voice ID/name (provider-specific)
230242
model: Model ID (provider-specific)
243+
feature: Feature name for usage tracking (default: "tts")
244+
video_id: Associated video ID for tracking (optional)
231245
232246
Returns:
233247
MP3 audio data as bytes
234248
235249
Raises:
236250
TTSAPIError: If API request fails
237-
ValueError: If provider is invalid
251+
ValueError: If provider is invalid or required parameters are missing
238252
239253
Provider-specific defaults:
240254
OpenAI:
241255
- voice: "alloy" (options: alloy, echo, fable, onyx, nova, shimmer)
242256
- model: "tts-1" (options: tts-1, tts-1-hd)
257+
- api_key: Read from config (automatic tracking)
243258
ElevenLabs:
244259
- voice: Must be provided
245260
- model: "eleven_flash_v2_5"
261+
- api_key: Required
246262
"""
247263
if provider == "openai":
248264
voice = voice or "alloy"
249265
model = model or "tts-1"
250-
return _generate_audio_openai(text, voice, api_key, model)
266+
return _generate_audio_openai(text, voice, model, feature, video_id)
251267

252268
elif provider == "elevenlabs":
269+
if not api_key:
270+
raise ValueError("ElevenLabs requires api_key parameter")
253271
if not voice:
254272
raise ValueError("ElevenLabs requires a voice_id")
255273
model = model or "eleven_flash_v2_5"

services/weekly_summary.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -614,23 +614,24 @@ def _generate_and_attach_tts(
614614
if config.tts_provider == "openai":
615615
if not config.openai_api_key:
616616
raise ValueError("OpenAI API key not configured")
617-
api_key = config.openai_api_key
618-
voice = config.openai_tts_voice
619-
model = config.openai_tts_model
617+
audio_data = generate_audio(
618+
text=tts_text,
619+
provider="openai",
620+
voice=config.openai_tts_voice,
621+
model=config.openai_tts_model,
622+
feature="weekly_summary_tts",
623+
)
620624
else: # elevenlabs
621625
if not config.elevenlabs_api_key:
622626
raise ValueError("ElevenLabs API key not configured")
623-
api_key = config.elevenlabs_api_key
624-
voice = config.elevenlabs_voice_id
625-
model = config.elevenlabs_model_id
626-
627-
audio_data = generate_audio(
628-
text=tts_text,
629-
api_key=api_key,
630-
provider=config.tts_provider,
631-
voice=voice,
632-
model=model,
633-
)
627+
audio_data = generate_audio(
628+
text=tts_text,
629+
api_key=config.elevenlabs_api_key,
630+
provider="elevenlabs",
631+
voice=config.elevenlabs_voice_id,
632+
model=config.elevenlabs_model_id,
633+
feature="weekly_summary_tts",
634+
)
634635
duration = save_audio_file(audio_data, audio_path)
635636
logger.info(f"Saved audio file: {audio_path} ({duration}s)")
636637

templates/stats.html

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,7 @@ <h1><i class="fas fa-chart-line"></i> LLM Usage Statistics</h1>
430430
<option value="transcription">Transcription</option>
431431
<option value="summarization">Summarization</option>
432432
<option value="weekly_summary">Weekly Summary</option>
433+
<option value="weekly_summary_tts">Weekly Summary TTS</option>
433434
<option value="book_suggestions">Book Suggestions</option>
434435
</select>
435436
</div>
@@ -540,11 +541,13 @@ <h2><i class="fas fa-table"></i> Detailed Breakdown</h2>
540541
let providerChart = null;
541542
let featureChart = null;
542543

543-
// Model pricing (per 1M tokens)
544+
// Model pricing (per 1M tokens/characters)
544545
const MODEL_PRICING = {
545546
// OpenAI models
546547
'whisper-1': { input: 0, output: 0 }, // Whisper is priced per minute, not tokens
547548
'voxtral-mini-latest': { input: 0, output: 0 }, // Voxtral is priced per minute, not tokens
549+
'tts-1': { input: 15.00, output: 0 }, // TTS priced per character (stored in prompt_tokens)
550+
'tts-1-hd': { input: 30.00, output: 0 }, // TTS HD priced per character (stored in prompt_tokens)
548551
'gpt-5-nano': { input: 0.05, output: 0.40 },
549552
'gpt-4o-mini': { input: 0.15, output: 0.60 },
550553
'gpt-4o': { input: 2.50, output: 10.00 },
@@ -832,7 +835,8 @@ <h2><i class="fas fa-table"></i> Detailed Breakdown</h2>
832835
'summarization': { icon: 'fa-file-alt', name: 'Summarization', iconClass: 'summarization' },
833836
'weekly_summary': { icon: 'fa-calendar-week', name: 'Weekly Summary', iconClass: 'weekly_summary' },
834837
'book_suggestions': { icon: 'fa-book', name: 'Book Suggestions', iconClass: 'summarization' },
835-
'tts': { icon: 'fa-volume-up', name: 'Text-to-Speech', iconClass: 'weekly_summary' }
838+
'tts': { icon: 'fa-volume-up', name: 'Text-to-Speech', iconClass: 'weekly_summary' },
839+
'weekly_summary_tts': { icon: 'fa-volume-up', name: 'Weekly Summary TTS', iconClass: 'weekly_summary' }
836840
};
837841

838842
sortedFeatures.forEach(([feature, cost]) => {

tests/services/test_tts.py

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -79,35 +79,35 @@ def test_case_insensitive_books_match(self):
7979
class TestGenerateAudio:
8080
"""Tests for generate_audio function."""
8181

82-
@patch("services.tts.OpenAI")
83-
def test_successful_generation_openai(self, mock_openai_class):
82+
@patch("services.tts.get_tracked_openai_client")
def test_successful_generation_openai(self, mock_get_client):
    """Should generate audio successfully with OpenAI provider."""
    # Arrange: tracked client whose text_to_speech yields fake audio bytes.
    fake_response = Mock()
    fake_response.read.return_value = b"fake audio data"
    fake_client = Mock()
    fake_client.text_to_speech.return_value = fake_response
    mock_get_client.return_value = fake_client

    # Act
    result = generate_audio(
        text="Hello world",
        provider="openai",
        voice="alloy",
        model="tts-1",
    )

    # Assert: audio bytes returned and exactly one tracked call was made.
    assert result == b"fake audio data"
    mock_get_client.assert_called_once()
    fake_client.text_to_speech.assert_called_once()

    # Assert: the tracked client received the expected keyword arguments.
    kwargs = fake_client.text_to_speech.call_args.kwargs
    expected = {
        "text": "Hello world",
        "voice": "alloy",
        "model": "tts-1",
        "feature": "tts",
    }
    for key, value in expected.items():
        assert kwargs[key] == value
111111

112112
@patch("services.tts.ElevenLabs")
113113
def test_successful_generation_elevenlabs(self, mock_elevenlabs_class):
@@ -141,29 +141,28 @@ def test_successful_generation_elevenlabs(self, mock_elevenlabs_class):
141141
assert call_kwargs["voice_id"] == "test-voice-id"
142142
assert call_kwargs["model_id"] == "eleven_flash_v2_5"
143143

144-
@patch("services.tts.OpenAI")
145-
def test_truncates_long_text_openai(self, mock_openai_class):
144+
@patch("services.tts.get_tracked_openai_client")
def test_truncates_long_text_openai(self, mock_get_client):
    """Should truncate text longer than OpenAI limit (4096 chars)."""
    # Arrange: tracked client returning a stub audio response.
    stub_response = Mock()
    stub_response.read.return_value = b"audio"
    stub_client = Mock()
    stub_client.text_to_speech.return_value = stub_response
    mock_get_client.return_value = stub_client

    # Act: request audio for text well past the 4096-char OpenAI limit.
    generate_audio(
        text="x" * 5000,
        provider="openai",
    )

    # Assert: the text actually sent was truncated to exactly 4096 chars,
    # with a trailing ellipsis marking the cut.
    sent_text = stub_client.text_to_speech.call_args.kwargs["text"]
    assert len(sent_text) == 4096
    assert sent_text.endswith("...")
169168

@@ -195,19 +194,18 @@ def test_truncates_long_text_elevenlabs(self, mock_elevenlabs_class):
195194
assert len(posted_text) == 40000
196195
assert posted_text.endswith("...")
197196

198-
@patch("services.tts.OpenAI")
199-
def test_raises_on_auth_error_openai(self, mock_openai_class):
197+
@patch("services.tts.get_tracked_openai_client")
def test_raises_on_auth_error_openai(self, mock_get_client):
    """Should raise TTSAPIError on OpenAI authentication error."""
    # Arrange: tracked client whose TTS call fails with an auth error.
    failing_client = Mock()
    failing_client.text_to_speech.side_effect = Exception("401 Unauthorized")
    mock_get_client.return_value = failing_client

    # Act / Assert: the failure surfaces as a TTSAPIError with a clear message.
    with pytest.raises(TTSAPIError, match="Invalid OpenAI API key"):
        generate_audio(text="Test", provider="openai")
0 commit comments

Comments
 (0)