Skip to content

Commit 18e046c

Browse files
committed
Include TTS (ElevenLabs) usage in the LLM cost tracking
1 parent b14b4a2 commit 18e046c

File tree

7 files changed

+99
-15
lines changed

7 files changed

+99
-15
lines changed

config.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,9 +247,14 @@ def validate(self) -> None:
247247
errors.append(
248248
"GEMINI_API_KEY is required when TRANSCRIPTION_PROVIDER=gemini"
249249
)
250+
elif self.transcription_provider == "mistral":
251+
if not self.mistral_api_key:
252+
errors.append(
253+
"MISTRAL_API_KEY is required when TRANSCRIPTION_PROVIDER=mistral"
254+
)
250255
else:
251256
errors.append(
252-
f"Invalid TRANSCRIPTION_PROVIDER: {self.transcription_provider}. Must be 'openai' or 'gemini'"
257+
f"Invalid TRANSCRIPTION_PROVIDER: {self.transcription_provider}. Must be 'openai', 'gemini', or 'mistral'"
253258
)
254259

255260
# Check summarization provider configuration

routes/transcription.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
Transcription routes.
33
"""
44

5-
import os
65
import logging
76
from fastapi import APIRouter, HTTPException
87
from fastapi.responses import JSONResponse
@@ -12,6 +11,7 @@
1211
TranscriptionJob,
1312
JobStatus,
1413
)
14+
from services.path_utils import expand_path
1515

1616
logger = logging.getLogger(__name__)
1717
router = APIRouter()
@@ -64,7 +64,7 @@ def start_transcription(video_id: str):
6464

6565
audio_path = config.get_audio_path(video_id)
6666

67-
if not os.path.exists(audio_path):
67+
if not expand_path(audio_path).exists():
6868
raise HTTPException(
6969
status_code=404,
7070
detail=f"Audio file not found for video {video_id}. Please stream the video first.",

services/database.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -824,7 +824,8 @@ def get_llm_usage_summary(
824824
SUM(prompt_tokens) as total_prompt_tokens,
825825
SUM(response_tokens) as total_response_tokens,
826826
SUM(reasoning_tokens) as total_reasoning_tokens,
827-
SUM(total_tokens) as total_tokens
827+
SUM(total_tokens) as total_tokens,
828+
SUM(audio_duration_seconds) as total_audio_duration_seconds
828829
FROM llm_usage_stats
829830
WHERE 1=1
830831
"""
@@ -864,6 +865,8 @@ def get_llm_usage_summary(
864865
"total_response_tokens": row["total_response_tokens"] or 0,
865866
"total_reasoning_tokens": row["total_reasoning_tokens"] or 0,
866867
"total_tokens": row["total_tokens"] or 0,
868+
"total_audio_duration_seconds": row["total_audio_duration_seconds"]
869+
or 0,
867870
}
868871
results["by_provider_model_feature"].append(stat)
869872

services/tts.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,16 @@
55
"""
66

77
import re
8+
import logging
89
from typing import Optional, Literal
910
from elevenlabs.client import ElevenLabs
1011
from bs4 import BeautifulSoup
1112

1213
from services.path_utils import expand_path
1314
from services.llm_clients import get_tracked_openai_client
15+
from services.database import log_llm_usage
16+
17+
logger = logging.getLogger(__name__)
1418

1519
TTSProvider = Literal["openai", "elevenlabs"]
1620

@@ -160,16 +164,20 @@ def _generate_audio_elevenlabs(
160164
api_key: str,
161165
model_id: str = "eleven_flash_v2_5",
162166
output_format: str = "mp3_44100_128",
167+
feature: str = "tts",
168+
video_id: Optional[str] = None,
163169
) -> bytes:
164170
"""
165-
Generate audio using ElevenLabs API.
171+
Generate audio using ElevenLabs API with usage tracking.
166172
167173
Args:
168174
text: Text to convert to speech
169175
voice_id: ElevenLabs voice ID
170176
api_key: ElevenLabs API key
171177
model_id: Model to use for generation
172178
output_format: Audio format
179+
feature: Feature name for tracking (default: "tts")
180+
video_id: Associated video ID for tracking (optional)
173181
174182
Returns:
175183
MP3 audio data as bytes
@@ -205,7 +213,32 @@ def _generate_audio_elevenlabs(
205213
output_format=output_format,
206214
)
207215

208-
return b"".join(audio_generator)
216+
audio_data = b"".join(audio_generator)
217+
218+
# Track usage - ElevenLabs TTS priced per character
219+
try:
220+
metadata = {
221+
"character_count": len(text),
222+
"voice_id": voice_id,
223+
"output_format": output_format,
224+
}
225+
226+
log_llm_usage(
227+
provider="elevenlabs",
228+
model=model_id,
229+
feature=feature,
230+
prompt_tokens=len(text), # Store character count in prompt_tokens
231+
response_tokens=0, # TTS doesn't have response tokens
232+
video_id=video_id,
233+
metadata=metadata,
234+
)
235+
logger.info(
236+
f"ElevenLabs TTS {model_id} call tracked for {feature} ({len(text)} chars)"
237+
)
238+
except Exception as e:
239+
logger.warning(f"Failed to track ElevenLabs TTS usage: {e}")
240+
241+
return audio_data
209242

210243
except Exception as e:
211244
error_msg = str(e)
@@ -271,7 +304,9 @@ def generate_audio(
271304
if not voice:
272305
raise ValueError("ElevenLabs requires a voice_id")
273306
model = model or "eleven_flash_v2_5"
274-
return _generate_audio_elevenlabs(text, voice, api_key, model)
307+
return _generate_audio_elevenlabs(
308+
text, voice, api_key, model, feature=feature, video_id=video_id
309+
)
275310

276311
else:
277312
raise ValueError(f"Unsupported TTS provider: {provider}")

templates/stats.html

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,11 @@ <h2><i class="fas fa-table"></i> Detailed Breakdown</h2>
552552
'gpt-4o-mini': { input: 0.15, output: 0.60 },
553553
'gpt-4o': { input: 2.50, output: 10.00 },
554554
'gpt-5.2': { input: 1.75, output: 14.00 },
555+
// ElevenLabs TTS models (priced per character, stored in prompt_tokens)
556+
'eleven_flash_v2_5': { input: 100.00, output: 0 }, // $0.10 per 1K chars
557+
'eleven_turbo_v2_5': { input: 300.00, output: 0 }, // $0.30 per 1K chars
558+
'eleven_multilingual_v2': { input: 300.00, output: 0 }, // $0.30 per 1K chars
559+
'eleven_monolingual_v1': { input: 300.00, output: 0 }, // $0.30 per 1K chars
555560
// Gemini models (text/summarization)
556561
'gemini-2.5-flash': { input: 0.15, output: 0.60 },
557562
'gemini-2.5-flash-preview-tts': { input: 0.15, output: 0.60 },
@@ -567,16 +572,31 @@ <h2><i class="fas fa-table"></i> Detailed Breakdown</h2>
567572
output: 0.40
568573
};
569574

575+
// Audio transcription pricing (per second)
576+
const AUDIO_PRICING_PER_SECOND = {
577+
'whisper-1': 0.0001, // $0.006 per minute / 60
578+
'voxtral-mini-latest': 0.00005, // $0.003 per minute / 60
579+
};
580+
570581
function calculateCost(summary) {
571582
let totalCost = 0;
572583

573584
summary.by_provider_model_feature.forEach(item => {
574585
const model = item.model;
575586
const feature = item.feature;
576587
const provider = item.provider;
577-
let pricing = MODEL_PRICING[model];
588+
const audioDuration = item.total_audio_duration_seconds || 0;
589+
590+
// Check if this model uses per-second audio pricing
591+
if (AUDIO_PRICING_PER_SECOND[model] && audioDuration > 0) {
592+
const cost = audioDuration * AUDIO_PRICING_PER_SECOND[model];
593+
totalCost += cost;
594+
console.log(`${provider}/${model}/${feature}: ${(audioDuration / 60).toFixed(2)} minutes = $${cost.toFixed(4)}`);
595+
return;
596+
}
578597

579598
// Special handling for Gemini audio transcription
599+
let pricing = MODEL_PRICING[model];
580600
if (provider === 'gemini' && feature === 'transcription') {
581601
pricing = GEMINI_AUDIO_PRICING;
582602
console.log(`Using Gemini audio pricing for ${model} transcription`);
@@ -803,9 +823,18 @@ <h2><i class="fas fa-table"></i> Detailed Breakdown</h2>
803823
const feature = item.feature;
804824
const model = item.model;
805825
const provider = item.provider;
806-
let pricing = MODEL_PRICING[model];
826+
const audioDuration = item.total_audio_duration_seconds || 0;
827+
828+
// Check if this model uses per-second audio pricing
829+
if (AUDIO_PRICING_PER_SECOND[model] && audioDuration > 0) {
830+
const cost = audioDuration * AUDIO_PRICING_PER_SECOND[model];
831+
featureCosts[feature] = (featureCosts[feature] || 0) + cost;
832+
featureCalls[feature] = (featureCalls[feature] || 0) + item.call_count;
833+
return;
834+
}
807835

808836
// Special handling for Gemini audio transcription
837+
let pricing = MODEL_PRICING[model];
809838
if (provider === 'gemini' && feature === 'transcription') {
810839
pricing = GEMINI_AUDIO_PRICING;
811840
}

tests/routes/test_transcription.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -155,15 +155,19 @@ def test_start_transcription_audio_not_found(
155155
assert "Audio file not found" in response.json()["detail"]
156156

157157
@patch("routes.transcription.get_transcription_queue")
158-
@patch("routes.transcription.os.path.exists")
158+
@patch("routes.transcription.expand_path")
159159
@patch("routes.transcription.config")
160160
def test_start_transcription_success(
161-
self, mock_config, mock_exists, mock_get_queue, client
161+
self, mock_config, mock_expand_path, mock_get_queue, client
162162
):
163163
"""Test successful transcription start."""
164164
mock_config.transcription_enabled = True
165165
mock_config.get_audio_path.return_value = "/tmp/test123.mp3"
166-
mock_exists.return_value = True
166+
167+
# Mock expand_path to return a Mock Path object with exists() returning True
168+
mock_path = Mock()
169+
mock_path.exists.return_value = True
170+
mock_expand_path.return_value = mock_path
167171

168172
mock_queue = Mock()
169173
mock_get_queue.return_value = mock_queue
@@ -182,15 +186,20 @@ def test_start_transcription_success(
182186
assert job.audio_path == "/tmp/test123.mp3"
183187

184188
@patch("routes.transcription.get_transcription_queue")
185-
@patch("routes.transcription.os.path.exists")
189+
@patch("routes.transcription.expand_path")
186190
@patch("routes.transcription.config")
187191
def test_start_transcription_error(
188-
self, mock_config, mock_exists, mock_get_queue, client
192+
self, mock_config, mock_expand_path, mock_get_queue, client
189193
):
190194
"""Test start transcription with error."""
191195
mock_config.transcription_enabled = True
192196
mock_config.get_audio_path.return_value = "/tmp/test123.mp3"
193-
mock_exists.return_value = True
197+
198+
# Mock expand_path to return a Mock Path object with exists() returning True
199+
mock_path = Mock()
200+
mock_path.exists.return_value = True
201+
mock_expand_path.return_value = mock_path
202+
194203
mock_get_queue.side_effect = Exception("Queue error")
195204

196205
response = client.post("/transcription/start/test123")

update.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ echo "----------------------------------------"
6666
uv run python migrate_database.py
6767
uv run python migrate_add_metadata.py
6868
uv run python migrate_add_queue_columns.py
69+
uv run python migrate_add_llm_stats.py
70+
uv run python migrate_add_audio_duration.py
71+
uv run python migrate_add_weekly_summary.py
6972

7073
# Then initialize/update schema (creates tables if they don't exist)
7174
uv run python -c "from services.database import init_database; init_database(); print('Database schema updated successfully')"

0 commit comments

Comments (0)