MrDesjardins
diff --git a/‎.env.example‎
Lines changed: 23 additions & 1 deletion b/‎.env.example‎
Lines changed: 23 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 75 additions & 59 deletions b/‎README.md‎
Lines changed: 75 additions & 59 deletions
diff --git a/‎config.py‎
Lines changed: 27 additions & 3 deletions b/‎config.py‎
Lines changed: 27 additions & 3 deletions
diff --git a/‎main.py‎
Lines changed: 20 additions & 0 deletions b/‎main.py‎
Lines changed: 20 additions & 0 deletions
@@ -122,14 +122,36 @@ WEEKLY_SUMMARY_PROVIDER=gemini
 # WEEKLY_SUMMARY_MODEL=gemini-2.5-flash
 
 # Text-to-Speech (Optional)
-# Enable TTS generation for weekly summaries using ElevenLabs
+# Enable TTS generation for weekly summaries
 # Requires WEEKLY_SUMMARY_ENABLED=true
 TTS_ENABLED=false
+
+# TTS Provider
+# Options: "openai" or "elevenlabs"
+# OpenAI: Standard $15/1M chars (~$0.15 for 10K), HD $30/1M chars
+#         6 voices (alloy, echo, fable, onyx, nova, shimmer)
+#         Good quality, very affordable for long content
+# ElevenLabs: Credit-based pricing, higher quality voices
+#             More expensive for long-form content
+# Default: openai (recommended for cost)
+TTS_PROVIDER=openai
+
+# OpenAI TTS Settings (when TTS_PROVIDER=openai)
+# Voice options: alloy, echo, fable, onyx, nova, shimmer
+# Model options: tts-1 (faster, cheaper), tts-1-hd (higher quality)
+OPENAI_TTS_VOICE=alloy
+OPENAI_TTS_MODEL=tts-1
+
+# ElevenLabs Settings (when TTS_PROVIDER=elevenlabs)
 # Get your API key from: https://elevenlabs.io/
 ELEVENLABS_API_KEY=
 # Voice ID to use for TTS (default: Adam - free deep American male voice)
 # Free voices: Adam=pNInz6obpgDQGcFmaJgB, Rachel=21m00Tcm4TlvDq8ikWAM
+# Run: uv run python list_elevenlabs_voices.py to see all available voices
 ELEVENLABS_VOICE_ID=pNInz6obpgDQGcFmaJgB
+# Model options (max characters per request): eleven_flash_v2_5 (40K), eleven_turbo_v2_5 (40K), eleven_multilingual_v2 (10K)
+ELEVENLABS_MODEL_ID=eleven_flash_v2_5
+
 # Directory to store weekly summary audio files
 WEEKLY_SUMMARY_AUDIO_DIR=/var/audio-summaries
 
 
@@ -28,7 +28,7 @@ A powerful FastAPI application that streams audio from YouTube videos as MP3 ove
 
 #### Intelligent Summarization
 - **Video Summaries**: AI-generated summaries of each video's content
-- **Multi-Provider**: OpenAI GPT or Google Gemini (Gemini recommended for cost-effectiveness)
+- **Multi-Provider**: OpenAI GPT or Google Gemini (Gemini recommended for free tier)
 - **Knowledge Management**: Automatic posting to Trilium Notes with deduplication
 - **Rich Metadata**: Includes video title, channel, thumbnail, and YouTube link
 
@@ -37,7 +37,7 @@ A powerful FastAPI application that streams audio from YouTube videos as MP3 ove
 - **Comprehensive Analysis**: Synthesizes all videos watched during the week
 - **Key Learnings**: Extracts 15 most important insights across all content
 - **Theme Detection**: Identifies common themes and patterns in your viewing
-- **Text-to-Speech**: Optional ElevenLabs TTS generation for listening to summaries
+- **Text-to-Speech**: Optional TTS generation (OpenAI or ElevenLabs) for listening to summaries
 
 #### Smart Video Suggestions
 - **AI Content Discovery**: Analyzes your viewing history to suggest similar videos
@@ -125,9 +125,9 @@ TRANSCRIPTION_ENABLED=true
 OPENAI_API_KEY=sk-...  # Get from https://platform.openai.com/api-keys
 GEMINI_API_KEY=...     # Get from https://makersuite.google.com/app/apikey
 
-# Provider selection (recommended: Voxtral + Gemini for best cost/quality)
-TRANSCRIPTION_PROVIDER=mistral  # "openai", "mistral", or "gemini"
-SUMMARY_PROVIDER=gemini         # "gemini" (cost-effective) or "openai"
+# Provider selection (recommended: Whisper + Gemini for best cost/quality)
+TRANSCRIPTION_PROVIDER=openai  # "openai" (Whisper), "mistral" (Voxtral), or "gemini"
+SUMMARY_PROVIDER=gemini        # "gemini" (free tier) or "openai"
 
 # Trilium Notes integration (for saving summaries)
 TRILIUM_URL=http://localhost:8080
@@ -148,13 +148,13 @@ TTS_ENABLED=false
   - Or use your specific local IP (e.g., `10.0.0.181`)
 
 - **TRANSCRIPTION_PROVIDER**:
-  - `openai` = Whisper API ($0.006/min, very accurate, fast, 25MB limit)
-  - `mistral` = Voxtral Mini ($0.003/min, most cost-effective, 30 min limit)
-  - `gemini` = Gemini 2.5 Flash (~$0.0005-0.001/min, handles unlimited file sizes)
+  - `openai` = Whisper API ($0.006/minute, very accurate, fast, 25MB limit)
+  - `mistral` = Voxtral Mini ($0.003/minute, cost-effective, good quality, 15 min limit)
+  - `gemini` = Gemini 1.5 Flash (free tier available, good quality, no audio length limit)
 
 - **SUMMARY_PROVIDER**:
-  - `gemini` = Gemini 2.5 Flash (recommended, very cost-effective, fast)
-  - `openai` = GPT-4o-mini (high quality)
+  - `gemini` = Gemini 2.5 Flash (recommended, free tier, fast)
+  - `openai` = GPT-4o-mini (high quality, paid)
 
 ### Step 4: Test Trilium Connection (Optional)
 
@@ -224,24 +224,20 @@ Required for Whisper transcription or GPT summarization.
 
 ### Google Gemini API Key
 
-Required for Gemini transcription or summarization. Very cost-effective pricing.
+Required for Gemini transcription or summarization. Has a generous free tier.
 
 1. Visit https://makersuite.google.com/app/apikey
 2. Sign in with your Google account
 3. Click "Create API Key"
 4. Copy the key
 5. Add to `.env` file: `GEMINI_API_KEY=...`
 
-**Pricing**:
-- Audio transcription: ~$0.30-0.50 per 1M input tokens + $0.40 per 1M output
-- Text generation: $0.15 per 1M input + $0.60 per 1M output
-- Rate limits: 15 req/min, 1,500 req/day, 1M tokens/day
+**Free Tier**:
+- 15 requests per minute
+- 1 million tokens per day
+- 1,500 requests per day
 
-**Benefits**:
-- Very cost-effective for audio transcription (~$0.0005-0.001/minute)
-- Handles large audio files automatically (uses Files API for >20MB)
-- No practical file size or duration limits
-- Good for long recordings where Whisper/Voxtral hit their limits
+For typical use, summarization and weekly summaries are essentially free.
 
 ### Mistral AI API Key
 
@@ -255,7 +251,7 @@ Required for Mistral Voxtral transcription. Cost-effective option at $0.003/minu
 
 **Cost**: Voxtral Mini is $0.003 per minute of audio. For typical use (~30 hours/month), expect ~$5-8/month (50% cheaper than Whisper).
 
-**Limitation**: Maximum 15 minutes per audio file. For longer videos, use Gemini (no limit) or split the audio.
+**Limitation**: Maximum 15 minutes per audio file. For longer videos, use Gemini (no limit) or split the audio.
 
 ### Trilium ETAPI Token
 
@@ -272,9 +268,27 @@ Required for saving transcripts and summaries to Trilium Notes.
 2. Right-click the note → "Copy Note ID"
 3. Add to `.env` file: `TRILIUM_PARENT_NOTE_ID=...`
 
-### ElevenLabs API Key (Optional)
+### Text-to-Speech API Keys (Optional)
 
-Required only if you want text-to-speech for weekly summaries.
+Required only if you want text-to-speech for weekly summaries. Choose one provider:
+
+#### OpenAI TTS (Recommended)
+**Most affordable for long-form content**
+
+- Pricing: $15 per 1M characters (~$0.15 for a 10K character summary)
+- Quality: 6 natural voices (alloy, echo, fable, onyx, nova, shimmer)
+- Models: `tts-1` (standard) or `tts-1-hd` (higher quality)
+- Uses the same `OPENAI_API_KEY` as Whisper transcription and GPT summarization (no extra key needed if it is already set)
+
+Set in `.env`:
+```bash
+TTS_PROVIDER=openai
+OPENAI_TTS_VOICE=alloy
+OPENAI_TTS_MODEL=tts-1
+```
+
+#### ElevenLabs (Alternative)
+**Higher quality voices, more expensive**
 
 1. Visit https://elevenlabs.io/
 2. Sign up or sign in
@@ -284,6 +298,12 @@ Required only if you want text-to-speech for weekly summaries.
 
 **Free Tier**: 10,000 characters per month (~7-10 summaries)
 
+Set in `.env`:
+```bash
+TTS_PROVIDER=elevenlabs
+ELEVENLABS_VOICE_ID=pNInz6obpgDQGcFmaJgB
+```
+
 ## Configuration Reference
 
 ### Environment Variables
@@ -347,9 +367,13 @@ All configuration is done via the `.env` file. See `.env.example` for a complete
 
 | Variable | Default | Description |
 |----------|---------|-------------|
-| `TTS_ENABLED` | `false` | Enable ElevenLabs TTS for summaries |
-| `ELEVENLABS_API_KEY` | - | ElevenLabs API key |
-| `ELEVENLABS_VOICE_ID` | `pNInz6obpgDQGcFmaJgB` | Voice ID (Adam by default) |
+| `TTS_ENABLED` | `false` | Enable TTS for summaries |
+| `TTS_PROVIDER` | `openai` | Provider: `openai` or `elevenlabs` |
+| `OPENAI_TTS_VOICE` | `alloy` | OpenAI voice (alloy, echo, fable, onyx, nova, shimmer) |
+| `OPENAI_TTS_MODEL` | `tts-1` | OpenAI model (`tts-1` or `tts-1-hd`) |
+| `ELEVENLABS_API_KEY` | - | ElevenLabs API key (if using ElevenLabs) |
+| `ELEVENLABS_VOICE_ID` | `pNInz6obpgDQGcFmaJgB` | ElevenLabs voice ID (Adam by default) |
+| `ELEVENLABS_MODEL_ID` | `eleven_flash_v2_5` | ElevenLabs model |
 | `WEEKLY_SUMMARY_AUDIO_DIR` | `/var/audio-summaries` | Where to store TTS audio files |
 
 ## API Endpoints
@@ -530,36 +554,29 @@ curl "http://localhost:8000/admin/weekly-summary/next-run"
 | gpt-4o | $2.50 | $10.00 | Higher quality |
 | whisper-1 | - | - | $0.006 per minute, 25MB limit |
 | **Mistral AI** ||||
-| voxtral-mini-latest | - | - | $0.003 per minute, 30 min limit |
+| voxtral-mini-latest | - | - | $0.003 per minute, 15 min limit |
 | **Google Gemini** ||||
-| gemini-2.5-flash | $0.15 | $0.60 | Text: Fast, comparable to gpt-4o-mini |
-| gemini-2.5-flash (audio) | $0.30-0.50 | $0.40 | Audio transcription (token-based) |
+| gemini-2.5-flash | $0.15 | $0.60 | Fast, comparable to gpt-4o-mini (recommended) |
 | gemini-1.5-flash | $0.10 | $0.40 | Slightly older, still excellent |
 | gemini-1.5-pro | $1.25 | $5.00 | Higher quality |
 
-**Note**: Gemini audio pricing is per 1M tokens. Audio duration to token conversion varies, but typically ~1 minute ≈ 1,000-1,500 tokens.
-
 ### Estimated Costs Per Operation
 
-**Transcription Options:**
+**Using recommended configuration (Whisper + Gemini 2.5 Flash):**
 
-1. **Whisper (OpenAI)** - Most accurate
-   - $0.006 per minute
-   - 10 min = $0.06 | 1 hour = $0.36
+- **Video transcription** (Whisper): $0.006 per minute of audio
+  - 10 min video = $0.06
+  - 1 hour video = $0.36
 
-2. **Voxtral (Mistral)** - Cost-effective (50% cheaper)
-   - $0.003 per minute
-   - 10 min = $0.03 | 1 hour = $0.18
+**Alternative: Cost-optimized (Voxtral + Gemini 2.5 Flash):**
 
-3. **Gemini 2.5 Flash** - Token-based, good for long files
-   - ~$0.30-0.50 per 1M input tokens + $0.40 per 1M output
-   - Estimate: ~$0.0005-0.001 per minute (varies by audio complexity)
-   - 10 min ≈ $0.005-0.01 | 1 hour ≈ $0.03-0.06
-   - Best for: Very long recordings, handles unlimited file sizes
+- **Video transcription** (Voxtral Mini): $0.003 per minute of audio (50% cheaper)
+  - 10 min video = $0.03
+  - 1 hour video = $0.18
 
-**Summarization** (Gemini 2.5 Flash text):
-- Typical: 2,000 input + 500 output tokens
-- Cost: (2,000 × $0.15 + 500 × $0.60) / 1,000,000 = **$0.0006**
+- **Video summarization** (Gemini 2.5 Flash): ~$0.0003-0.001 per summary
+  - Typical: 2,000 input tokens + 500 output tokens
+  - Cost: (2,000 × $0.15 + 500 × $0.60) / 1,000,000 = **$0.0006**
 
 - **Weekly summary** (Gemini 2.5 Flash): ~$0.003-0.01 per summary
   - Typical: 10,000 input tokens + 2,000 output tokens
@@ -589,21 +606,20 @@ curl "http://localhost:8000/admin/weekly-summary/next-run"
 - Weekly summaries: 4 weeks × $0.0027 = **$0.01**
 - **Total: ~$36.10/month**
 
-### Gemini Pricing Advantages
+### Gemini Free Tier
 
-Gemini offers very competitive pricing, especially for text generation:
-- Text: $0.15 input + $0.60 output per 1M tokens
-- Audio: $0.30-0.50 input + $0.40 output per 1M tokens
-- Rate limits: 15 req/min, 1M tokens/day, 1,500 req/day
+Gemini has a generous free tier that covers most summarization needs:
+- 15 requests per minute
+- 1 million tokens per day
+- 1,500 requests per day
 
-**Cost-effective for:**
-- Video summarization (~$0.0006 per summary - nearly negligible)
-- Weekly summaries (~$0.003 per summary)
-- Smart suggestions (~$0.002 per request)
-- Long audio transcriptions (~$0.0005-0.001 per minute, cheaper than Whisper for >6 min files)
+**What's free:**
+- Video summarization (essentially unlimited for personal use)
+- Weekly summaries (4 per month)
+- Smart suggestions (as much as you need)
 
-**Costs more than alternatives:**
-- Short audio transcription: Voxtral is more cost-effective ($0.003/min fixed)
+**What costs money:**
+- Transcription with Whisper (no free option for high quality)
 
 ### Cost Tracking
 
@@ -671,7 +687,7 @@ sudo systemctl restart audio-stream
 sudo systemctl stop audio-stream
 
 # View logs
-journalctl -u audio-stream -n 100 -f
+journalctl -u audio-stream -n 1000 -f
 ```
 
 **Note:** The service automatically loads your `.env` file from the WorkingDirectory.
 
@@ -3,7 +3,7 @@
 import os
 import threading
 import logging
-from typing import Optional
+from typing import Optional, Literal, cast
 from dataclasses import dataclass
 from dotenv import load_dotenv
 
@@ -104,8 +104,12 @@ class Config:
 
     # TTS settings
     tts_enabled: bool
+    tts_provider: Literal["openai", "elevenlabs"]
+    openai_tts_voice: str  # OpenAI voice (alloy, echo, fable, onyx, nova, shimmer)
+    openai_tts_model: str  # OpenAI model (tts-1 or tts-1-hd)
     elevenlabs_api_key: Optional[str]
     elevenlabs_voice_id: str
+    elevenlabs_model_id: str
     weekly_summary_audio_dir: str
 
     # Client-side logging settings
@@ -194,10 +198,17 @@ def load_from_env(cls) -> "Config":
             == "true",
             # TTS settings
             tts_enabled=os.getenv("TTS_ENABLED", "false").lower() == "true",
+            tts_provider=cast(
+                Literal["openai", "elevenlabs"],
+                os.getenv("TTS_PROVIDER", "openai").lower(),
+            ),
+            openai_tts_voice=os.getenv("OPENAI_TTS_VOICE", "alloy"),
+            openai_tts_model=os.getenv("OPENAI_TTS_MODEL", "tts-1"),
             elevenlabs_api_key=os.getenv("ELEVENLABS_API_KEY"),
             elevenlabs_voice_id=os.getenv(
                 "ELEVENLABS_VOICE_ID", "pNInz6obpgDQGcFmaJgB"
             ),  # Adam - free voice
+            elevenlabs_model_id=os.getenv("ELEVENLABS_MODEL_ID", "eleven_flash_v2_5"),
             weekly_summary_audio_dir=os.getenv(
                 "WEEKLY_SUMMARY_AUDIO_DIR", "/var/audio-summaries"
             ),
@@ -328,8 +339,21 @@ def validate_tts(self) -> None:
         """Validate that required configuration for TTS is present."""
         errors = []
 
-        if not self.elevenlabs_api_key:
-            errors.append("ELEVENLABS_API_KEY is required when TTS_ENABLED=true")
+        # Validate TTS provider
+        if self.tts_provider not in ["openai", "elevenlabs"]:
+            errors.append(
+                f"TTS_PROVIDER must be 'openai' or 'elevenlabs', got '{self.tts_provider}'"
+            )
+
+        # Validate provider-specific configuration
+        if self.tts_provider == "openai":
+            if not self.openai_api_key:
+                errors.append("OPENAI_API_KEY is required when TTS_PROVIDER=openai")
+        elif self.tts_provider == "elevenlabs":
+            if not self.elevenlabs_api_key:
+                errors.append(
+                    "ELEVENLABS_API_KEY is required when TTS_PROVIDER=elevenlabs"
+                )
 
         if errors:
             error_msg = "TTS configuration validation failed:\n  - " + "\n  - ".join(
 
@@ -36,6 +36,26 @@
 )
 logger = logging.getLogger(__name__)
 
+
+# Custom filter to suppress polling endpoint logs
+class PollingEndpointFilter(logging.Filter):
+    """Filter out frequently polled endpoint access logs to reduce noise."""
+
+    def filter(self, record: logging.LogRecord) -> bool:
+        message = record.getMessage()
+        # Keep the record (True) unless its formatted message contains one of the substrings below
+        return not any(
+            pattern in message
+            for pattern in [
+                "GET /status HTTP",  # Stream status polling (bare /status path only, no query string)
+                "GET /transcription/status/",  # Transcription status polling (any suffix after the slash)
+            ]
+        )
+
+
+# Apply filter to uvicorn access logger
+logging.getLogger("uvicorn.access").addFilter(PollingEndpointFilter())
+
 # Configurable host and port
 host = os.environ.get("FASTAPI_HOST", "127.0.0.1")
 api_port = int(os.environ.get("FASTAPI_API_PORT", 8000))