feat(voice-server): add TTS provider abstraction with fallback order

zitongcharliedeng · claude · zitongcharliedeng · commit 3e1f8956cf5e · 2025-12-09T17:36:27.000Z
Adds a pluggable TTS provider system that allows multiple backends with a configurable fallback order. Includes cross-platform audio playback support. Providers: - ElevenLabs (cloud) - existing functionality extracted to a provider class - Piper (local) - free offline TTS using neural voice models - macOS say (local) - native macOS `say` command. Cross-platform audio: - macOS: afplay - Linux: aplay - WSL: powershell Media.SoundPlayer. Configuration is via config.json, which lists providers in fallback order: {"providers": ["piper", "elevenlabs", "macos-say"]}. The first available provider is used automatically. Falls back to the direct ElevenLabs API if no provider is configured (backwards compatible). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/.claude/voice-server/config.json b/.claude/voice-server/config.json
@@ -0,0 +1,3 @@
+{
+  "providers": ["piper", "elevenlabs", "macos-say"]
+}
diff --git a/.claude/voice-server/providers/ElevenLabs.ts b/.claude/voice-server/providers/ElevenLabs.ts
@@ -0,0 +1,65 @@
+import { spawn } from 'child_process';
+import type { TTSProvider } from './index';
+
+// Built-in defaults used when the matching environment variable is unset or empty.
+const FALLBACK_VOICE_ID = 's3TPKV1kjDlVtZbl4Ksh';
+const FALLBACK_MODEL = 'eleven_multilingual_v2';
+
+/**
+ * Reads the ElevenLabs settings from the environment at call time.
+ *
+ * The values are deliberately NOT cached in module-level constants: server.ts
+ * imports this module before it runs its "load .env from user home directory"
+ * step, so anything captured at module load would miss keys that only live in
+ * ~/.env. (NOTE(review): assumes that loader writes into process.env - confirm.)
+ */
+function readSettings(): { apiKey: string | undefined; voiceId: string; model: string } {
+  return {
+    apiKey: process.env.ELEVENLABS_API_KEY,
+    voiceId: process.env.ELEVENLABS_VOICE_ID || FALLBACK_VOICE_ID,
+    model: process.env.ELEVENLABS_MODEL || FALLBACK_MODEL,
+  };
+}
+
+/**
+ * TTS provider backed by the ElevenLabs text-to-speech HTTP API.
+ *
+ * Audio is downloaded as MP3, written to a temp file under /tmp and played with
+ * /usr/bin/afplay, so playback only works where that binary exists.
+ */
+export class ElevenLabs implements TTSProvider {
+  readonly name = 'elevenlabs';
+
+  /** @returns true when ELEVENLABS_API_KEY is currently set to a non-empty value. */
+  isAvailable(): boolean {
+    return !!readSettings().apiKey;
+  }
+
+  /**
+   * Synthesizes `text` and plays it, resolving once playback has finished.
+   *
+   * @param text - The text to speak.
+   * @param voiceId - ElevenLabs voice ID; defaults to ELEVENLABS_VOICE_ID or the built-in voice.
+   * @throws Error when no API key is configured, the API responds with a non-2xx
+   *   status, or the audio player fails to start or exits non-zero.
+   */
+  async speak(text: string, voiceId?: string): Promise<void> {
+    const { apiKey, voiceId: defaultVoice, model } = readSettings();
+    if (!apiKey) {
+      throw new Error('ElevenLabs API key not configured');
+    }
+
+    const voice = voiceId || defaultVoice;
+    // The voice ID can originate from a request, so keep it from altering the URL path.
+    const url = `https://api.elevenlabs.io/v1/text-to-speech/${encodeURIComponent(voice)}`;
+
+    const response = await fetch(url, {
+      method: 'POST',
+      headers: {
+        'Accept': 'audio/mpeg',
+        'Content-Type': 'application/json',
+        'xi-api-key': apiKey,
+      },
+      body: JSON.stringify({
+        text: text,
+        model_id: model,
+        voice_settings: {
+          stability: 0.5,
+          similarity_boost: 0.5,
+        },
+      }),
+    });
+
+    if (!response.ok) {
+      const errorText = await response.text();
+      if (errorText.includes('model') || response.status === 422) {
+        // The "invalid model" diagnosis is a heuristic, so keep the raw response
+        // in the message instead of hiding the real cause behind it.
+        throw new Error(`ElevenLabs API error: Invalid model "${model}". Update ELEVENLABS_MODEL in ~/.env. See https://elevenlabs.io/docs/models (${response.status} - ${errorText})`);
+      }
+      throw new Error(`ElevenLabs API error: ${response.status} - ${errorText}`);
+    }
+
+    const audioBuffer = await response.arrayBuffer();
+    // Random suffix avoids two requests in the same millisecond sharing one file.
+    const tempFile = `/tmp/voice-${Date.now()}-${Math.random().toString(36).slice(2)}.mp3`;
+    await Bun.write(tempFile, audioBuffer);
+
+    await new Promise<void>((resolve, reject) => {
+      let settled = false;
+      // Single exit path: settles the promise once and always removes the temp
+      // file, including when the player could not be spawned at all.
+      const finish = (error?: Error): void => {
+        if (settled) return;
+        settled = true;
+        const cleanup = spawn('/bin/rm', ['-f', tempFile]);
+        // Cleanup is best-effort; an unhandled 'error' event would crash the process.
+        cleanup.on('error', () => {});
+        error ? reject(error) : resolve();
+      };
+
+      const proc = spawn('/usr/bin/afplay', [tempFile]);
+      proc.on('error', (error) => {
+        console.error('Error playing audio:', error);
+        finish(error);
+      });
+      proc.on('exit', (code) => {
+        finish(code === 0 ? undefined : new Error(`afplay exited with code ${code}`));
+      });
+    });
+  }
+}
diff --git a/.claude/voice-server/providers/MacOSSay.ts b/.claude/voice-server/providers/MacOSSay.ts
@@ -0,0 +1,23 @@
+import { spawn } from 'child_process';
+import { platform } from 'os';
+import type { TTSProvider } from './index';
+
+/**
+ * TTS provider that shells out to the macOS `say` command at /usr/bin/say.
+ */
+export class MacOSSay implements TTSProvider {
+  readonly name = 'macos-say';
+
+  /** Voice used when none is requested, and as the retry voice when the requested one fails. */
+  private static readonly DEFAULT_VOICE = 'Samantha';
+
+  /** @returns true only when the current platform is darwin. */
+  isAvailable(): boolean {
+    return platform() === 'darwin';
+  }
+
+  /**
+   * Speaks `text`, resolving when `say` has finished.
+   *
+   * Callers may hand every provider the same voice identifier (for example an
+   * ElevenLabs voice ID), which `say` rejects with a non-zero exit code. In that
+   * case the text is spoken once more with the built-in default voice instead
+   * of failing outright.
+   *
+   * @param text - The text to speak.
+   * @param voiceId - Name of a `say` voice; defaults to "Samantha".
+   * @throws Error when `say` cannot be started or exits non-zero (after the retry, if any).
+   */
+  async speak(text: string, voiceId?: string): Promise<void> {
+    const requested = voiceId || MacOSSay.DEFAULT_VOICE;
+
+    let code = await this.run(requested, text);
+    // Retry only on a real non-zero exit status: a null code means the process
+    // was ended by a signal, and repeating the speech would be wrong there.
+    if (code !== 0 && code !== null && requested !== MacOSSay.DEFAULT_VOICE) {
+      console.error(`say exited with code ${code} for voice "${requested}"; retrying with "${MacOSSay.DEFAULT_VOICE}"`);
+      code = await this.run(MacOSSay.DEFAULT_VOICE, text);
+    }
+
+    if (code !== 0) {
+      throw new Error(`say exited with code ${code}`);
+    }
+  }
+
+  /** Runs `say -v <voice> <text>`; resolves with the exit code, rejects if the process cannot start. */
+  private run(voice: string, text: string): Promise<number | null> {
+    return new Promise((resolve, reject) => {
+      const proc = spawn('/usr/bin/say', ['-v', voice, text]);
+      proc.on('error', reject);
+      proc.on('exit', (code) => resolve(code));
+    });
+  }
+}
diff --git a/.claude/voice-server/providers/Piper.ts b/.claude/voice-server/providers/Piper.ts
@@ -0,0 +1,108 @@
+import { spawn, spawnSync } from 'child_process';
+import { existsSync, readFileSync } from 'fs';
+import { join } from 'path';
+import { platform, release } from 'os';
+import type { TTSProvider } from './index';
+
+// True when the platform is Linux and the kernel release string contains
+// "microsoft"; this is the signal this module uses to detect WSL.
+const IS_WSL = platform() === 'linux' && release().toLowerCase().includes('microsoft');
+
+/** One entry of the `voices` map read from voices.json. */
+interface VoiceConfig {
+  // Model base name; `<model>.onnx` is looked up inside the models directory.
+  model: string;
+  // Value passed to the piper binary via `--speaker`.
+  speaker: number;
+}
+
+/**
+ * TTS provider that runs a local Piper binary and plays the resulting audio.
+ *
+ * Optional settings come from `<baseDir>/voices.json`: `piper.binary` and
+ * `piper.models_dir` (both relative to baseDir) and a `voices` map from voice
+ * ID to model/speaker.
+ */
+export class Piper implements TTSProvider {
+  readonly name = 'piper';
+  private baseDir: string;
+  private binary: string;
+  private modelsDir: string;
+  private voices: Record<string, VoiceConfig> = {};
+
+  /**
+   * @param baseDir - Directory holding voices.json, the piper binary and the
+   *   models; defaults to the parent of this module's directory.
+   */
+  constructor(baseDir?: string) {
+    this.baseDir = baseDir || join(import.meta.dir, '..');
+    this.binary = join(this.baseDir, 'piper-bin/piper/piper');
+    this.modelsDir = join(this.baseDir, 'piper-voices');
+
+    const configPath = join(this.baseDir, 'voices.json');
+    if (!existsSync(configPath)) return;
+
+    // A broken voices.json must not throw out of the constructor: the server
+    // builds providers while choosing a fallback, so keep the defaults instead.
+    try {
+      const config = JSON.parse(readFileSync(configPath, 'utf-8'));
+      const binary = join(this.baseDir, config?.piper?.binary || 'piper-bin/piper/piper');
+      const modelsDir = join(this.baseDir, config?.piper?.models_dir || 'piper-voices');
+      this.binary = binary;
+      this.modelsDir = modelsDir;
+      this.voices = config?.voices || {};
+    } catch (error) {
+      console.error(`Piper: ignoring unreadable ${configPath}:`, error);
+    }
+  }
+
+  /** @returns true when the configured piper binary exists on disk. */
+  isAvailable(): boolean {
+    return existsSync(this.binary);
+  }
+
+  /**
+   * Synthesizes `text` with Piper and plays it, resolving when playback ends.
+   *
+   * @param text - The text to speak; it is fed to piper on stdin.
+   * @param voiceId - Key into the `voices` map. An unknown or missing ID falls
+   *   back to the configured `default` voice, then to a built-in model.
+   * @throws Error when the model file is missing, piper fails, or the audio
+   *   player cannot be started or exits non-zero.
+   */
+  async speak(text: string, voiceId?: string): Promise<void> {
+    const builtIn: VoiceConfig = { model: 'en_US-libritts_r-medium', speaker: 0 };
+    // Callers may pass IDs meant for another provider, so an unknown ID should
+    // still honour the configured default voice before the built-in one.
+    const voice = (voiceId ? this.voices[voiceId] : undefined) || this.voices['default'] || builtIn;
+    const modelPath = join(this.modelsDir, `${voice.model}.onnx`);
+
+    if (!existsSync(modelPath)) {
+      throw new Error(`Piper model not found: ${modelPath}`);
+    }
+
+    // NOTE(review): spawnSync blocks the event loop for the whole synthesis.
+    const result = spawnSync(this.binary, [
+      '--model', modelPath,
+      '--speaker', String(voice.speaker ?? 0),
+      '--output-raw'
+    ], { input: text, maxBuffer: 10 * 1024 * 1024 });
+
+    if (result.error) throw new Error(`Piper error: ${result.error.message}`);
+    if (result.status !== 0) throw new Error(`Piper failed: ${result.stderr?.toString()}`);
+
+    const wavBuffer = this.pcmToWav(result.stdout);
+    await this.playAudio(wavBuffer);
+  }
+
+  /**
+   * Prepends a 44-byte RIFF/WAVE header to raw PCM samples.
+   *
+   * The header declares PCM, 1 channel, 22050 Hz, 16 bits per sample.
+   * NOTE(review): assumes the selected model really emits 22050 Hz mono 16-bit
+   * audio - confirm for models other than the built-in one.
+   */
+  private pcmToWav(pcm: Buffer): Buffer {
+    const header = Buffer.alloc(44);
+    header.write('RIFF', 0);
+    header.writeUInt32LE(36 + pcm.length, 4); // RIFF chunk size = rest of header + data
+    header.write('WAVE', 8);
+    header.write('fmt ', 12);
+    header.writeUInt32LE(16, 16);             // fmt chunk size
+    header.writeUInt16LE(1, 20);              // audio format 1 = PCM
+    header.writeUInt16LE(1, 22);              // channels
+    header.writeUInt32LE(22050, 24);          // sample rate
+    header.writeUInt32LE(44100, 28);          // byte rate = 22050 * 1 channel * 2 bytes
+    header.writeUInt16LE(2, 32);              // block align
+    header.writeUInt16LE(16, 34);             // bits per sample
+    header.write('data', 36);
+    header.writeUInt32LE(pcm.length, 40);
+    return Buffer.concat([header, pcm]);
+  }
+
+  /**
+   * Writes the WAV data to a temp file and plays it with the player for the
+   * current environment: PowerShell's Media.SoundPlayer under WSL, afplay on
+   * macOS, aplay elsewhere.
+   */
+  private async playAudio(wav: Buffer): Promise<void> {
+    // One stamp for every derived path: computing Date.now() separately for the
+    // Windows and WSL spellings of the same file can yield two different names.
+    const stamp = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
+
+    if (IS_WSL) {
+      const fileName = `piper_${stamp}.wav`;
+      const winPath = `C:\\Users\\Public\\${fileName}`;
+      const wslPath = `/mnt/c/Users/Public/${fileName}`;
+      await Bun.write(wslPath, wav);
+
+      await this.runPlayer('powershell.exe', ['-NoProfile', '-Command',
+        `(New-Object Media.SoundPlayer '${winPath}').PlaySync(); Remove-Item '${winPath}'`],
+        wslPath, 'powershell');
+      return;
+    }
+
+    const tempFile = `/tmp/voice-${stamp}.wav`;
+    await Bun.write(tempFile, wav);
+
+    if (platform() === 'darwin') {
+      await this.runPlayer('/usr/bin/afplay', [tempFile], tempFile, 'afplay');
+    } else {
+      await this.runPlayer('aplay', ['-q', tempFile], tempFile, 'aplay');
+    }
+  }
+
+  /**
+   * Runs an audio player to completion and removes `cleanupPath` afterwards,
+   * whether the player exited or failed to start.
+   *
+   * @param label - Player name used in the "<label> exited <code>" error.
+   */
+  private runPlayer(command: string, args: string[], cleanupPath: string, label: string): Promise<void> {
+    return new Promise((resolve, reject) => {
+      let settled = false;
+      const finish = (error?: Error): void => {
+        if (settled) return;
+        settled = true;
+        const cleanup = spawn('/bin/rm', ['-f', cleanupPath]);
+        // Cleanup is best-effort; an unhandled 'error' event would crash the process.
+        cleanup.on('error', () => {});
+        error ? reject(error) : resolve();
+      };
+
+      const proc = spawn(command, args);
+      proc.on('error', (error) => finish(error));
+      proc.on('exit', (code) => {
+        finish(code === 0 ? undefined : new Error(`${label} exited ${code}`));
+      });
+    });
+  }
+}
diff --git a/.claude/voice-server/providers/index.ts b/.claude/voice-server/providers/index.ts
@@ -0,0 +1,9 @@
+/** Contract implemented by every text-to-speech backend exported from this module. */
+export interface TTSProvider {
+  /** Identifier of the provider, e.g. the key used for it in config.json. */
+  readonly name: string;
+  /** Synchronous check of whether this provider can be used in the current environment. */
+  isAvailable(): boolean;
+  /**
+   * Speaks `text`; the returned promise settles when the provider is done.
+   * `voiceId` is optional and its meaning is provider-specific.
+   */
+  speak(text: string, voiceId?: string): Promise<void>;
+}
+
+export { ElevenLabs } from './ElevenLabs';
+export { Piper } from './Piper';
+export { MacOSSay } from './MacOSSay';
diff --git a/.claude/voice-server/server.ts b/.claude/voice-server/server.ts
@@ -7,7 +7,8 @@ import { serve } from "bun";
 import { spawn } from "child_process";
 import { homedir } from "os";
 import { join } from "path";
-import { existsSync } from "fs";
+import { existsSync, readFileSync } from "fs";
+import { ElevenLabs, Piper, MacOSSay, type TTSProvider } from "./providers";
 
 // Load .env from user home directory
 const envPath = join(homedir(), '.env');
@@ -24,9 +25,32 @@ if (existsSync(envPath)) {
 const PORT = parseInt(process.env.PORT || "8888");
 const ELEVENLABS_API_KEY = process.env.ELEVENLABS_API_KEY;
 
-if (!ELEVENLABS_API_KEY) {
-  console.error('⚠️  ELEVENLABS_API_KEY not found in ~/.env');
-  console.error('Add: ELEVENLABS_API_KEY=your_key_here');
+// Load TTS provider from config.
+// Registry of provider factories, keyed by the names accepted in config.json.
+const PROVIDERS: Record<string, () => TTSProvider> = {
+  'elevenlabs': () => new ElevenLabs(),
+  'piper': () => new Piper(),
+  'macos-say': () => new MacOSSay(),
+};
+
+// The first provider listed in config.json that reports isAvailable() wins;
+// `provider` stays null when config.json is missing, unreadable, or lists
+// nothing usable, in which case the direct ElevenLabs path is used instead.
+let provider: TTSProvider | null = null;
+const configPath = join(import.meta.dir, 'config.json');
+if (existsSync(configPath)) {
+  try {
+    const config = JSON.parse(readFileSync(configPath, 'utf-8'));
+    const names: unknown[] = Array.isArray(config?.providers) ? config.providers : [];
+
+    for (const name of names) {
+      // Own-property check so names like "constructor" cannot hit Object.prototype.
+      const known = typeof name === 'string' && Object.prototype.hasOwnProperty.call(PROVIDERS, name);
+      if (!known) {
+        console.error(`⚠️  Unknown TTS provider in config.json: ${String(name)}`);
+        continue;
+      }
+
+      // A provider that throws while being constructed or probed must not stop
+      // the fallback chain; move on to the next name.
+      try {
+        const candidate = PROVIDERS[name as string]();
+        if (candidate.isAvailable()) {
+          provider = candidate;
+          break;
+        }
+      } catch (error) {
+        console.error(`⚠️  TTS provider "${String(name)}" failed to initialize:`, error);
+      }
+    }
+  } catch (error) {
+    // A malformed config.json should not prevent the server from starting.
+    console.error(`⚠️  Could not read ${configPath}:`, error);
+  }
+}
+
+// Warn at startup when no configured provider was selected and there is no
+// ELEVENLABS_API_KEY for the direct ElevenLabs fallback either.
+if (!provider && !ELEVENLABS_API_KEY) {
+  console.error('⚠️  No TTS provider available');
+  console.error('Configure providers in config.json or add ELEVENLABS_API_KEY to ~/.env');
 }
 
 // Default voice ID (Kai's voice)
@@ -174,14 +198,18 @@ async function sendNotification(
   const safeTitle = sanitizeForShell(title);
   const safeMessage = sanitizeForShell(message);
 
-  // Generate and play voice using ElevenLabs
-  if (voiceEnabled && ELEVENLABS_API_KEY) {
+  // Generate and play voice
+  if (voiceEnabled) {
     try {
       const voice = voiceId || DEFAULT_VOICE_ID;
-      console.log(`🎙️  Generating speech with ElevenLabs (voice: ${voice})`);
-
-      const audioBuffer = await generateSpeech(safeMessage, voice);
-      await playAudio(audioBuffer);
+      if (provider) {
+        console.log(`🎙️  Generating speech with ${provider.name} (voice: ${voice})`);
+        await provider.speak(safeMessage, voice);
+      } else if (ELEVENLABS_API_KEY) {
+        console.log(`🎙️  Generating speech with ElevenLabs (voice: ${voice})`);
+        const audioBuffer = await generateSpeech(safeMessage, voice);
+        await playAudio(audioBuffer);
+      }
     } catch (error) {
       console.error("Failed to generate/play speech:", error);
     }
@@ -315,9 +343,7 @@ const server = serve({
         JSON.stringify({
           status: "healthy",
           port: PORT,
-          voice_system: "ElevenLabs",
-          model: DEFAULT_MODEL,
-          default_voice_id: DEFAULT_VOICE_ID,
+          provider: provider?.name || "elevenlabs",
           api_key_configured: !!ELEVENLABS_API_KEY
         }),
         {
@@ -335,7 +361,5 @@ const server = serve({
 });
 
 console.log(`🚀 PAIVoice Server running on port ${PORT}`);
-console.log(`🎙️  Using ElevenLabs TTS (model: ${DEFAULT_MODEL}, voice: ${DEFAULT_VOICE_ID})`);
+console.log(`🎙️  TTS Provider: ${provider?.name || 'elevenlabs'}`);
 console.log(`📡 POST to http://localhost:${PORT}/notify`);
-console.log(`🔒 Security: CORS restricted to localhost, rate limiting enabled`);
-console.log(`🔑 API Key: ${ELEVENLABS_API_KEY ? '✅ Configured' : '❌ Missing'}`);

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+{`
	`2`	`+ "providers": ["piper", "elevenlabs", "macos-say"]`
	`3`	`+}`