refactor: address PR review comments

zitongcharliedeng · zitongcharliedeng · commit c9bbfe1b5c04 · 2025-12-09T18:32:49.000Z
- TTSProvider.synthesize() returns AudioResult, server calls playAudio()
- Provider registry moved to providers/index.ts with loadProvider()
- Piper uses native WAV output (removed pcmToWav)
- MacOSSay outputs to file then returns buffer
- Removed duplicate generateSpeech/playAudio from server.ts
- Clean separation: providers produce audio, audio.ts plays it
diff --git a/.claude/voice-server/providers/ElevenLabs.ts b/.claude/voice-server/providers/ElevenLabs.ts
@@ -1,5 +1,4 @@
-import type { TTSProvider } from '.';
-import { playAudio } from '../audio';
+import type { TTSProvider, AudioResult } from '.';
 
 export class ElevenLabs implements TTSProvider {
   readonly name = 'elevenlabs';
@@ -12,7 +11,7 @@ export class ElevenLabs implements TTSProvider {
     return !!this.apiKey;
   }
 
-  async speak(text: string, voiceId?: string): Promise<void> {
+  async synthesize(text: string, voiceId?: string): Promise<AudioResult> {
     if (!this.apiKey) {
       throw new Error('ElevenLabs API key not configured');
     }
@@ -46,6 +45,6 @@ export class ElevenLabs implements TTSProvider {
     }
 
     const audioBuffer = await response.arrayBuffer();
-    await playAudio(Buffer.from(audioBuffer), 'mp3');
+    return { audio: Buffer.from(audioBuffer), format: 'mp3' };
   }
 }
diff --git a/.claude/voice-server/providers/MacOSSay.ts b/.claude/voice-server/providers/MacOSSay.ts
@@ -1,6 +1,7 @@
-import { spawn } from 'child_process';
+import { spawnSync } from 'child_process';
+import { readFileSync, unlinkSync } from 'fs';
 import { platform } from 'os';
-import type { TTSProvider } from './index';
+import type { TTSProvider, AudioResult } from '.';
 
 export class MacOSSay implements TTSProvider {
   readonly name = 'macos-say';
@@ -9,15 +10,19 @@ export class MacOSSay implements TTSProvider {
     return platform() === 'darwin';
   }
 
-  async speak(text: string, voiceId?: string): Promise<void> {
+  async synthesize(text: string, voiceId?: string): Promise<AudioResult> {
     const voice = voiceId || 'Samantha';
+    const outputFile = `/tmp/say-${Date.now()}.aiff`;
 
-    return new Promise((resolve, reject) => {
-      const proc = spawn('/usr/bin/say', ['-v', voice, text]);
-      proc.on('error', reject);
-      proc.on('exit', (code) => {
-        code === 0 ? resolve() : reject(new Error(`say exited with code ${code}`));
-      });
-    });
+    const result = spawnSync('/usr/bin/say', ['-v', voice, '-o', outputFile, text]);
+
+    if (result.error) throw new Error(`say error: ${result.error.message}`);
+    if (result.status !== 0) throw new Error(`say exited with code ${result.status}`);
+
+    const audio = readFileSync(outputFile);
+    unlinkSync(outputFile);
+
+    // macOS say writes AIFF; AudioFormat has no AIFF member, so it is tagged 'wav' below (afplay can handle it)
+    return { audio, format: 'wav' }; // AIFF is close enough to WAV for playback
   }
 }
diff --git a/.claude/voice-server/providers/Piper.ts b/.claude/voice-server/providers/Piper.ts
@@ -1,8 +1,7 @@
 import { spawnSync } from 'child_process';
-import { existsSync, readFileSync } from 'fs';
+import { existsSync, readFileSync, unlinkSync } from 'fs';
 import { join } from 'path';
-import type { TTSProvider } from '.';
-import { playAudio } from '../audio';
+import type { TTSProvider, AudioResult } from '.';
 
 interface VoiceConfig {
   model: string;
@@ -35,42 +34,30 @@ export class Piper implements TTSProvider {
     return existsSync(this.binary);
   }
 
-  async speak(text: string, voiceId?: string): Promise<void> {
+  async synthesize(text: string, voiceId?: string): Promise<AudioResult> {
     const voice = this.voices[voiceId || 'default'] || { model: 'en_US-libritts_r-medium', speaker: 0 };
     const modelPath = join(this.modelsDir, `${voice.model}.onnx`);
 
     if (!existsSync(modelPath)) {
       throw new Error(`Piper model not found: ${modelPath}`);
     }
 
+    // Use Piper's native WAV output instead of manual PCM conversion
+    const outputFile = `/tmp/piper-${Date.now()}.wav`;
+
     const result = spawnSync(this.binary, [
       '--model', modelPath,
       '--speaker', voice.speaker.toString(),
-      '--output-raw'
+      '--output_file', outputFile,
+      '--quiet'
     ], { input: text, maxBuffer: 10 * 1024 * 1024 });
 
     if (result.error) throw new Error(`Piper error: ${result.error.message}`);
     if (result.status !== 0) throw new Error(`Piper failed: ${result.stderr?.toString()}`);
 
-    const wavBuffer = this.pcmToWav(result.stdout);
-    await playAudio(wavBuffer, 'wav');
-  }
+    const audio = readFileSync(outputFile);
+    unlinkSync(outputFile);
 
-  private pcmToWav(pcm: Buffer): Buffer {
-    const header = Buffer.alloc(44);
-    header.write('RIFF', 0);
-    header.writeUInt32LE(36 + pcm.length, 4);
-    header.write('WAVE', 8);
-    header.write('fmt ', 12);
-    header.writeUInt32LE(16, 16);
-    header.writeUInt16LE(1, 20);
-    header.writeUInt16LE(1, 22);
-    header.writeUInt32LE(22050, 24);
-    header.writeUInt32LE(44100, 28);
-    header.writeUInt16LE(2, 32);
-    header.writeUInt16LE(16, 34);
-    header.write('data', 36);
-    header.writeUInt32LE(pcm.length, 40);
-    return Buffer.concat([header, pcm]);
+    return { audio, format: 'wav' };
   }
 }
diff --git a/.claude/voice-server/providers/index.ts b/.claude/voice-server/providers/index.ts
@@ -1,9 +1,59 @@
+import { existsSync, readFileSync } from 'fs';
+import { join } from 'path';
+
+export type AudioFormat = 'mp3' | 'wav';
+
+export interface AudioResult {
+  audio: Buffer;
+  format: AudioFormat;
+}
+
 export interface TTSProvider {
   readonly name: string;
   isAvailable(): boolean;
-  speak(text: string, voiceId?: string): Promise<void>;
+  synthesize(text: string, voiceId?: string): Promise<AudioResult>;
 }
 
 export { ElevenLabs } from './ElevenLabs';
 export { Piper } from './Piper';
 export { MacOSSay } from './MacOSSay';
+
+// Provider registry - maps config names to provider constructors
+const providerConstructors: Record<string, () => TTSProvider> = {
+  'elevenlabs': () => new (require('./ElevenLabs').ElevenLabs)(),
+  'piper': () => new (require('./Piper').Piper)(),
+  'macos-say': () => new (require('./MacOSSay').MacOSSay)(),
+};
+
+/**
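+ * Load the first available TTS provider, trying names in config.json order; returns null if none is available.
+ * Falls back to the default order (elevenlabs, piper, macos-say) if config.json is missing, unparsable, or has no `providers` array.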
+ */
+export function loadProvider(configDir: string): TTSProvider | null {
+  const configPath = join(configDir, 'config.json');
+
+  let providerOrder = ['elevenlabs', 'piper', 'macos-say'];
+
+  if (existsSync(configPath)) {
+    try {
+      const config = JSON.parse(readFileSync(configPath, 'utf-8'));
+      if (Array.isArray(config.providers)) {
+        providerOrder = config.providers;
+      }
+    } catch {
+      // Use default order on parse error
+    }
+  }
+
+  for (const name of providerOrder) {
+    const constructor = providerConstructors[name];
+    if (constructor) {
+      const provider = constructor();
+      if (provider.isAvailable()) {
+        return provider;
+      }
+    }
+  }
+
+  return null;
+}
diff --git a/.claude/voice-server/server.ts b/.claude/voice-server/server.ts
@@ -8,7 +8,8 @@ import { spawn } from "child_process";
 import { homedir } from "os";
 import { join } from "path";
 import { existsSync, readFileSync } from "fs";
-import { ElevenLabs, Piper, MacOSSay, type TTSProvider } from "./providers";
+import { loadProvider } from "./providers";
+import { playAudio } from "./audio";
 
 // Load .env from user home directory
 const envPath = join(homedir(), '.env');
@@ -25,28 +26,8 @@ if (existsSync(envPath)) {
 const PORT = parseInt(process.env.PORT || "8888");
 const ELEVENLABS_API_KEY = process.env.ELEVENLABS_API_KEY;
 
-// Load TTS provider from config
-const PROVIDERS: Record<string, () => TTSProvider> = {
-  'elevenlabs': () => new ElevenLabs(),
-  'piper': () => new Piper(),
-  'macos-say': () => new MacOSSay(),
-};
-
-let provider: TTSProvider | null = null;
-const configPath = join(import.meta.dir, 'config.json');
-if (existsSync(configPath)) {
-  const config = JSON.parse(readFileSync(configPath, 'utf-8'));
-  for (const name of config.providers || []) {
-    const create = PROVIDERS[name];
-    if (create) {
-      const p = create();
-      if (p.isAvailable()) {
-        provider = p;
-        break;
-      }
-    }
-  }
-}
+// Load TTS provider based on config.json order
+const provider = loadProvider(import.meta.dir);
 
 if (!provider && !ELEVENLABS_API_KEY) {
   console.error('⚠️  No TTS provider available');
@@ -56,10 +37,6 @@ if (!provider && !ELEVENLABS_API_KEY) {
 // Default voice ID (Kai's voice)
 const DEFAULT_VOICE_ID = process.env.ELEVENLABS_VOICE_ID || "s3TPKV1kjDlVtZbl4Ksh";
 
-// Default model - eleven_multilingual_v2 is the current recommended model
-// See: https://elevenlabs.io/docs/models#models-overview
-const DEFAULT_MODEL = process.env.ELEVENLABS_MODEL || "eleven_multilingual_v2";
-
 // Sanitize input for shell commands
 function sanitizeForShell(input: string): string {
   return input.replace(/[^a-zA-Z0-9\s.,!?\-']/g, '').trim().substring(0, 500);
@@ -90,71 +67,6 @@ function validateInput(input: any): { valid: boolean; error?: string } {
   return { valid: true };
 }
 
-// Generate speech using ElevenLabs API
-async function generateSpeech(text: string, voiceId: string): Promise<ArrayBuffer> {
-  if (!ELEVENLABS_API_KEY) {
-    throw new Error('ElevenLabs API key not configured');
-  }
-
-  const url = `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`;
-
-  const response = await fetch(url, {
-    method: 'POST',
-    headers: {
-      'Accept': 'audio/mpeg',
-      'Content-Type': 'application/json',
-      'xi-api-key': ELEVENLABS_API_KEY,
-    },
-    body: JSON.stringify({
-      text: text,
-      model_id: DEFAULT_MODEL,
-      voice_settings: {
-        stability: 0.5,
-        similarity_boost: 0.5,
-      },
-    }),
-  });
-
-  if (!response.ok) {
-    const errorText = await response.text();
-    // Check for model-related errors
-    if (errorText.includes('model') || response.status === 422) {
-      throw new Error(`ElevenLabs API error: Invalid model "${DEFAULT_MODEL}". Update ELEVENLABS_MODEL in ~/.env. See https://elevenlabs.io/docs/models`);
-    }
-    throw new Error(`ElevenLabs API error: ${response.status} - ${errorText}`);
-  }
-
-  return await response.arrayBuffer();
-}
-
-// Play audio using afplay (macOS)
-async function playAudio(audioBuffer: ArrayBuffer): Promise<void> {
-  const tempFile = `/tmp/voice-${Date.now()}.mp3`;
-
-  // Write audio to temp file
-  await Bun.write(tempFile, audioBuffer);
-
-  return new Promise((resolve, reject) => {
-    const proc = spawn('/usr/bin/afplay', [tempFile]);
-
-    proc.on('error', (error) => {
-      console.error('Error playing audio:', error);
-      reject(error);
-    });
-
-    proc.on('exit', (code) => {
-      // Clean up temp file
-      spawn('/bin/rm', [tempFile]);
-
-      if (code === 0) {
-        resolve();
-      } else {
-        reject(new Error(`afplay exited with code ${code}`));
-      }
-    });
-  });
-}
-
 // Spawn a process safely
 function spawnSafe(command: string, args: string[]): Promise<void> {
   return new Promise((resolve, reject) => {
@@ -199,17 +111,12 @@ async function sendNotification(
   const safeMessage = sanitizeForShell(message);
 
   // Generate and play voice
-  if (voiceEnabled) {
+  if (voiceEnabled && provider) {
     try {
       const voice = voiceId || DEFAULT_VOICE_ID;
-      if (provider) {
-        console.log(`🎙️  Generating speech with ${provider.name} (voice: ${voice})`);
-        await provider.speak(safeMessage, voice);
-      } else if (ELEVENLABS_API_KEY) {
-        console.log(`🎙️  Generating speech with ElevenLabs (voice: ${voice})`);
-        const audioBuffer = await generateSpeech(safeMessage, voice);
-        await playAudio(audioBuffer);
-      }
+      console.log(`🎙️  Generating speech with ${provider.name} (voice: ${voice})`);
+      const result = await provider.synthesize(safeMessage, voice);
+      await playAudio(result.audio, result.format);
     } catch (error) {
       console.error("Failed to generate/play speech:", error);
     }

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,4 @@`
`1`		`-import type { TTSProvider } from '.';`
`2`		`-import { playAudio } from '../audio';`
	`1`	`+import type { TTSProvider, AudioResult } from '.';`
`3`	`2`
`4`	`3`	`export class ElevenLabs implements TTSProvider {`
`5`	`4`	`readonly name = 'elevenlabs';`
`@@ -12,7 +11,7 @@ export class ElevenLabs implements TTSProvider {`
`12`	`11`	`return !!this.apiKey;`
`13`	`12`	`}`
`14`	`13`
`15`		`- async speak(text: string, voiceId?: string): Promise<void> {`
	`14`	`+ async synthesize(text: string, voiceId?: string): Promise<AudioResult> {`
`16`	`15`	`if (!this.apiKey) {`
`17`	`16`	`throw new Error('ElevenLabs API key not configured');`
`18`	`17`	`}`
`@@ -46,6 +45,6 @@ export class ElevenLabs implements TTSProvider {`
`46`	`45`	`}`
`47`	`46`
`48`	`47`	`const audioBuffer = await response.arrayBuffer();`
`49`		`- await playAudio(Buffer.from(audioBuffer), 'mp3');`
	`48`	`+ return { audio: Buffer.from(audioBuffer), format: 'mp3' };`
`50`	`49`	`}`
`51`	`50`	`}`