Skip to content

Commit c9bbfe1

Browse files
refactor: address PR review comments
- TTSProvider.synthesize() returns AudioResult, server calls playAudio()
- Provider registry moved to providers/index.ts with loadProvider()
- Piper uses native WAV output (removed pcmToWav)
- MacOSSay outputs to file then returns buffer
- Removed duplicate generateSpeech/playAudio from server.ts
- Clean separation: providers produce audio, audio.ts plays it
1 parent 8bb7dd1 commit c9bbfe1

File tree

5 files changed

+88
-140
lines changed

5 files changed

+88
-140
lines changed

.claude/voice-server/providers/ElevenLabs.ts

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
import type { TTSProvider } from '.';
2-
import { playAudio } from '../audio';
1+
import type { TTSProvider, AudioResult } from '.';
32

43
export class ElevenLabs implements TTSProvider {
54
readonly name = 'elevenlabs';
@@ -12,7 +11,7 @@ export class ElevenLabs implements TTSProvider {
1211
return !!this.apiKey;
1312
}
1413

15-
async speak(text: string, voiceId?: string): Promise<void> {
14+
async synthesize(text: string, voiceId?: string): Promise<AudioResult> {
1615
if (!this.apiKey) {
1716
throw new Error('ElevenLabs API key not configured');
1817
}
@@ -46,6 +45,6 @@ export class ElevenLabs implements TTSProvider {
4645
}
4746

4847
const audioBuffer = await response.arrayBuffer();
49-
await playAudio(Buffer.from(audioBuffer), 'mp3');
48+
return { audio: Buffer.from(audioBuffer), format: 'mp3' };
5049
}
5150
}
.claude/voice-server/providers/MacOSSay.ts

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
import { spawn } from 'child_process';
1+
import { spawnSync } from 'child_process';
2+
import { readFileSync, unlinkSync } from 'fs';
23
import { platform } from 'os';
3-
import type { TTSProvider } from './index';
4+
import type { TTSProvider, AudioResult } from '.';
45

56
export class MacOSSay implements TTSProvider {
67
readonly name = 'macos-say';
@@ -9,15 +10,19 @@ export class MacOSSay implements TTSProvider {
910
return platform() === 'darwin';
1011
}
1112

12-
async speak(text: string, voiceId?: string): Promise<void> {
13+
async synthesize(text: string, voiceId?: string): Promise<AudioResult> {
1314
const voice = voiceId || 'Samantha';
15+
const outputFile = `/tmp/say-${Date.now()}.aiff`;
1416

15-
return new Promise((resolve, reject) => {
16-
const proc = spawn('/usr/bin/say', ['-v', voice, text]);
17-
proc.on('error', reject);
18-
proc.on('exit', (code) => {
19-
code === 0 ? resolve() : reject(new Error(`say exited with code ${code}`));
20-
});
21-
});
17+
const result = spawnSync('/usr/bin/say', ['-v', voice, '-o', outputFile, text]);
18+
19+
if (result.error) throw new Error(`say error: ${result.error.message}`);
20+
if (result.status !== 0) throw new Error(`say exited with code ${result.status}`);
21+
22+
const audio = readFileSync(outputFile);
23+
unlinkSync(outputFile);
24+
25+
// macOS say outputs AIFF, which afplay can handle
26+
return { audio, format: 'wav' }; // AIFF is close enough to WAV for playback
2227
}
2328
}

.claude/voice-server/providers/Piper.ts

Lines changed: 11 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
import { spawnSync } from 'child_process';
2-
import { existsSync, readFileSync } from 'fs';
2+
import { existsSync, readFileSync, unlinkSync } from 'fs';
33
import { join } from 'path';
4-
import type { TTSProvider } from '.';
5-
import { playAudio } from '../audio';
4+
import type { TTSProvider, AudioResult } from '.';
65

76
interface VoiceConfig {
87
model: string;
@@ -35,42 +34,30 @@ export class Piper implements TTSProvider {
3534
return existsSync(this.binary);
3635
}
3736

38-
async speak(text: string, voiceId?: string): Promise<void> {
37+
async synthesize(text: string, voiceId?: string): Promise<AudioResult> {
3938
const voice = this.voices[voiceId || 'default'] || { model: 'en_US-libritts_r-medium', speaker: 0 };
4039
const modelPath = join(this.modelsDir, `${voice.model}.onnx`);
4140

4241
if (!existsSync(modelPath)) {
4342
throw new Error(`Piper model not found: ${modelPath}`);
4443
}
4544

45+
// Use Piper's native WAV output instead of manual PCM conversion
46+
const outputFile = `/tmp/piper-${Date.now()}.wav`;
47+
4648
const result = spawnSync(this.binary, [
4749
'--model', modelPath,
4850
'--speaker', voice.speaker.toString(),
49-
'--output-raw'
51+
'--output_file', outputFile,
52+
'--quiet'
5053
], { input: text, maxBuffer: 10 * 1024 * 1024 });
5154

5255
if (result.error) throw new Error(`Piper error: ${result.error.message}`);
5356
if (result.status !== 0) throw new Error(`Piper failed: ${result.stderr?.toString()}`);
5457

55-
const wavBuffer = this.pcmToWav(result.stdout);
56-
await playAudio(wavBuffer, 'wav');
57-
}
58+
const audio = readFileSync(outputFile);
59+
unlinkSync(outputFile);
5860

59-
private pcmToWav(pcm: Buffer): Buffer {
60-
const header = Buffer.alloc(44);
61-
header.write('RIFF', 0);
62-
header.writeUInt32LE(36 + pcm.length, 4);
63-
header.write('WAVE', 8);
64-
header.write('fmt ', 12);
65-
header.writeUInt32LE(16, 16);
66-
header.writeUInt16LE(1, 20);
67-
header.writeUInt16LE(1, 22);
68-
header.writeUInt32LE(22050, 24);
69-
header.writeUInt32LE(44100, 28);
70-
header.writeUInt16LE(2, 32);
71-
header.writeUInt16LE(16, 34);
72-
header.write('data', 36);
73-
header.writeUInt32LE(pcm.length, 40);
74-
return Buffer.concat([header, pcm]);
61+
return { audio, format: 'wav' };
7562
}
7663
}
.claude/voice-server/providers/index.ts

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,59 @@
1+
import { existsSync, readFileSync } from 'fs';
2+
import { join } from 'path';
3+
4+
export type AudioFormat = 'mp3' | 'wav';
5+
6+
export interface AudioResult {
7+
audio: Buffer;
8+
format: AudioFormat;
9+
}
10+
111
export interface TTSProvider {
212
readonly name: string;
313
isAvailable(): boolean;
4-
speak(text: string, voiceId?: string): Promise<void>;
14+
synthesize(text: string, voiceId?: string): Promise<AudioResult>;
515
}
616

717
export { ElevenLabs } from './ElevenLabs';
818
export { Piper } from './Piper';
919
export { MacOSSay } from './MacOSSay';
20+
21+
// Provider registry - maps config names to provider constructors
22+
const providerConstructors: Record<string, () => TTSProvider> = {
23+
'elevenlabs': () => new (require('./ElevenLabs').ElevenLabs)(),
24+
'piper': () => new (require('./Piper').Piper)(),
25+
'macos-say': () => new (require('./MacOSSay').MacOSSay)(),
26+
};
27+
28+
/**
29+
* Load the first available TTS provider based on config.json order.
30+
* Falls back to ElevenLabs if no config found.
31+
*/
32+
export function loadProvider(configDir: string): TTSProvider | null {
33+
const configPath = join(configDir, 'config.json');
34+
35+
let providerOrder = ['elevenlabs', 'piper', 'macos-say'];
36+
37+
if (existsSync(configPath)) {
38+
try {
39+
const config = JSON.parse(readFileSync(configPath, 'utf-8'));
40+
if (Array.isArray(config.providers)) {
41+
providerOrder = config.providers;
42+
}
43+
} catch {
44+
// Use default order on parse error
45+
}
46+
}
47+
48+
for (const name of providerOrder) {
49+
const constructor = providerConstructors[name];
50+
if (constructor) {
51+
const provider = constructor();
52+
if (provider.isAvailable()) {
53+
return provider;
54+
}
55+
}
56+
}
57+
58+
return null;
59+
}

.claude/voice-server/server.ts

Lines changed: 8 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ import { spawn } from "child_process";
88
import { homedir } from "os";
99
import { join } from "path";
1010
import { existsSync, readFileSync } from "fs";
11-
import { ElevenLabs, Piper, MacOSSay, type TTSProvider } from "./providers";
11+
import { loadProvider } from "./providers";
12+
import { playAudio } from "./audio";
1213

1314
// Load .env from user home directory
1415
const envPath = join(homedir(), '.env');
@@ -25,28 +26,8 @@ if (existsSync(envPath)) {
2526
const PORT = parseInt(process.env.PORT || "8888");
2627
const ELEVENLABS_API_KEY = process.env.ELEVENLABS_API_KEY;
2728

28-
// Load TTS provider from config
29-
const PROVIDERS: Record<string, () => TTSProvider> = {
30-
'elevenlabs': () => new ElevenLabs(),
31-
'piper': () => new Piper(),
32-
'macos-say': () => new MacOSSay(),
33-
};
34-
35-
let provider: TTSProvider | null = null;
36-
const configPath = join(import.meta.dir, 'config.json');
37-
if (existsSync(configPath)) {
38-
const config = JSON.parse(readFileSync(configPath, 'utf-8'));
39-
for (const name of config.providers || []) {
40-
const create = PROVIDERS[name];
41-
if (create) {
42-
const p = create();
43-
if (p.isAvailable()) {
44-
provider = p;
45-
break;
46-
}
47-
}
48-
}
49-
}
29+
// Load TTS provider based on config.json order
30+
const provider = loadProvider(import.meta.dir);
5031

5132
if (!provider && !ELEVENLABS_API_KEY) {
5233
console.error('⚠️ No TTS provider available');
@@ -56,10 +37,6 @@ if (!provider && !ELEVENLABS_API_KEY) {
5637
// Default voice ID (Kai's voice)
5738
const DEFAULT_VOICE_ID = process.env.ELEVENLABS_VOICE_ID || "s3TPKV1kjDlVtZbl4Ksh";
5839

59-
// Default model - eleven_multilingual_v2 is the current recommended model
60-
// See: https://elevenlabs.io/docs/models#models-overview
61-
const DEFAULT_MODEL = process.env.ELEVENLABS_MODEL || "eleven_multilingual_v2";
62-
6340
// Sanitize input for shell commands
6441
function sanitizeForShell(input: string): string {
6542
return input.replace(/[^a-zA-Z0-9\s.,!?\-']/g, '').trim().substring(0, 500);
@@ -90,71 +67,6 @@ function validateInput(input: any): { valid: boolean; error?: string } {
9067
return { valid: true };
9168
}
9269

93-
// Generate speech using ElevenLabs API
94-
async function generateSpeech(text: string, voiceId: string): Promise<ArrayBuffer> {
95-
if (!ELEVENLABS_API_KEY) {
96-
throw new Error('ElevenLabs API key not configured');
97-
}
98-
99-
const url = `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`;
100-
101-
const response = await fetch(url, {
102-
method: 'POST',
103-
headers: {
104-
'Accept': 'audio/mpeg',
105-
'Content-Type': 'application/json',
106-
'xi-api-key': ELEVENLABS_API_KEY,
107-
},
108-
body: JSON.stringify({
109-
text: text,
110-
model_id: DEFAULT_MODEL,
111-
voice_settings: {
112-
stability: 0.5,
113-
similarity_boost: 0.5,
114-
},
115-
}),
116-
});
117-
118-
if (!response.ok) {
119-
const errorText = await response.text();
120-
// Check for model-related errors
121-
if (errorText.includes('model') || response.status === 422) {
122-
throw new Error(`ElevenLabs API error: Invalid model "${DEFAULT_MODEL}". Update ELEVENLABS_MODEL in ~/.env. See https://elevenlabs.io/docs/models`);
123-
}
124-
throw new Error(`ElevenLabs API error: ${response.status} - ${errorText}`);
125-
}
126-
127-
return await response.arrayBuffer();
128-
}
129-
130-
// Play audio using afplay (macOS)
131-
async function playAudio(audioBuffer: ArrayBuffer): Promise<void> {
132-
const tempFile = `/tmp/voice-${Date.now()}.mp3`;
133-
134-
// Write audio to temp file
135-
await Bun.write(tempFile, audioBuffer);
136-
137-
return new Promise((resolve, reject) => {
138-
const proc = spawn('/usr/bin/afplay', [tempFile]);
139-
140-
proc.on('error', (error) => {
141-
console.error('Error playing audio:', error);
142-
reject(error);
143-
});
144-
145-
proc.on('exit', (code) => {
146-
// Clean up temp file
147-
spawn('/bin/rm', [tempFile]);
148-
149-
if (code === 0) {
150-
resolve();
151-
} else {
152-
reject(new Error(`afplay exited with code ${code}`));
153-
}
154-
});
155-
});
156-
}
157-
15870
// Spawn a process safely
15971
function spawnSafe(command: string, args: string[]): Promise<void> {
16072
return new Promise((resolve, reject) => {
@@ -199,17 +111,12 @@ async function sendNotification(
199111
const safeMessage = sanitizeForShell(message);
200112

201113
// Generate and play voice
202-
if (voiceEnabled) {
114+
if (voiceEnabled && provider) {
203115
try {
204116
const voice = voiceId || DEFAULT_VOICE_ID;
205-
if (provider) {
206-
console.log(`🎙️ Generating speech with ${provider.name} (voice: ${voice})`);
207-
await provider.speak(safeMessage, voice);
208-
} else if (ELEVENLABS_API_KEY) {
209-
console.log(`🎙️ Generating speech with ElevenLabs (voice: ${voice})`);
210-
const audioBuffer = await generateSpeech(safeMessage, voice);
211-
await playAudio(audioBuffer);
212-
}
117+
console.log(`🎙️ Generating speech with ${provider.name} (voice: ${voice})`);
118+
const result = await provider.synthesize(safeMessage, voice);
119+
await playAudio(result.audio, result.format);
213120
} catch (error) {
214121
console.error("Failed to generate/play speech:", error);
215122
}

0 commit comments

Comments
 (0)