diff --git a/apps/desktop/package.json b/apps/desktop/package.json index ed30b52b4..28e9a406a 100644 --- a/apps/desktop/package.json +++ b/apps/desktop/package.json @@ -45,6 +45,7 @@ "@ai-sdk/openai": "^3.0.1", "@egoist/electron-panel-window": "^8.0.3", "@fastify/cors": "^11.1.0", + "@fastify/multipart": "^9.3.0", "@modelcontextprotocol/sdk": "^1.24.3", "@radix-ui/react-dropdown-menu": "^2.1.16", "@radix-ui/react-scroll-area": "^1.2.9", diff --git a/apps/desktop/src/main/remote-server.ts b/apps/desktop/src/main/remote-server.ts index d7327cd4a..3c3d9696d 100644 --- a/apps/desktop/src/main/remote-server.ts +++ b/apps/desktop/src/main/remote-server.ts @@ -1,9 +1,10 @@ import Fastify, { FastifyInstance } from "fastify" import cors from "@fastify/cors" +import multipart from "@fastify/multipart" import crypto from "crypto" import fs from "fs" import path from "path" -import { configStore, recordingsFolder } from "./config" +import { configStore, recordingsFolder, Config } from "./config" import { diagnosticsService } from "./diagnostics" import { mcpService, MCPToolResult } from "./mcp-service" import { processTranscriptWithAgentMode } from "./llm" @@ -13,6 +14,7 @@ import { AgentProgressUpdate, SessionProfileSnapshot } from "../shared/types" import { agentSessionTracker } from "./agent-session-tracker" import { emergencyStopAll } from "./emergency-stop" import { profileService } from "./profile-service" +import { preprocessTextForTTS, validateTTSText } from "@speakmcp/shared" let server: FastifyInstance | null = null let lastError: string | undefined @@ -96,6 +98,241 @@ function extractUserPrompt(body: any): string | null { } } +// ============================================ +// Audio Processing Helper Functions +// ============================================ + +/** + * Transcribe audio using configured STT provider (OpenAI or Groq Whisper) + */ +async function transcribeAudio( + audioBuffer: Buffer, + filename: string, + options?: { + model?: string + language?: 
string + prompt?: string + response_format?: string + } +): Promise<{ text: string }> { + const config = configStore.get() + + // Determine mime type from filename + const ext = path.extname(filename).toLowerCase() + const mimeTypes: Record<string, string> = { + ".mp3": "audio/mpeg", + ".mp4": "audio/mp4", + ".m4a": "audio/mp4", + ".wav": "audio/wav", + ".webm": "audio/webm", + ".ogg": "audio/ogg", + ".flac": "audio/flac", + } + const mimeType = mimeTypes[ext] || "audio/webm" + + const form = new FormData() + form.append("file", new File([audioBuffer], filename, { type: mimeType })) + + // Use specified model or default based on provider + const model = options?.model || + (config.sttProviderId === "groq" ? "whisper-large-v3-turbo" : "whisper-1") + form.append("model", model) + form.append("response_format", options?.response_format || "json") + + // Add prompt if provided (for Groq) + if (options?.prompt?.trim()) { + form.append("prompt", options.prompt.trim()) + } else if (config.sttProviderId === "groq" && config.groqSttPrompt?.trim()) { + form.append("prompt", config.groqSttPrompt.trim()) + } + + // Add language if specified + const languageCode = options?.language || + (config.sttProviderId === "groq" + ? config.groqSttLanguage || config.sttLanguage + : config.openaiSttLanguage || config.sttLanguage) + + if (languageCode && languageCode !== "auto") { + form.append("language", languageCode) + } + + const groqBaseUrl = config.groqBaseUrl || "https://api.groq.com/openai/v1" + const openaiBaseUrl = config.openaiBaseUrl || "https://api.openai.com/v1" + + const transcriptResponse = await fetch( + config.sttProviderId === "groq" + ? `${groqBaseUrl}/audio/transcriptions` + : `${openaiBaseUrl}/audio/transcriptions`, + { + method: "POST", + headers: { + Authorization: `Bearer ${config.sttProviderId === "groq" ?
config.groqApiKey : config.openaiApiKey}`, + }, + body: form, + } + ) + + if (!transcriptResponse.ok) { + const errorText = await transcriptResponse.text() + throw new Error(`Transcription failed: ${transcriptResponse.statusText} - ${errorText.slice(0, 300)}`) + } + + const result = await transcriptResponse.json() + return { text: result.text || "" } +} + +/** + * Generate speech audio from text using configured TTS provider + */ +async function generateSpeechAudio( + text: string, + options?: { + model?: string + voice?: string + speed?: number + response_format?: string + } +): Promise<{ audio: ArrayBuffer; contentType: string }> { + const config = configStore.get() + const providerId = config.ttsProviderId || "openai" + + // Preprocess text for TTS + let processedText = text + if (config.ttsPreprocessingEnabled !== false) { + const preprocessingOptions = { + removeCodeBlocks: config.ttsRemoveCodeBlocks ?? true, + removeUrls: config.ttsRemoveUrls ?? true, + convertMarkdown: config.ttsConvertMarkdown ?? 
true, + } + processedText = preprocessTextForTTS(text, preprocessingOptions) + } + + // Validate processed text + const validation = validateTTSText(processedText) + if (!validation.isValid) { + throw new Error(`TTS validation failed: ${validation.issues.join(", ")}`) + } + + let audioBuffer: ArrayBuffer + let contentType: string + + if (providerId === "openai") { + const model = options?.model || config.openaiTtsModel || "tts-1" + const voice = options?.voice || config.openaiTtsVoice || "alloy" + const speed = options?.speed || config.openaiTtsSpeed || 1.0 + const responseFormat = options?.response_format || config.openaiTtsResponseFormat || "mp3" + + const baseUrl = config.openaiBaseUrl || "https://api.openai.com/v1" + const apiKey = config.openaiApiKey + + if (!apiKey) { + throw new Error("OpenAI API key is required for TTS") + } + + const response = await fetch(`${baseUrl}/audio/speech`, { + method: "POST", + headers: { + "Authorization": `Bearer ${apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ model, input: processedText, voice, speed, response_format: responseFormat }), + }) + + if (!response.ok) { + const errorText = await response.text() + throw new Error(`OpenAI TTS API error: ${response.statusText} - ${errorText}`) + } + + audioBuffer = await response.arrayBuffer() + contentType = responseFormat === "opus" ? "audio/opus" : + responseFormat === "aac" ? "audio/aac" : + responseFormat === "flac" ? "audio/flac" : + responseFormat === "wav" ? "audio/wav" : + responseFormat === "pcm" ? "audio/pcm" : "audio/mpeg" + } else if (providerId === "groq") { + const model = options?.model || config.groqTtsModel || "canopylabs/orpheus-v1-english" + const defaultVoice = model === "canopylabs/orpheus-arabic-saudi" ? 
"fahad" : "troy" + const voice = options?.voice || config.groqTtsVoice || defaultVoice + + const baseUrl = config.groqBaseUrl || "https://api.groq.com/openai/v1" + const apiKey = config.groqApiKey + + if (!apiKey) { + throw new Error("Groq API key is required for TTS") + } + + const response = await fetch(`${baseUrl}/audio/speech`, { + method: "POST", + headers: { + "Authorization": `Bearer ${apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ model, input: processedText, voice, response_format: "wav" }), + }) + + if (!response.ok) { + const errorText = await response.text() + if (errorText.includes("requires terms acceptance")) { + const modelParam = model === "canopylabs/orpheus-arabic-saudi" + ? "canopylabs%2Forpheus-arabic-saudi" + : "canopylabs%2Forpheus-v1-english" + throw new Error(`Groq TTS requires terms acceptance. Visit https://console.groq.com/playground?model=${modelParam}`) + } + throw new Error(`Groq TTS API error: ${response.statusText} - ${errorText}`) + } + + audioBuffer = await response.arrayBuffer() + contentType = "audio/wav" + } else if (providerId === "gemini") { + const model = options?.model || config.geminiTtsModel || "gemini-2.5-flash-preview-tts" + const voice = options?.voice || config.geminiTtsVoice || "Kore" + + const baseUrl = config.geminiBaseUrl || "https://generativelanguage.googleapis.com" + const apiKey = config.geminiApiKey + + if (!apiKey) { + throw new Error("Gemini API key is required for TTS") + } + + const response = await fetch(`${baseUrl}/v1beta/models/${model}:generateContent?key=${apiKey}`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + contents: [{ parts: [{ text: processedText }] }], + generationConfig: { + responseModalities: ["AUDIO"], + speechConfig: { voiceConfig: { prebuiltVoiceConfig: { voiceName: voice } } } + } + }), + }) + + if (!response.ok) { + const errorText = await response.text() + throw new Error(`Gemini TTS API error: 
${response.statusText} - ${errorText}`) + } + + const result = await response.json() + const audioData = result.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data + + if (!audioData) { + throw new Error("No audio data received from Gemini TTS API") + } + + // Convert base64 to ArrayBuffer + const binaryString = atob(audioData) + const bytes = new Uint8Array(binaryString.length) + for (let i = 0; i < binaryString.length; i++) { + bytes[i] = binaryString.charCodeAt(i) + } + audioBuffer = bytes.buffer + contentType = "audio/wav" + } else { + throw new Error(`Unsupported TTS provider: ${providerId}`) + } + + return { audio: audioBuffer, contentType } +} + interface RunAgentOptions { prompt: string conversationId?: string @@ -402,6 +639,14 @@ export async function startRemoteServer() { strictPreflight: false, // Don't be strict about preflight requests }) + // Configure multipart for audio file uploads + await fastify.register(multipart, { + limits: { + fileSize: 25 * 1024 * 1024, // 25MB max (OpenAI limit) + files: 1, + }, + }) + // Auth hook (skip for OPTIONS preflight requests) fastify.addHook("onRequest", async (req, reply) => { // Skip auth for OPTIONS requests (CORS preflight) @@ -549,6 +794,246 @@ export async function startRemoteServer() { } }) + // ============================================ + // Audio Endpoints (OpenAI-compatible) + // ============================================ + + // POST /v1/audio/transcriptions - Transcribe audio to text (OpenAI-compatible) + fastify.post("/v1/audio/transcriptions", async (req, reply) => { + try { + const data = await req.file() + if (!data) { + return reply.code(400).send({ error: "No audio file provided" }) + } + + const audioBuffer = await data.toBuffer() + const filename = data.filename || "audio.webm" + + // Extract optional parameters from multipart fields + const fields = data.fields as Record<string, any> + const model = fields?.model?.value + const language = fields?.language?.value + const prompt = fields?.prompt?.value
+ const responseFormat = fields?.response_format?.value || "json" + + diagnosticsService.logInfo("remote-server", `Transcribing audio: ${filename} (${audioBuffer.length} bytes)`) + + const result = await transcribeAudio(audioBuffer, filename, { + model, + language, + prompt, + response_format: responseFormat, + }) + + diagnosticsService.logInfo("remote-server", `Transcription complete: ${result.text.length} chars`) + + // Return in OpenAI-compatible format + if (responseFormat === "text") { + return reply.type("text/plain").send(result.text) + } else if (responseFormat === "verbose_json") { + return reply.send({ + task: "transcribe", + language: language || "en", + duration: 0, // We don't have this info + text: result.text, + }) + } + // Default JSON format + return reply.send({ text: result.text }) + } catch (error: any) { + diagnosticsService.logError("remote-server", "Transcription failed", error) + return reply.code(500).send({ error: error?.message || "Transcription failed" }) + } + }) + + // POST /v1/audio/speech - Generate speech from text (OpenAI-compatible) + fastify.post("/v1/audio/speech", async (req, reply) => { + try { + const body = req.body as any + const input = body?.input + + if (!input || typeof input !== "string") { + return reply.code(400).send({ error: "Missing or invalid 'input' text" }) + } + + const model = body?.model + const voice = body?.voice + const speed = body?.speed + const responseFormat = body?.response_format + + diagnosticsService.logInfo("remote-server", `Generating speech: ${input.length} chars`) + + const result = await generateSpeechAudio(input, { + model, + voice, + speed, + response_format: responseFormat, + }) + + diagnosticsService.logInfo("remote-server", `Speech generated: ${result.audio.byteLength} bytes`) + + // Return audio binary directly + return reply + .type(result.contentType) + .send(Buffer.from(result.audio)) + } catch (error: any) { + diagnosticsService.logError("remote-server", "Speech generation failed",
error) + return reply.code(500).send({ error: error?.message || "Speech generation failed" }) + } + }) + + // POST /v1/audio/chat - Combined: transcribe audio, run agent, return text + optional audio + // This is a custom endpoint that combines STT -> Agent -> TTS in one call + fastify.post("/v1/audio/chat", async (req, reply) => { + try { + const data = await req.file() + if (!data) { + return reply.code(400).send({ error: "No audio file provided" }) + } + + const audioBuffer = await data.toBuffer() + const filename = data.filename || "audio.webm" + + // Extract optional parameters from multipart fields + const fields = data.fields as Record<string, any> + const conversationId = fields?.conversation_id?.value + const returnAudio = fields?.return_audio?.value === "true" || fields?.return_audio?.value === true + const sttModel = fields?.stt_model?.value + const sttLanguage = fields?.language?.value + const ttsModel = fields?.tts_model?.value + const ttsVoice = fields?.voice?.value + const isStreaming = fields?.stream?.value === "true" || fields?.stream?.value === true + + diagnosticsService.logInfo("remote-server", `Audio chat: ${filename} (${audioBuffer.length} bytes), returnAudio=${returnAudio}`) + + // Step 1: Transcribe audio + const transcription = await transcribeAudio(audioBuffer, filename, { + model: sttModel, + language: sttLanguage, + }) + + if (!transcription.text.trim()) { + return reply.code(400).send({ error: "Could not transcribe audio - no speech detected" }) + } + + diagnosticsService.logInfo("remote-server", `Transcribed: "${transcription.text.substring(0, 100)}..."`) + + // Step 2: Run agent with transcribed text + if (isStreaming) { + // SSE streaming mode + const requestOrigin = req.headers.origin || "*" + reply.raw.writeHead(200, { + "Content-Type": "text/event-stream", + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "Access-Control-Allow-Origin": requestOrigin, + "Access-Control-Allow-Credentials": "true", + }) + + const writeSSE =
(eventData: object) => { + reply.raw.write(`data: ${JSON.stringify(eventData)}\n\n`) + } + + // Send transcription event + writeSSE({ type: "transcription", data: { text: transcription.text } }) + + const onProgress = (update: AgentProgressUpdate) => { + writeSSE({ type: "progress", data: update }) + } + + try { + const agentResult = await runAgent({ + prompt: transcription.text, + conversationId, + onProgress, + }) + + recordHistory(agentResult.content) + const model = resolveActiveModelId(configStore.get()) + + // Optionally generate audio response + let audioBase64: string | undefined + let audioContentType: string | undefined + + if (returnAudio) { + try { + const audioResult = await generateSpeechAudio(agentResult.content, { + model: ttsModel, + voice: ttsVoice, + }) + audioBase64 = Buffer.from(audioResult.audio).toString("base64") + audioContentType = audioResult.contentType + } catch (ttsError: any) { + diagnosticsService.logWarning("remote-server", "TTS generation failed in streaming mode", ttsError) + } + } + + writeSSE({ + type: "done", + data: { + transcription: transcription.text, + content: agentResult.content, + conversation_id: agentResult.conversationId, + conversation_history: agentResult.conversationHistory, + model, + audio: audioBase64, + audio_content_type: audioContentType, + }, + }) + } catch (agentError: any) { + writeSSE({ + type: "error", + data: { message: agentError?.message || "Agent processing failed" }, + }) + } finally { + reply.raw.end() + } + + return reply + } + + // Non-streaming mode + const agentResult = await runAgent({ + prompt: transcription.text, + conversationId, + }) + + recordHistory(agentResult.content) + const model = resolveActiveModelId(configStore.get()) + + // Optionally generate audio response + let audioBase64: string | undefined + let audioContentType: string | undefined + + if (returnAudio) { + try { + const audioResult = await generateSpeechAudio(agentResult.content, { + model: ttsModel, + voice: ttsVoice, + 
}) + audioBase64 = Buffer.from(audioResult.audio).toString("base64") + audioContentType = audioResult.contentType + } catch (ttsError: any) { + diagnosticsService.logWarning("remote-server", "TTS generation failed", ttsError) + // Continue without audio - don't fail the whole request + } + } + + return reply.send({ + transcription: transcription.text, + content: agentResult.content, + conversation_id: agentResult.conversationId, + conversation_history: agentResult.conversationHistory, + model, + audio: audioBase64, + audio_content_type: audioContentType, + }) + } catch (error: any) { + diagnosticsService.logError("remote-server", "Audio chat failed", error) + return reply.code(500).send({ error: error?.message || "Audio chat failed" }) + } + }) + // ============================================ // Settings Management Endpoints (for mobile app) // ============================================ diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 36671a1bc..5c8cbc7e2 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -25,6 +25,9 @@ importers: '@fastify/cors': specifier: ^11.1.0 version: 11.2.0 + '@fastify/multipart': + specifier: ^9.3.0 + version: 9.3.0 '@modelcontextprotocol/sdk': specifier: ^1.24.3 version: 1.25.1(hono@4.11.1)(zod@3.25.76) @@ -1347,9 +1350,15 @@ packages: '@fastify/ajv-compiler@4.0.5': resolution: {integrity: sha512-KoWKW+MhvfTRWL4qrhUwAAZoaChluo0m0vbiJlGMt2GXvL4LVPQEjt8kSpHI3IBq5Rez8fg+XeH3cneztq+C7A==} + '@fastify/busboy@3.2.0': + resolution: {integrity: sha512-m9FVDXU3GT2ITSe0UaMA5rU3QkfC/UXtCU8y0gSN/GugTqtVldOBWIB5V6V3sbmenVZUIpU6f+mPEO2+m5iTaA==} + '@fastify/cors@11.2.0': resolution: {integrity: sha512-LbLHBuSAdGdSFZYTLVA3+Ch2t+sA6nq3Ejc6XLAKiQ6ViS2qFnvicpj0htsx03FyYeLs04HfRNBsz/a8SvbcUw==} + '@fastify/deepmerge@3.1.0': + resolution: {integrity: sha512-lCVONBQINyNhM6LLezB6+2afusgEYR4G8xenMsfe+AT+iZ7Ca6upM5Ha8UkZuYSnuMw3GWl/BiPXnLMi/gSxuQ==} + '@fastify/error@4.2.0': resolution: {integrity: 
sha512-RSo3sVDXfHskiBZKBPRgnQTtIqpi/7zhJOEmAxCiBcM7d0uwdGdxLlsCaLzGs8v8NnxIRlfG0N51p5yFaOentQ==} @@ -1362,6 +1371,9 @@ packages: '@fastify/merge-json-schemas@0.2.1': resolution: {integrity: sha512-OA3KGBCy6KtIvLf8DINC5880o5iBlDX4SxzLQS8HorJAbqluzLRn80UXU0bxZn7UOFhFgpRJDasfwn9nG4FG4A==} + '@fastify/multipart@9.3.0': + resolution: {integrity: sha512-NpeKipTOjjL1dA7SSlRMrOWWtrE8/0yKOmeudkdQoEaz4sVDJw5MVdZIahsWhvpc3YTN7f04f9ep/Y65RKoOWA==} + '@fastify/proxy-addr@5.1.0': resolution: {integrity: sha512-INS+6gh91cLUjB+PVHfu1UqcB76Sqtpyp7bnL+FYojhjygvOPA9ctiD/JDKsyD9Xgu4hUhCSJBPig/w7duNajw==} @@ -8083,11 +8095,15 @@ snapshots: ajv-formats: 3.0.1(ajv@8.17.1) fast-uri: 3.1.0 + '@fastify/busboy@3.2.0': {} + '@fastify/cors@11.2.0': dependencies: fastify-plugin: 5.1.0 toad-cache: 3.7.0 + '@fastify/deepmerge@3.1.0': {} + '@fastify/error@4.2.0': {} '@fastify/fast-json-stringify-compiler@5.0.3': @@ -8100,6 +8116,14 @@ snapshots: dependencies: dequal: 2.0.3 + '@fastify/multipart@9.3.0': + dependencies: + '@fastify/busboy': 3.2.0 + '@fastify/deepmerge': 3.1.0 + '@fastify/error': 4.2.0 + fastify-plugin: 5.1.0 + secure-json-parse: 4.1.0 + '@fastify/proxy-addr@5.1.0': dependencies: '@fastify/forwarded': 3.0.1