diff --git a/apps/desktop/package.json b/apps/desktop/package.json index ed30b52b4..28e9a406a 100644 --- a/apps/desktop/package.json +++ b/apps/desktop/package.json @@ -45,6 +45,7 @@ "@ai-sdk/openai": "^3.0.1", "@egoist/electron-panel-window": "^8.0.3", "@fastify/cors": "^11.1.0", + "@fastify/multipart": "^9.3.0", "@modelcontextprotocol/sdk": "^1.24.3", "@radix-ui/react-dropdown-menu": "^2.1.16", "@radix-ui/react-scroll-area": "^1.2.9", diff --git a/apps/desktop/src/main/remote-server.ts b/apps/desktop/src/main/remote-server.ts index d7327cd4a..3c3d9696d 100644 --- a/apps/desktop/src/main/remote-server.ts +++ b/apps/desktop/src/main/remote-server.ts @@ -1,9 +1,10 @@ import Fastify, { FastifyInstance } from "fastify" import cors from "@fastify/cors" +import multipart from "@fastify/multipart" import crypto from "crypto" import fs from "fs" import path from "path" -import { configStore, recordingsFolder } from "./config" +import { configStore, recordingsFolder, Config } from "./config" import { diagnosticsService } from "./diagnostics" import { mcpService, MCPToolResult } from "./mcp-service" import { processTranscriptWithAgentMode } from "./llm" @@ -13,6 +14,7 @@ import { AgentProgressUpdate, SessionProfileSnapshot } from "../shared/types" import { agentSessionTracker } from "./agent-session-tracker" import { emergencyStopAll } from "./emergency-stop" import { profileService } from "./profile-service" +import { preprocessTextForTTS, validateTTSText } from "@speakmcp/shared" let server: FastifyInstance | null = null let lastError: string | undefined @@ -96,6 +98,241 @@ function extractUserPrompt(body: any): string | null { } } +// ============================================ +// Audio Processing Helper Functions +// ============================================ + +/** + * Transcribe audio using configured STT provider (OpenAI or Groq Whisper) + */ +async function transcribeAudio( + audioBuffer: Buffer, + filename: string, + options?: { + model?: string + language?: 
string + prompt?: string + response_format?: string + } +): Promise<{ text: string }> { + const config = configStore.get() + + // Determine mime type from filename + const ext = path.extname(filename).toLowerCase() + const mimeTypes: Record<string, string> = { + ".mp3": "audio/mpeg", + ".mp4": "audio/mp4", + ".m4a": "audio/mp4", + ".wav": "audio/wav", + ".webm": "audio/webm", + ".ogg": "audio/ogg", + ".flac": "audio/flac", + } + const mimeType = mimeTypes[ext] || "audio/webm" + + const form = new FormData() + form.append("file", new File([audioBuffer], filename, { type: mimeType })) + + // Use specified model or default based on provider + const model = options?.model || + (config.sttProviderId === "groq" ? "whisper-large-v3-turbo" : "whisper-1") + form.append("model", model) + form.append("response_format", options?.response_format || "json") + + // Add prompt if provided (for Groq) + if (options?.prompt?.trim()) { + form.append("prompt", options.prompt.trim()) + } else if (config.sttProviderId === "groq" && config.groqSttPrompt?.trim()) { + form.append("prompt", config.groqSttPrompt.trim()) + } + + // Add language if specified + const languageCode = options?.language || + (config.sttProviderId === "groq" + ? config.groqSttLanguage || config.sttLanguage + : config.openaiSttLanguage || config.sttLanguage) + + if (languageCode && languageCode !== "auto") { + form.append("language", languageCode) + } + + const groqBaseUrl = config.groqBaseUrl || "https://api.groq.com/openai/v1" + const openaiBaseUrl = config.openaiBaseUrl || "https://api.openai.com/v1" + + const transcriptResponse = await fetch( + config.sttProviderId === "groq" + ? `${groqBaseUrl}/audio/transcriptions` + : `${openaiBaseUrl}/audio/transcriptions`, + { + method: "POST", + headers: { + Authorization: `Bearer ${config.sttProviderId === "groq" ?
config.groqApiKey : config.openaiApiKey}`, + }, + body: form, + } + ) + + if (!transcriptResponse.ok) { + const errorText = await transcriptResponse.text() + throw new Error(`Transcription failed: ${transcriptResponse.statusText} - ${errorText.slice(0, 300)}`) + } + + const result = await transcriptResponse.json() + return { text: result.text || "" } +} + +/** + * Generate speech audio from text using configured TTS provider + */ +async function generateSpeechAudio( + text: string, + options?: { + model?: string + voice?: string + speed?: number + response_format?: string + } +): Promise<{ audio: ArrayBuffer; contentType: string }> { + const config = configStore.get() + const providerId = config.ttsProviderId || "openai" + + // Preprocess text for TTS + let processedText = text + if (config.ttsPreprocessingEnabled !== false) { + const preprocessingOptions = { + removeCodeBlocks: config.ttsRemoveCodeBlocks ?? true, + removeUrls: config.ttsRemoveUrls ?? true, + convertMarkdown: config.ttsConvertMarkdown ?? 
true, + } + processedText = preprocessTextForTTS(text, preprocessingOptions) + } + + // Validate processed text + const validation = validateTTSText(processedText) + if (!validation.isValid) { + throw new Error(`TTS validation failed: ${validation.issues.join(", ")}`) + } + + let audioBuffer: ArrayBuffer + let contentType: string + + if (providerId === "openai") { + const model = options?.model || config.openaiTtsModel || "tts-1" + const voice = options?.voice || config.openaiTtsVoice || "alloy" + const speed = options?.speed || config.openaiTtsSpeed || 1.0 + const responseFormat = options?.response_format || config.openaiTtsResponseFormat || "mp3" + + const baseUrl = config.openaiBaseUrl || "https://api.openai.com/v1" + const apiKey = config.openaiApiKey + + if (!apiKey) { + throw new Error("OpenAI API key is required for TTS") + } + + const response = await fetch(`${baseUrl}/audio/speech`, { + method: "POST", + headers: { + "Authorization": `Bearer ${apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ model, input: processedText, voice, speed, response_format: responseFormat }), + }) + + if (!response.ok) { + const errorText = await response.text() + throw new Error(`OpenAI TTS API error: ${response.statusText} - ${errorText}`) + } + + audioBuffer = await response.arrayBuffer() + contentType = responseFormat === "opus" ? "audio/opus" : + responseFormat === "aac" ? "audio/aac" : + responseFormat === "flac" ? "audio/flac" : + responseFormat === "wav" ? "audio/wav" : + responseFormat === "pcm" ? "audio/pcm" : "audio/mpeg" + } else if (providerId === "groq") { + const model = options?.model || config.groqTtsModel || "canopylabs/orpheus-v1-english" + const defaultVoice = model === "canopylabs/orpheus-arabic-saudi" ? 
"fahad" : "troy" + const voice = options?.voice || config.groqTtsVoice || defaultVoice + + const baseUrl = config.groqBaseUrl || "https://api.groq.com/openai/v1" + const apiKey = config.groqApiKey + + if (!apiKey) { + throw new Error("Groq API key is required for TTS") + } + + const response = await fetch(`${baseUrl}/audio/speech`, { + method: "POST", + headers: { + "Authorization": `Bearer ${apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ model, input: processedText, voice, response_format: "wav" }), + }) + + if (!response.ok) { + const errorText = await response.text() + if (errorText.includes("requires terms acceptance")) { + const modelParam = model === "canopylabs/orpheus-arabic-saudi" + ? "canopylabs%2Forpheus-arabic-saudi" + : "canopylabs%2Forpheus-v1-english" + throw new Error(`Groq TTS requires terms acceptance. Visit https://console.groq.com/playground?model=${modelParam}`) + } + throw new Error(`Groq TTS API error: ${response.statusText} - ${errorText}`) + } + + audioBuffer = await response.arrayBuffer() + contentType = "audio/wav" + } else if (providerId === "gemini") { + const model = options?.model || config.geminiTtsModel || "gemini-2.5-flash-preview-tts" + const voice = options?.voice || config.geminiTtsVoice || "Kore" + + const baseUrl = config.geminiBaseUrl || "https://generativelanguage.googleapis.com" + const apiKey = config.geminiApiKey + + if (!apiKey) { + throw new Error("Gemini API key is required for TTS") + } + + const response = await fetch(`${baseUrl}/v1beta/models/${model}:generateContent?key=${apiKey}`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + contents: [{ parts: [{ text: processedText }] }], + generationConfig: { + responseModalities: ["AUDIO"], + speechConfig: { voiceConfig: { prebuiltVoiceConfig: { voiceName: voice } } } + } + }), + }) + + if (!response.ok) { + const errorText = await response.text() + throw new Error(`Gemini TTS API error: 
${response.statusText} - ${errorText}`) + } + + const result = await response.json() + const audioData = result.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data + + if (!audioData) { + throw new Error("No audio data received from Gemini TTS API") + } + + // Convert base64 to ArrayBuffer + const binaryString = atob(audioData) + const bytes = new Uint8Array(binaryString.length) + for (let i = 0; i < binaryString.length; i++) { + bytes[i] = binaryString.charCodeAt(i) + } + audioBuffer = bytes.buffer + contentType = "audio/wav" + } else { + throw new Error(`Unsupported TTS provider: ${providerId}`) + } + + return { audio: audioBuffer, contentType } +} + interface RunAgentOptions { prompt: string conversationId?: string @@ -402,6 +639,14 @@ export async function startRemoteServer() { strictPreflight: false, // Don't be strict about preflight requests }) + // Configure multipart for audio file uploads + await fastify.register(multipart, { + limits: { + fileSize: 25 * 1024 * 1024, // 25MB max (OpenAI limit) + files: 1, + }, + }) + // Auth hook (skip for OPTIONS preflight requests) fastify.addHook("onRequest", async (req, reply) => { // Skip auth for OPTIONS requests (CORS preflight) @@ -549,6 +794,246 @@ export async function startRemoteServer() { } }) + // ============================================ + // Audio Endpoints (OpenAI-compatible) + // ============================================ + + // POST /v1/audio/transcriptions - Transcribe audio to text (OpenAI-compatible) + fastify.post("/v1/audio/transcriptions", async (req, reply) => { + try { + const data = await req.file() + if (!data) { + return reply.code(400).send({ error: "No audio file provided" }) + } + + const audioBuffer = await data.toBuffer() + const filename = data.filename || "audio.webm" + + // Extract optional parameters from multipart fields + const fields = data.fields as Record<string, any> + const model = fields?.model?.value + const language = fields?.language?.value + const prompt = fields?.prompt?.value
+ const responseFormat = fields?.response_format?.value || "json" + + diagnosticsService.logInfo("remote-server", `Transcribing audio: ${filename} (${audioBuffer.length} bytes)`) + + const result = await transcribeAudio(audioBuffer, filename, { + model, + language, + prompt, + response_format: responseFormat, + }) + + diagnosticsService.logInfo("remote-server", `Transcription complete: ${result.text.length} chars`) + + // Return in OpenAI-compatible format + if (responseFormat === "text") { + return reply.type("text/plain").send(result.text) + } else if (responseFormat === "verbose_json") { + return reply.send({ + task: "transcribe", + language: language || "en", + duration: 0, // We don't have this info + text: result.text, + }) + } + // Default JSON format + return reply.send({ text: result.text }) + } catch (error: any) { + diagnosticsService.logError("remote-server", "Transcription failed", error) + return reply.code(500).send({ error: error?.message || "Transcription failed" }) + } + }) + + // POST /v1/audio/speech - Generate speech from text (OpenAI-compatible) + fastify.post("/v1/audio/speech", async (req, reply) => { + try { + const body = req.body as any + const input = body?.input + + if (!input || typeof input !== "string") { + return reply.code(400).send({ error: "Missing or invalid 'input' text" }) + } + + const model = body?.model + const voice = body?.voice + const speed = body?.speed + const responseFormat = body?.response_format + + diagnosticsService.logInfo("remote-server", `Generating speech: ${input.length} chars`) + + const result = await generateSpeechAudio(input, { + model, + voice, + speed, + response_format: responseFormat, + }) + + diagnosticsService.logInfo("remote-server", `Speech generated: ${result.audio.byteLength} bytes`) + + // Return audio binary directly + return reply + .type(result.contentType) + .send(Buffer.from(result.audio)) + } catch (error: any) { + diagnosticsService.logError("remote-server", "Speech generation failed",
error) + return reply.code(500).send({ error: error?.message || "Speech generation failed" }) + } + }) + + // POST /v1/audio/chat - Combined: transcribe audio, run agent, return text + optional audio + // This is a custom endpoint that combines STT -> Agent -> TTS in one call + fastify.post("/v1/audio/chat", async (req, reply) => { + try { + const data = await req.file() + if (!data) { + return reply.code(400).send({ error: "No audio file provided" }) + } + + const audioBuffer = await data.toBuffer() + const filename = data.filename || "audio.webm" + + // Extract optional parameters from multipart fields + const fields = data.fields as Record<string, any> + const conversationId = fields?.conversation_id?.value + const returnAudio = fields?.return_audio?.value === "true" || fields?.return_audio?.value === true + const sttModel = fields?.stt_model?.value + const sttLanguage = fields?.language?.value + const ttsModel = fields?.tts_model?.value + const ttsVoice = fields?.voice?.value + const isStreaming = fields?.stream?.value === "true" || fields?.stream?.value === true + + diagnosticsService.logInfo("remote-server", `Audio chat: ${filename} (${audioBuffer.length} bytes), returnAudio=${returnAudio}`) + + // Step 1: Transcribe audio + const transcription = await transcribeAudio(audioBuffer, filename, { + model: sttModel, + language: sttLanguage, + }) + + if (!transcription.text.trim()) { + return reply.code(400).send({ error: "Could not transcribe audio - no speech detected" }) + } + + diagnosticsService.logInfo("remote-server", `Transcribed: "${transcription.text.substring(0, 100)}..."`) + + // Step 2: Run agent with transcribed text + if (isStreaming) { + // SSE streaming mode + const requestOrigin = req.headers.origin || "*" + reply.raw.writeHead(200, { + "Content-Type": "text/event-stream", + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "Access-Control-Allow-Origin": requestOrigin, + "Access-Control-Allow-Credentials": "true", + }) + + const writeSSE =
(eventData: object) => { + reply.raw.write(`data: ${JSON.stringify(eventData)}\n\n`) + } + + // Send transcription event + writeSSE({ type: "transcription", data: { text: transcription.text } }) + + const onProgress = (update: AgentProgressUpdate) => { + writeSSE({ type: "progress", data: update }) + } + + try { + const agentResult = await runAgent({ + prompt: transcription.text, + conversationId, + onProgress, + }) + + recordHistory(agentResult.content) + const model = resolveActiveModelId(configStore.get()) + + // Optionally generate audio response + let audioBase64: string | undefined + let audioContentType: string | undefined + + if (returnAudio) { + try { + const audioResult = await generateSpeechAudio(agentResult.content, { + model: ttsModel, + voice: ttsVoice, + }) + audioBase64 = Buffer.from(audioResult.audio).toString("base64") + audioContentType = audioResult.contentType + } catch (ttsError: any) { + diagnosticsService.logWarning("remote-server", "TTS generation failed in streaming mode", ttsError) + } + } + + writeSSE({ + type: "done", + data: { + transcription: transcription.text, + content: agentResult.content, + conversation_id: agentResult.conversationId, + conversation_history: agentResult.conversationHistory, + model, + audio: audioBase64, + audio_content_type: audioContentType, + }, + }) + } catch (agentError: any) { + writeSSE({ + type: "error", + data: { message: agentError?.message || "Agent processing failed" }, + }) + } finally { + reply.raw.end() + } + + return reply + } + + // Non-streaming mode + const agentResult = await runAgent({ + prompt: transcription.text, + conversationId, + }) + + recordHistory(agentResult.content) + const model = resolveActiveModelId(configStore.get()) + + // Optionally generate audio response + let audioBase64: string | undefined + let audioContentType: string | undefined + + if (returnAudio) { + try { + const audioResult = await generateSpeechAudio(agentResult.content, { + model: ttsModel, + voice: ttsVoice, + 
}) + audioBase64 = Buffer.from(audioResult.audio).toString("base64") + audioContentType = audioResult.contentType + } catch (ttsError: any) { + diagnosticsService.logWarning("remote-server", "TTS generation failed", ttsError) + // Continue without audio - don't fail the whole request + } + } + + return reply.send({ + transcription: transcription.text, + content: agentResult.content, + conversation_id: agentResult.conversationId, + conversation_history: agentResult.conversationHistory, + model, + audio: audioBase64, + audio_content_type: audioContentType, + }) + } catch (error: any) { + diagnosticsService.logError("remote-server", "Audio chat failed", error) + return reply.code(500).send({ error: error?.message || "Audio chat failed" }) + } + }) + // ============================================ // Settings Management Endpoints (for mobile app) // ============================================ diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 36671a1bc..5c8cbc7e2 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -25,6 +25,9 @@ importers: '@fastify/cors': specifier: ^11.1.0 version: 11.2.0 + '@fastify/multipart': + specifier: ^9.3.0 + version: 9.3.0 '@modelcontextprotocol/sdk': specifier: ^1.24.3 version: 1.25.1(hono@4.11.1)(zod@3.25.76) @@ -1347,9 +1350,15 @@ packages: '@fastify/ajv-compiler@4.0.5': resolution: {integrity: sha512-KoWKW+MhvfTRWL4qrhUwAAZoaChluo0m0vbiJlGMt2GXvL4LVPQEjt8kSpHI3IBq5Rez8fg+XeH3cneztq+C7A==} + '@fastify/busboy@3.2.0': + resolution: {integrity: sha512-m9FVDXU3GT2ITSe0UaMA5rU3QkfC/UXtCU8y0gSN/GugTqtVldOBWIB5V6V3sbmenVZUIpU6f+mPEO2+m5iTaA==} + '@fastify/cors@11.2.0': resolution: {integrity: sha512-LbLHBuSAdGdSFZYTLVA3+Ch2t+sA6nq3Ejc6XLAKiQ6ViS2qFnvicpj0htsx03FyYeLs04HfRNBsz/a8SvbcUw==} + '@fastify/deepmerge@3.1.0': + resolution: {integrity: sha512-lCVONBQINyNhM6LLezB6+2afusgEYR4G8xenMsfe+AT+iZ7Ca6upM5Ha8UkZuYSnuMw3GWl/BiPXnLMi/gSxuQ==} + '@fastify/error@4.2.0': resolution: {integrity: 
sha512-RSo3sVDXfHskiBZKBPRgnQTtIqpi/7zhJOEmAxCiBcM7d0uwdGdxLlsCaLzGs8v8NnxIRlfG0N51p5yFaOentQ==} @@ -1362,6 +1371,9 @@ packages: '@fastify/merge-json-schemas@0.2.1': resolution: {integrity: sha512-OA3KGBCy6KtIvLf8DINC5880o5iBlDX4SxzLQS8HorJAbqluzLRn80UXU0bxZn7UOFhFgpRJDasfwn9nG4FG4A==} + '@fastify/multipart@9.3.0': + resolution: {integrity: sha512-NpeKipTOjjL1dA7SSlRMrOWWtrE8/0yKOmeudkdQoEaz4sVDJw5MVdZIahsWhvpc3YTN7f04f9ep/Y65RKoOWA==} + '@fastify/proxy-addr@5.1.0': resolution: {integrity: sha512-INS+6gh91cLUjB+PVHfu1UqcB76Sqtpyp7bnL+FYojhjygvOPA9ctiD/JDKsyD9Xgu4hUhCSJBPig/w7duNajw==} @@ -8083,11 +8095,15 @@ snapshots: ajv-formats: 3.0.1(ajv@8.17.1) fast-uri: 3.1.0 + '@fastify/busboy@3.2.0': {} + '@fastify/cors@11.2.0': dependencies: fastify-plugin: 5.1.0 toad-cache: 3.7.0 + '@fastify/deepmerge@3.1.0': {} + '@fastify/error@4.2.0': {} '@fastify/fast-json-stringify-compiler@5.0.3': @@ -8100,6 +8116,14 @@ snapshots: dependencies: dequal: 2.0.3 + '@fastify/multipart@9.3.0': + dependencies: + '@fastify/busboy': 3.2.0 + '@fastify/deepmerge': 3.1.0 + '@fastify/error': 4.2.0 + fastify-plugin: 5.1.0 + secure-json-parse: 4.1.0 + '@fastify/proxy-addr@5.1.0': dependencies: '@fastify/forwarded': 3.0.1