aj47 · aj47 · Oct 31, 2025 · Oct 31, 2025 · Dec 20, 2025 · Dec 20, 2025
diff --git a/apps/desktop/speakmcp-rs/Cargo.lock b/apps/desktop/speakmcp-rs/Cargo.lock
diff --git a/apps/desktop/src/main/config.ts b/apps/desktop/src/main/config.ts
@@ -29,6 +29,8 @@ const getConfig = () => {
     mcpAutoPasteDelay: 1000, // 1 second delay by default
     mcpMaxIterations: 10, // Default max iterations for agent mode
     textInputEnabled: true,
+    alwaysIncludeScreenshot: false,
+    screenshotForVoiceCommands: false, // Auto-capture screenshot when using voice commands (MCP/agent mode)
 
     // Text input: On Windows, use Ctrl+Shift+T to avoid browser new tab conflict
     textInputShortcut: isWindows ? "ctrl-shift-t" : "ctrl-t",

diff --git a/apps/desktop/src/main/context-budget.ts b/apps/desktop/src/main/context-budget.ts
@@ -4,7 +4,28 @@ import { makeTextCompletionWithFetch } from "./llm-fetch"
 import { constructMinimalSystemPrompt } from "./system-prompts"
 import { agentSessionStateManager } from "./state"
 
-export type LLMMessage = { role: string; content: string }
+export type LLMMessage = { role: string; content: string | any[] }
+
+// Character count of message content: the string length, or the combined length of the text parts of a multimodal array
+function getContentLength(content: string | any[] | undefined): number {
+  if (typeof content === 'string') return content.length
+  if (!content) return 0
+  // Multimodal array: only `text` parts are measured; every other part type contributes nothing
+  let total = 0
+  content.forEach((part: any) => { if (part.type === 'text') total += part.text?.length || 0 })
+  return total
+}
+
+// Flattens message content to plain text for summarization.
+// Strings pass through unchanged; empty content becomes ''; multimodal arrays keep only their text parts.
+function getContentAsString(content: string | any[] | undefined): string {
+  if (typeof content === 'string') return content
+  if (!content) return ''
+  // Multimodal array: collect each `text` part (a missing text becomes '') and put one part per line
+  const texts: any[] = []
+  content.forEach((part: any) => { if (part.type === 'text') texts.push(part.text || '') })
+  return texts.join('\n')
+}
 
 // Simple in-memory cache for provider/model context windows
 const contextWindowCache = new Map<string, number>()
@@ -77,7 +98,7 @@ async function getMaxContextTokens(providerId: string, model: string): Promise<n
 
 function estimateTokensFromMessages(messages: LLMMessage[]): number {
   // Rough estimate: 4 chars ≈ 1 token
-  const totalChars = messages.reduce((sum, m) => sum + (m.content?.length || 0), 0)
+  const totalChars = messages.reduce((sum, m) => sum + getContentLength(m.content), 0)
   return Math.ceil(totalChars / 4)
 }
 
@@ -196,14 +217,15 @@ export async function shrinkMessagesForLLM(opts: ShrinkOptions): Promise<{ messa
   const AGGRESSIVE_TRUNCATE_THRESHOLD = 5000
   for (let i = 0; i < messages.length; i++) {
     const msg = messages[i]
-    if (msg.role === "user" && msg.content && msg.content.length > AGGRESSIVE_TRUNCATE_THRESHOLD) {
+    if (msg.role === "user" && getContentLength(msg.content) > AGGRESSIVE_TRUNCATE_THRESHOLD) {
       // Check if this looks like a tool result (contains JSON arrays/objects)
-      if (msg.content.includes('"url":') || msg.content.includes('"id":')) {
+      const contentStr = getContentAsString(msg.content)
+      if (contentStr.includes('"url":') || contentStr.includes('"id":')) {
         // Truncate aggressively and add note
         messages[i] = {
           ...msg,
-          content: msg.content.substring(0, AGGRESSIVE_TRUNCATE_THRESHOLD) +
-                   '\n\n... (truncated ' + (msg.content.length - AGGRESSIVE_TRUNCATE_THRESHOLD) +
+          content: contentStr.substring(0, AGGRESSIVE_TRUNCATE_THRESHOLD) +
+                   '\n\n... (truncated ' + (contentStr.length - AGGRESSIVE_TRUNCATE_THRESHOLD) +
                    ' characters for context management. Key information preserved above.)'
         }
         applied.push("aggressive_truncate")
@@ -218,7 +240,7 @@ export async function shrinkMessagesForLLM(opts: ShrinkOptions): Promise<{ messa
 
   // Tier 1: Summarize large messages (prefer tool outputs or very long entries)
   const indicesByLength = messages
-    .map((m, i) => ({ i, len: m.content?.length || 0, role: m.role, content: m.content }))
+    .map((m, i) => ({ i, len: getContentLength(m.content), role: m.role, content: m.content }))
     .filter((x) => x.len > summarizeThreshold && x.role !== "system")
     .sort((a, b) => b.len - a.len)
 
@@ -234,15 +256,16 @@ export async function shrinkMessagesForLLM(opts: ShrinkOptions): Promise<{ messa
     // Emit progress update before summarization
     summarizedCount++
     if (opts.onSummarizationProgress) {
-      const messagePreview = item.content!.substring(0, 100).replace(/\n/g, ' ')
+      const contentStr = getContentAsString(item.content)
+      const messagePreview = contentStr.substring(0, 100).replace(/\n/g, ' ')
       opts.onSummarizationProgress(
         summarizedCount,
         totalToSummarize,
         `Summarizing large message ${summarizedCount}/${totalToSummarize} (${item.len} chars): ${messagePreview}...`
       )
     }
 
-    const summarized = await summarizeContent(item.content!, opts.sessionId)
+    const summarized = await summarizeContent(getContentAsString(item.content), opts.sessionId)
     messages[item.i] = { ...messages[item.i], content: summarized }
     applied.push("summarize")
     tokens = estimateTokensFromMessages(messages)

diff --git a/apps/desktop/src/main/index.ts b/apps/desktop/src/main/index.ts
@@ -1,4 +1,4 @@
-import { app, Menu } from "electron"
+import { app, Menu, ipcMain, desktopCapturer } from "electron"
 import { electronApp, optimizer } from "@electron-toolkit/utils"
 import {
   createMainWindow,
@@ -42,6 +42,54 @@ app.whenReady().then(() => {
   registerIpcMain(router)
   logApp("IPC main registered")
 
+  // Register the 'getScreenSources' IPC handler; desktopCapturer.getSources is called here, in the main process.
+  // Resolves with plain-object source descriptors (images as data URLs); rejects if capture fails or macOS yields no sources.
+  ipcMain.handle('getScreenSources', async (_event, options?: { types?: ('screen' | 'window')[], thumbnailSize?: { width?: number, height?: number } }) => {
+    try {
+      // The options payload may be missing or malformed, so it is never dereferenced without optional chaining.
+      // Each dimension is coerced to a number; 0 and NaN fall back to the default, then the value is clamped to [100, 4096].
+      const clampDimension = (value: unknown, fallback: number): number =>
+        Math.min(Math.max(Number(value) || fallback, 100), 4096)
+
+      const validatedOptions = {
+        // Privacy: only whole-screen capture is permitted, so requested types are ignored and 'window' is never passed on
+        types: ['screen'] as ('screen' | 'window')[],
+        thumbnailSize: {
+          width: clampDimension(options?.thumbnailSize?.width, 1920),
+          height: clampDimension(options?.thumbnailSize?.height, 1080)
+        }
+      }
+
+      logApp('[getScreenSources] Capturing screen sources with validated options:', JSON.stringify(validatedOptions))
+      const sources = await desktopCapturer.getSources(validatedOptions)
+      logApp(`[getScreenSources] Got ${sources.length} sources`)
+
+      // On macOS, if Screen Recording permission is not granted, desktopCapturer returns an empty array
+      // This is a silent failure - no error is thrown - so it is converted into an explicit, actionable error
+      if (sources.length === 0 && process.platform === 'darwin') {
+        throw new Error('No screen sources available. Please grant Screen Recording permission in System Settings > Privacy & Security > Screen Recording, then restart the app.')
+      }
+
+      // Serialize the sources - NativeImage values (thumbnail, appIcon) are converted to data URL strings
+      const serialized = sources.map(source => {
+        const thumbnailDataUrl = source.thumbnail.toDataURL()
+        logApp(`[getScreenSources] Source: ${source.name}, thumbnail size: ${thumbnailDataUrl.length} chars`)
+        return {
+          id: source.id,
+          name: source.name,
+          thumbnail: thumbnailDataUrl,
+          display_id: source.display_id,
+          appIcon: source.appIcon ? source.appIcon.toDataURL() : null
+        }
+      })
+      return serialized
+    } catch (error) {
+      // Log in the main process, then rethrow so the failure is not swallowed by this handler
+      console.error('Failed to get screen sources:', error)
+      throw error
+    }
+  })
+
   registerServeProtocol()
 
 	  try {

diff --git a/apps/desktop/src/main/llm-fetch.ts b/apps/desktop/src/main/llm-fetch.ts
@@ -5,6 +5,38 @@ import { isDebugLLM, logLLM } from "./debug"
 import { state, llmRequestAbortManager, agentSessionStateManager } from "./state"
 import OpenAI from "openai"
 
+/**
+ * Builds a short, log-friendly preview of message content (plain string or multimodal part array).
+ * Empty content yields "(empty)"; text longer than maxLength is cut and suffixed with "..."; arrays holding an image_url part get an "[image] " prefix.
+ */
+function getContentPreview(content: string | any[] | undefined, maxLength: number = 100): string {
+  const clip = (text: string): string =>
+    text.length > maxLength ? text.substring(0, maxLength) + "..." : text
+
+  if (!content) return "(empty)"
+  if (typeof content === "string") return clip(content)
+
+  // Multimodal array: space-join the text parts, clip the result, then flag the presence of any image part
+  const texts: any[] = []
+  content.forEach((part: any) => { if (part.type === "text") texts.push(part.text) })
+  const preview = clip(texts.join(" "))
+  return content.some((part: any) => part.type === "image_url") ? `[image] ${preview}` : preview
+}
+
+/**
+ * Counts the characters of message content, whether it is a plain string or a multimodal part array.
+ * For arrays only `text` parts are measured; image and other parts add nothing to the total.
+ */
+function getContentLength(content: string | any[] | undefined): number {
+  if (!content) return 0
+  if (typeof content === "string") return content.length
+  // Multimodal array: accumulate the length of each text part (a missing text counts as 0)
+  return content.reduce(
+    (total: number, part: any) => (part.type === "text" ? total + (part.text?.length || 0) : total),
+    0,
+  )
+}
+
 /**
  * Callback for reporting retry progress to the UI
  */
@@ -658,23 +690,22 @@ async function makeAPICallAttempt(
       messagesCount: requestBody.messages.length,
       responseFormat: requestBody.response_format,
       estimatedTokens,
-      totalPromptLength: (requestBody.messages as Array<{ role: string; content: string }>).reduce(
-        (sum: number, msg: { role: string; content: string }) => sum + ((msg.content?.length) || 0),
+      totalPromptLength: (requestBody.messages as Array<{ role: string; content: string | any[] }>).reduce(
+        (sum: number, msg: { role: string; content: string | any[] }) => sum + getContentLength(msg.content),
         0,
       ),
       contextWarning: estimatedTokens > 8000 ? "WARNING: High token count, may exceed context limit" : null
     })
     logLLM("Request Body (truncated)", {
       ...requestBody,
-      messages: (requestBody.messages as Array<{ role: string; content: string }>).map(
-        (msg: { role: string; content: string }) => ({
+      messages: (requestBody.messages as Array<{ role: string; content: string | any[] }>).map(
+        (msg: { role: string; content: string | any[] }) => ({
           role: msg.role,
-          content: msg.content.length > 200
-            ? msg.content.substring(0, 200) + "... [" + msg.content.length + " chars]"
-            : msg.content,
+          content: getContentPreview(msg.content, 200),
         }),
       )
     })
+
   }
 
   // Create abort controller and register it so emergency stop can cancel
@@ -841,7 +872,7 @@ async function makeAPICallAttempt(
  * Make a fetch-based LLM call for OpenAI-compatible APIs with structured output fallback
  */
 async function makeOpenAICompatibleCall(
-  messages: Array<{ role: string; content: string }>,
+  messages: Array<{ role: string; content: string | any[] }>,
   providerId: string,
   useStructuredOutput: boolean = true,
   sessionId?: string,
@@ -861,7 +892,10 @@ async function makeOpenAICompatibleCall(
   }
 
   const model = getModel(providerId, "mcp")
-  const estimatedTokens = Math.ceil(messages.reduce((sum, msg) => sum + msg.content.length, 0) / 4)
+  const estimatedTokens = Math.ceil(messages.reduce((sum, msg) => {
+    const contentLength = typeof msg.content === 'string' ? msg.content.length : JSON.stringify(msg.content).length
+    return sum + contentLength
+  }, 0) / 4)
 
   const baseRequestBody = {
     model,
@@ -1016,7 +1050,7 @@ async function makeOpenAICompatibleCall(
  * Make a fetch-based LLM call for Gemini API
  */
 async function makeGeminiCall(
-  messages: Array<{ role: string; content: string }>,
+  messages: Array<{ role: string; content: string | any[] }>,
   sessionId?: string,
   onRetryProgress?: RetryProgressCallback,
 ): Promise<any> {
@@ -1030,8 +1064,30 @@ async function makeGeminiCall(
   const baseURL =
     config.geminiBaseUrl || "https://generativelanguage.googleapis.com"
 
+  // Renders message content as prompt text; image parts become a short placeholder so image data is not embedded in the prompt
+  const extractTextFromContent = (content: string | any[]): string => {
+    if (typeof content === 'string') return content
+    if (!Array.isArray(content)) return String(content)
+    // Multimodal array: text parts contribute their text, image_url parts a marker, anything else nothing
+    const pieces: any[] = []
+    content.forEach((part: any) => {
+      let piece: any = ''
+      if (part.type === 'text') {
+        piece = part.text || ''
+      } else if (part.type === 'image_url') {
+        piece = '[image attached]'
+      }
+      // Empty pieces are dropped so the join never produces doubled separators
+      if (piece) pieces.push(piece)
+    })
+    return pieces.join(' ')
+  }
+
   // Convert messages to Gemini format
-  const prompt = messages.map((m) => `${m.role}: ${m.content}`).join("\n\n")
+  const prompt = messages.map((m) => {
+    const content = extractTextFromContent(m.content)
+    return `${m.role}: ${content}`
+  }).join("\n\n")
 
   return apiCallWithRetry(async () => {
     if (isDebugLLM()) {
@@ -1144,7 +1200,7 @@ async function makeGeminiCall(
  * This is wrapped by makeLLMCallWithFetch with retry logic
  */
 async function makeLLMCallAttempt(
-  messages: Array<{ role: string; content: string }>,
+  messages: Array<{ role: string; content: string | any[] }>,
   chatProviderId: string,
   onRetryProgress?: RetryProgressCallback,
   sessionId?: string,
@@ -1153,7 +1209,7 @@ async function makeLLMCallAttempt(
     logLLM("🚀 Starting LLM call attempt", {
       provider: chatProviderId,
       messagesCount: messages.length,
-      lastMessagePreview: messages[messages.length - 1]?.content?.substring(0, 100) + "..."
+      lastMessagePreview: getContentPreview(messages[messages.length - 1]?.content, 100)
     })
   }
 
@@ -1323,7 +1379,7 @@ async function makeLLMCallAttempt(
  * Main function to make LLM calls using fetch with automatic retry on empty responses
  */
 export async function makeLLMCallWithFetch(
-  messages: Array<{ role: string; content: string }>,
+  messages: Array<{ role: string; content: string | any[] }>,
   providerId?: string,
   onRetryProgress?: RetryProgressCallback,
   sessionId?: string,