diff --git a/apps/desktop/speakmcp-rs/Cargo.lock b/apps/desktop/speakmcp-rs/Cargo.lock
index ce98e1740..7b037d323 100644
--- a/apps/desktop/speakmcp-rs/Cargo.lock
+++ b/apps/desktop/speakmcp-rs/Cargo.lock
@@ -431,7 +431,7 @@ dependencies = [
 
 [[package]]
 name = "speakmcp-rs"
-version = "1.0.0"
+version = "1.1.0"
 dependencies = [
  "enigo",
  "rdev",
diff --git a/apps/desktop/src/main/config.ts b/apps/desktop/src/main/config.ts
index 240520bd6..102bd3991 100644
--- a/apps/desktop/src/main/config.ts
+++ b/apps/desktop/src/main/config.ts
@@ -29,6 +29,8 @@ const getConfig = () => {
     mcpAutoPasteDelay: 1000, // 1 second delay by default
     mcpMaxIterations: 10, // Default max iterations for agent mode
     textInputEnabled: true,
+    alwaysIncludeScreenshot: false,
+    screenshotForVoiceCommands: false, // Auto-capture screenshot when using voice commands (MCP/agent mode)
 
     // Text input: On Windows, use Ctrl+Shift+T to avoid browser new tab conflict
     textInputShortcut: isWindows ? "ctrl-shift-t" : "ctrl-t",
diff --git a/apps/desktop/src/main/context-budget.ts b/apps/desktop/src/main/context-budget.ts
index 7c66e518a..0b72fb56c 100644
--- a/apps/desktop/src/main/context-budget.ts
+++ b/apps/desktop/src/main/context-budget.ts
@@ -4,7 +4,28 @@ import { makeTextCompletionWithFetch } from "./llm-fetch"
 import { constructMinimalSystemPrompt } from "./system-prompts"
 import { agentSessionStateManager } from "./state"
 
-export type LLMMessage = { role: string; content: string }
+export type LLMMessage = { role: string; content: string | any[] }
+
+// Character count of message content: a string's length, or for a multimodal array the summed length of its text parts
+function getContentLength(content: string | any[] | undefined): number {
+  if (!content) return 0
+  if (typeof content === 'string') return content.length
+  // Multimodal array: only parts with type 'text' are counted, so image parts add nothing to the total
+  return content
+    .filter((part: any) => part.type === 'text')
+    .reduce((sum: number, part: any) => sum + (part.text?.length || 0), 0)
+}
+
+// Flattens message content to plain text (used for previews, truncation and summarization input)
+function getContentAsString(content: string | any[] | undefined): string {
+  if (!content) return ''
+  if (typeof content === 'string') return content
+  // Multimodal array: keep only parts with type 'text' (a missing text becomes ''), joined with newlines
+  return content
+    .filter((part: any) => part.type === 'text')
+    .map((part: any) => part.text || '')
+    .join('\n')
+}
 
 // Simple in-memory cache for provider/model context windows
 const contextWindowCache = new Map<string, number>()
@@ -77,7 +98,7 @@ async function getMaxContextTokens(providerId: string, model: string): Promise<n
 
 function estimateTokensFromMessages(messages: LLMMessage[]): number {
   // Rough estimate: 4 chars ≈ 1 token
-  const totalChars = messages.reduce((sum, m) => sum + (m.content?.length || 0), 0)
+  const totalChars = messages.reduce((sum, m) => sum + getContentLength(m.content), 0)
   return Math.ceil(totalChars / 4)
 }
 
@@ -196,14 +217,15 @@ export async function shrinkMessagesForLLM(opts: ShrinkOptions): Promise<{ messa
   const AGGRESSIVE_TRUNCATE_THRESHOLD = 5000
   for (let i = 0; i < messages.length; i++) {
     const msg = messages[i]
-    if (msg.role === "user" && msg.content && msg.content.length > AGGRESSIVE_TRUNCATE_THRESHOLD) {
+    if (msg.role === "user" && getContentLength(msg.content) > AGGRESSIVE_TRUNCATE_THRESHOLD) {
       // Check if this looks like a tool result (contains JSON arrays/objects)
-      if (msg.content.includes('"url":') || msg.content.includes('"id":')) {
+      const contentStr = getContentAsString(msg.content)
+      if (contentStr.includes('"url":') || contentStr.includes('"id":')) {
         // Truncate aggressively and add note
         messages[i] = {
           ...msg,
-          content: msg.content.substring(0, AGGRESSIVE_TRUNCATE_THRESHOLD) +
-                   '\n\n... (truncated ' + (msg.content.length - AGGRESSIVE_TRUNCATE_THRESHOLD) +
+          content: contentStr.substring(0, AGGRESSIVE_TRUNCATE_THRESHOLD) +
+                   '\n\n... (truncated ' + (contentStr.length - AGGRESSIVE_TRUNCATE_THRESHOLD) +
                    ' characters for context management. Key information preserved above.)'
         }
         applied.push("aggressive_truncate")
@@ -218,7 +240,7 @@ export async function shrinkMessagesForLLM(opts: ShrinkOptions): Promise<{ messa
 
   // Tier 1: Summarize large messages (prefer tool outputs or very long entries)
   const indicesByLength = messages
-    .map((m, i) => ({ i, len: m.content?.length || 0, role: m.role, content: m.content }))
+    .map((m, i) => ({ i, len: getContentLength(m.content), role: m.role, content: m.content }))
     .filter((x) => x.len > summarizeThreshold && x.role !== "system")
     .sort((a, b) => b.len - a.len)
 
@@ -234,7 +256,8 @@ export async function shrinkMessagesForLLM(opts: ShrinkOptions): Promise<{ messa
     // Emit progress update before summarization
     summarizedCount++
     if (opts.onSummarizationProgress) {
-      const messagePreview = item.content!.substring(0, 100).replace(/\n/g, ' ')
+      const contentStr = getContentAsString(item.content)
+      const messagePreview = contentStr.substring(0, 100).replace(/\n/g, ' ')
       opts.onSummarizationProgress(
         summarizedCount,
         totalToSummarize,
@@ -242,7 +265,7 @@ export async function shrinkMessagesForLLM(opts: ShrinkOptions): Promise<{ messa
       )
     }
 
-    const summarized = await summarizeContent(item.content!, opts.sessionId)
+    const summarized = await summarizeContent(getContentAsString(item.content), opts.sessionId)
     messages[item.i] = { ...messages[item.i], content: summarized }
     applied.push("summarize")
     tokens = estimateTokensFromMessages(messages)
diff --git a/apps/desktop/src/main/index.ts b/apps/desktop/src/main/index.ts
index 617386b4a..3c886f670 100644
--- a/apps/desktop/src/main/index.ts
+++ b/apps/desktop/src/main/index.ts
@@ -1,4 +1,4 @@
-import { app, Menu } from "electron"
+import { app, Menu, ipcMain, desktopCapturer } from "electron"
 import { electronApp, optimizer } from "@electron-toolkit/utils"
 import {
   createMainWindow,
@@ -42,6 +42,54 @@ app.whenReady().then(() => {
   registerIpcMain(router)
   logApp("IPC main registered")
 
+  // Register desktopCapturer handler (available only in main process in Electron 31+).
+  // `options` arrives from the renderer, so it is treated as untrusted input that may be missing or malformed.
+  ipcMain.handle('getScreenSources', async (_event, options?: { types?: ('screen' | 'window')[], thumbnailSize?: { width?: number, height?: number } }) => {
+    try {
+      // Clamp a requested dimension to [100, 4096]; a missing, non-numeric, NaN or 0 value uses the fallback
+      const clampDimension = (value: unknown, fallback: number): number => {
+        const requested = typeof value === 'number' && !Number.isNaN(value) && value !== 0 ? value : fallback
+        return Math.min(Math.max(requested, 100), 4096)
+      }
+      const validatedOptions = {
+        // Privacy: only whole screens are ever captured. Requested 'window' types are ignored, so the
+        // effective list is always ['screen'] no matter what (if anything) the caller asked for.
+        types: ['screen'] as ('screen' | 'window')[],
+        thumbnailSize: {
+          width: clampDimension(options?.thumbnailSize?.width, 1920),
+          height: clampDimension(options?.thumbnailSize?.height, 1080)
+        }
+      }
+
+      logApp('[getScreenSources] Capturing screen sources with validated options:', JSON.stringify(validatedOptions))
+      const sources = await desktopCapturer.getSources(validatedOptions)
+      logApp(`[getScreenSources] Got ${sources.length} sources`)
+
+      // On macOS, if Screen Recording permission is not granted, desktopCapturer returns an empty array
+      // This is a silent failure - no error is thrown
+      if (sources.length === 0 && process.platform === 'darwin') {
+        throw new Error('No screen sources available. Please grant Screen Recording permission in System Settings > Privacy & Security > Screen Recording, then restart the app.')
+      }
+
+      // Serialize the sources - NativeImage thumbnail needs to be converted
+      const serialized = sources.map(source => {
+        const thumbnailDataUrl = source.thumbnail.toDataURL()
+        logApp(`[getScreenSources] Source: ${source.name}, thumbnail size: ${thumbnailDataUrl.length} chars`)
+        return {
+          id: source.id,
+          name: source.name,
+          thumbnail: thumbnailDataUrl,
+          display_id: source.display_id,
+          appIcon: source.appIcon ? source.appIcon.toDataURL() : null
+        }
+      })
+      return serialized
+    } catch (error) {
+      console.error('Failed to get screen sources:', error)
+      throw error
+    }
+  })
+
   registerServeProtocol()
 
 	  try {
diff --git a/apps/desktop/src/main/llm-fetch.ts b/apps/desktop/src/main/llm-fetch.ts
index 518f6e6cb..0c6603a39 100644
--- a/apps/desktop/src/main/llm-fetch.ts
+++ b/apps/desktop/src/main/llm-fetch.ts
@@ -5,6 +5,38 @@ import { isDebugLLM, logLLM } from "./debug"
 import { state, llmRequestAbortManager, agentSessionStateManager } from "./state"
 import OpenAI from "openai"
 
+/**
+ * Log-friendly preview of message content: "(empty)" when missing, otherwise the text cut to maxLength plus "..." if longer.
+ */
+function getContentPreview(content: string | any[] | undefined, maxLength: number = 100): string {
+  if (!content) return "(empty)"
+  if (typeof content === "string") {
+    return content.length > maxLength ? content.substring(0, maxLength) + "..." : content
+  }
+  // Multimodal array: join the text parts with spaces and prefix "[image] " when any image_url part is present
+  const textParts = content
+    .filter((part: any) => part.type === "text")
+    .map((part: any) => part.text)
+    .join(" ")
+  const hasImage = content.some((part: any) => part.type === "image_url")
+  const preview = textParts.length > maxLength ? textParts.substring(0, maxLength) + "..." : textParts
+  return hasImage ? `[image] ${preview}` : preview
+}
+
+/**
+ * Character count of message content: a string's length, or the summed text-part lengths of a multimodal array (0 if missing).
+ */
+function getContentLength(content: string | any[] | undefined): number {
+  if (!content) return 0
+  if (typeof content === "string") {
+    return content.length
+  }
+  // Multimodal array: sum the lengths of text parts only; non-text parts (such as images) are not counted
+  return content
+    .filter((part: any) => part.type === "text")
+    .reduce((sum: number, part: any) => sum + (part.text?.length || 0), 0)
+}
+
 /**
  * Callback for reporting retry progress to the UI
  */
@@ -658,23 +690,22 @@ async function makeAPICallAttempt(
       messagesCount: requestBody.messages.length,
       responseFormat: requestBody.response_format,
       estimatedTokens,
-      totalPromptLength: (requestBody.messages as Array<{ role: string; content: string }>).reduce(
-        (sum: number, msg: { role: string; content: string }) => sum + ((msg.content?.length) || 0),
+      totalPromptLength: (requestBody.messages as Array<{ role: string; content: string | any[] }>).reduce(
+        (sum: number, msg: { role: string; content: string | any[] }) => sum + getContentLength(msg.content),
         0,
       ),
       contextWarning: estimatedTokens > 8000 ? "WARNING: High token count, may exceed context limit" : null
     })
     logLLM("Request Body (truncated)", {
       ...requestBody,
-      messages: (requestBody.messages as Array<{ role: string; content: string }>).map(
-        (msg: { role: string; content: string }) => ({
+      messages: (requestBody.messages as Array<{ role: string; content: string | any[] }>).map(
+        (msg: { role: string; content: string | any[] }) => ({
           role: msg.role,
-          content: msg.content.length > 200
-            ? msg.content.substring(0, 200) + "... [" + msg.content.length + " chars]"
-            : msg.content,
+          content: getContentPreview(msg.content, 200),
         }),
       )
     })
+
   }
 
   // Create abort controller and register it so emergency stop can cancel
@@ -841,7 +872,7 @@ async function makeAPICallAttempt(
  * Make a fetch-based LLM call for OpenAI-compatible APIs with structured output fallback
  */
 async function makeOpenAICompatibleCall(
-  messages: Array<{ role: string; content: string }>,
+  messages: Array<{ role: string; content: string | any[] }>,
   providerId: string,
   useStructuredOutput: boolean = true,
   sessionId?: string,
@@ -861,7 +892,10 @@ async function makeOpenAICompatibleCall(
   }
 
   const model = getModel(providerId, "mcp")
-  const estimatedTokens = Math.ceil(messages.reduce((sum, msg) => sum + msg.content.length, 0) / 4)
+  const estimatedTokens = Math.ceil(messages.reduce((sum, msg) => {
+    const contentLength = typeof msg.content === 'string' ? msg.content.length : JSON.stringify(msg.content).length
+    return sum + contentLength
+  }, 0) / 4)
 
   const baseRequestBody = {
     model,
@@ -1016,7 +1050,7 @@ async function makeOpenAICompatibleCall(
  * Make a fetch-based LLM call for Gemini API
  */
 async function makeGeminiCall(
-  messages: Array<{ role: string; content: string }>,
+  messages: Array<{ role: string; content: string | any[] }>,
   sessionId?: string,
   onRetryProgress?: RetryProgressCallback,
 ): Promise<any> {
@@ -1030,8 +1064,30 @@ async function makeGeminiCall(
   const baseURL =
     config.geminiBaseUrl || "https://generativelanguage.googleapis.com"
 
+  // Flattens content to text for the Gemini prompt: text parts are kept, each image_url part becomes '[image attached]', other parts are dropped
+  const extractTextFromContent = (content: string | any[]): string => {
+    if (typeof content === 'string') {
+      return content
+    }
+    if (Array.isArray(content)) {
+      return content.map(part => {
+        if (part.type === 'text') {
+          return part.text || ''
+        }
+        if (part.type === 'image_url') {
+          return '[image attached]'
+        }
+        return ''
+      }).filter(Boolean).join(' ')
+    }
+    return String(content)
+  }
+
   // Convert messages to Gemini format
-  const prompt = messages.map((m) => `${m.role}: ${m.content}`).join("\n\n")
+  const prompt = messages.map((m) => {
+    const content = extractTextFromContent(m.content)
+    return `${m.role}: ${content}`
+  }).join("\n\n")
 
   return apiCallWithRetry(async () => {
     if (isDebugLLM()) {
@@ -1144,7 +1200,7 @@ async function makeGeminiCall(
  * This is wrapped by makeLLMCallWithFetch with retry logic
  */
 async function makeLLMCallAttempt(
-  messages: Array<{ role: string; content: string }>,
+  messages: Array<{ role: string; content: string | any[] }>,
   chatProviderId: string,
   onRetryProgress?: RetryProgressCallback,
   sessionId?: string,
@@ -1153,7 +1209,7 @@ async function makeLLMCallAttempt(
     logLLM("🚀 Starting LLM call attempt", {
       provider: chatProviderId,
       messagesCount: messages.length,
-      lastMessagePreview: messages[messages.length - 1]?.content?.substring(0, 100) + "..."
+      lastMessagePreview: getContentPreview(messages[messages.length - 1]?.content, 100)
     })
   }
 
@@ -1323,7 +1379,7 @@ async function makeLLMCallAttempt(
  * Main function to make LLM calls using fetch with automatic retry on empty responses
  */
 export async function makeLLMCallWithFetch(
-  messages: Array<{ role: string; content: string }>,
+  messages: Array<{ role: string; content: string | any[] }>,
   providerId?: string,
   onRetryProgress?: RetryProgressCallback,
   sessionId?: string,
diff --git a/apps/desktop/src/main/llm.ts b/apps/desktop/src/main/llm.ts
index e61eec6ae..5fc88be89 100644
--- a/apps/desktop/src/main/llm.ts
+++ b/apps/desktop/src/main/llm.ts
@@ -561,6 +561,7 @@ export async function processTranscriptWithAgentMode(
   conversationId?: string, // Conversation ID for linking to conversation history
   sessionId?: string, // Session ID for progress routing and isolation
   onProgress?: (update: AgentProgressUpdate) => void, // Optional callback for external progress consumers (e.g., SSE)
+  screenshot?: string, // Optional screenshot data URL for multimodal input
 ): Promise<AgentModeResponse> {
   const config = configStore.get()
 
@@ -778,9 +779,15 @@ export async function processTranscriptWithAgentMode(
     toolCalls?: MCPToolCall[]
     toolResults?: MCPToolResult[]
     timestamp?: number
+    screenshot?: string
   }> = [
     ...(previousConversationHistory || []),
-    { role: "user", content: transcript, timestamp: Date.now() },
+    {
+      role: "user",
+      content: transcript,
+      timestamp: Date.now(),
+      screenshot: screenshot
+    },
   ]
 
   logLLM(`[llm.ts processTranscriptWithAgentMode] conversationHistory initialized with ${conversationHistory.length} messages, roles: [${conversationHistory.map(m => m.role).join(', ')}]`)
@@ -881,7 +888,7 @@ export async function processTranscriptWithAgentMode(
   // Helper to map conversation history to LLM messages format (filters empty content)
   const mapConversationToMessages = (
     addSummaryPrompt: boolean = false
-  ): Array<{ role: "user" | "assistant"; content: string }> => {
+  ): Array<{ role: "user" | "assistant"; content: string | any[] }> => {
     const mapped = conversationHistory
       .map((entry) => {
         if (entry.role === "tool") {
@@ -891,9 +898,21 @@ export async function processTranscriptWithAgentMode(
         }
         const content = (entry.content || "").trim()
         if (!content) return null
+
+        // Handle multimodal content (text + screenshot)
+        if (entry.screenshot) {
+          return {
+            role: entry.role as "user" | "assistant",
+            content: [
+              { type: "text", text: entry.content },
+              { type: "image_url", image_url: { url: entry.screenshot, detail: "high" } }
+            ]
+          }
+        }
+
         return { role: entry.role as "user" | "assistant", content }
       })
-      .filter(Boolean) as Array<{ role: "user" | "assistant"; content: string }>
+      .filter(Boolean) as Array<{ role: "user" | "assistant"; content: string | any[] }>
 
     // Add summary prompt if last message is from assistant (ensures LLM has something to respond to)
     if (addSummaryPrompt && mapped.length > 0 && mapped[mapped.length - 1].role === "assistant") {
@@ -1178,8 +1197,8 @@ Always use actual resource IDs from the conversation history or create new ones
           // For assistant messages, ensure non-empty content
           // Anthropic API requires all messages to have non-empty content
           // except for the optional final assistant message
-          let content = entry.content
-          if (entry.role === "assistant" && !content?.trim()) {
+          let content: string | any[] = entry.content
+          if (entry.role === "assistant" && !entry.content?.trim()) {
             // If assistant message has tool calls but no content, describe the tool calls
             if (entry.toolCalls && entry.toolCalls.length > 0) {
               const toolNames = entry.toolCalls.map(tc => tc.name).join(", ")
@@ -1189,6 +1208,18 @@ Always use actual resource IDs from the conversation history or create new ones
               content = "[Processing...]"
             }
           }
+
+          // Handle multimodal content (text + screenshot)
+          if (entry.screenshot) {
+            return {
+              role: entry.role as "user" | "assistant",
+              content: [
+                { type: "text", text: entry.content },
+                { type: "image_url", image_url: { url: entry.screenshot, detail: "high" } }
+              ]
+            }
+          }
+
           return {
             role: entry.role as "user" | "assistant",
             content,
diff --git a/apps/desktop/src/main/renderer-handlers.ts b/apps/desktop/src/main/renderer-handlers.ts
index 0243f1618..1c1b606b2 100644
--- a/apps/desktop/src/main/renderer-handlers.ts
+++ b/apps/desktop/src/main/renderer-handlers.ts
@@ -9,9 +9,9 @@ export type RendererHandlers = {
   startOrFinishRecording: (data?: { fromButtonClick?: boolean }) => void
   refreshRecordingHistory: () => void
 
-  startMcpRecording: (data?: { conversationId?: string; sessionId?: string; fromTile?: boolean; fromButtonClick?: boolean }) => void
+  startMcpRecording: (data?: { conversationId?: string; sessionId?: string; fromTile?: boolean; fromButtonClick?: boolean; screenshot?: string }) => void
   finishMcpRecording: () => void
-  startOrFinishMcpRecording: (data?: { conversationId?: string; sessionId?: string; fromTile?: boolean; fromButtonClick?: boolean }) => void
+  startOrFinishMcpRecording: (data?: { conversationId?: string; sessionId?: string; fromTile?: boolean; fromButtonClick?: boolean; screenshot?: string }) => void
 
   showTextInput: () => void
   hideTextInput: () => void
diff --git a/apps/desktop/src/main/tipc.ts b/apps/desktop/src/main/tipc.ts
index 6e37828b6..df33e774e 100644
--- a/apps/desktop/src/main/tipc.ts
+++ b/apps/desktop/src/main/tipc.ts
@@ -149,9 +149,21 @@ async function processWithAgentMode(
   conversationId?: string,
   existingSessionId?: string, // Optional: reuse existing session instead of creating new one
   startSnoozed: boolean = false, // Whether to start session snoozed (default: false to show panel)
+  screenshot?: string, // Optional screenshot data URL for multimodal input
 ): Promise<string> {
   const config = configStore.get()
 
+  // Validate screenshot if provided
+  if (screenshot) {
+    if (!screenshot.startsWith('data:image/')) {
+      throw new Error('Invalid screenshot format: must be a data URL starting with data:image/')
+    }
+    const sizeInMB = (screenshot.length * 0.75) / (1024 * 1024)
+    if (sizeInMB > 10) {
+      throw new Error(`Screenshot too large: ${sizeInMB.toFixed(1)}MB (maximum 10MB)`)
+    }
+  }
+
   // NOTE: Don't clear all agent progress here - we support multiple concurrent sessions
   // Each session manages its own progress lifecycle independently
 
@@ -295,6 +307,8 @@ async function processWithAgentMode(
       previousConversationHistory,
       conversationId, // Pass conversation ID for linking to conversation history
       sessionId, // Pass session ID for progress routing and isolation
+      undefined, // onProgress callback (not used here, progress is emitted internally)
+      screenshot, // Pass screenshot data for multimodal input
     )
 
     // Mark session as completed
@@ -866,13 +880,14 @@ export const router = {
   }),
 
   triggerMcpRecording: t.procedure
-    .input<{ conversationId?: string; sessionId?: string; fromTile?: boolean }>()
+    .input<{ conversationId?: string; sessionId?: string; fromTile?: boolean; screenshot?: string }>()
     .action(async ({ input }) => {
       const { showPanelWindowAndStartMcpRecording } = await import("./window")
       // Always show the panel during recording for waveform feedback
       // The fromTile flag tells the panel to hide after recording ends
       // fromButtonClick=true indicates this was triggered via UI button (not keyboard shortcut)
-      await showPanelWindowAndStartMcpRecording(input.conversationId, input.sessionId, input.fromTile, true)
+      // screenshot is passed through to the renderer for multimodal input
+      await showPanelWindowAndStartMcpRecording(input.conversationId, input.sessionId, input.fromTile, true, input.screenshot)
     }),
 
   showMainWindow: t.procedure
@@ -1019,6 +1034,7 @@ export const router = {
   createTextInput: t.procedure
     .input<{
       text: string
+      screenshot?: string
     }>()
     .action(async ({ input }) => {
       const config = configStore.get()
@@ -1073,6 +1089,7 @@ export const router = {
       text: string
       conversationId?: string
       fromTile?: boolean // When true, session runs in background (snoozed) - panel won't show
+      screenshot?: string // Optional screenshot data URL for multimodal input
     }>()
     .action(async ({ input }) => {
       const config = configStore.get()
@@ -1128,7 +1145,7 @@ export const router = {
       // This allows multiple sessions to run concurrently
       // Pass existingSessionId to reuse the session if found
       // When fromTile=true, start snoozed so the floating panel doesn't appear
-      processWithAgentMode(input.text, conversationId, existingSessionId, input.fromTile ?? false)
+      processWithAgentMode(input.text, conversationId, existingSessionId, input.fromTile ?? false, input.screenshot)
         .then((finalResponse) => {
           // Save to history after completion
           const history = getRecordingHistory()
@@ -1161,7 +1178,7 @@ export const router = {
           }
         })
         .catch((error) => {
-          logLLM("[createMcpTextInput] Agent processing error:", error)
+          logApp("[createMcpTextInput] Agent processing error:", error)
         })
         .finally(() => {
           // Process queued messages after this session completes (success or error)
@@ -1182,6 +1199,7 @@ export const router = {
       conversationId?: string
       sessionId?: string
       fromTile?: boolean // When true, session runs in background (snoozed) - panel won't show
+      screenshot?: string // Optional screenshot data URL for multimodal input
     }>()
     .action(async ({ input }) => {
       fs.mkdirSync(recordingsFolder, { recursive: true })
@@ -1359,7 +1377,8 @@ export const router = {
         // Fire-and-forget: Start agent processing without blocking
         // This allows multiple sessions to run concurrently
         // Pass the sessionId to avoid creating a duplicate session
-        processWithAgentMode(transcript, conversationId, sessionId)
+        // Pass startSnoozed for tile behavior and screenshot for multimodal input
+        processWithAgentMode(transcript, conversationId, sessionId, startSnoozed, input.screenshot)
         .then((finalResponse) => {
           // Save to history after completion
           const history = getRecordingHistory()
@@ -2063,6 +2082,19 @@ export const router = {
     await shell.openPath(conversationsFolder)
   }),
 
+  // Display/Screen endpoints: getAvailableDisplays lists every display as { id (string), label, bounds, isPrimary }
+  getAvailableDisplays: t.procedure.action(async () => {
+    const { screen } = await import('electron')
+    const displays = screen.getAllDisplays()
+    const primaryDisplay = screen.getPrimaryDisplay()
+    return displays.map(d => ({
+      id: d.id.toString(),
+      label: d.label || `Display ${d.id}`,
+      bounds: d.bounds,
+      isPrimary: d.id === primaryDisplay.id
+    }))
+  }),
+
   // Panel resize endpoints
   getPanelSize: t.procedure.action(async () => {
     const win = WINDOWS.get("panel")
diff --git a/apps/desktop/src/main/window.ts b/apps/desktop/src/main/window.ts
index 0961e4285..31eabc1f3 100644
--- a/apps/desktop/src/main/window.ts
+++ b/apps/desktop/src/main/window.ts
@@ -4,6 +4,7 @@ import {
   shell,
   screen,
   app,
+  desktopCapturer,
 } from "electron"
 import path from "path"
 import { getRendererHandlers } from "@egoist/tipc/main"
@@ -17,6 +18,45 @@ import { setupConsoleLogger } from "./console-logger"
 
 type WINDOW_ID = "main" | "panel" | "setup"
 
+/**
+ * Capture a screenshot of the display selected by config.screenshotDisplayId, defaulting to the first source returned.
+ * Resolves to the screenshot as a data URL; any failure (including no available sources) is logged and rethrown.
+ */
+export async function captureScreenshotFromMain(): Promise<string | undefined> {
+  try {
+    const config = configStore.get()
+    const sources = await desktopCapturer.getSources({
+      types: ['screen'],
+      thumbnailSize: { width: 1920, height: 1080 }
+    })
+
+    if (sources.length === 0) {
+      if (process.platform === 'darwin') {
+        throw new Error('No screen sources available. Please grant Screen Recording permission in System Settings > Privacy & Security > Screen Recording, then restart the app.')
+      } else {
+        throw new Error('No screen sources available')
+      }
+    }
+
+    // Prefer the source matching the configured display; otherwise keep the first source (presumably the primary display - verify)
+    const configuredDisplayId = config.screenshotDisplayId
+    let source = sources[0]
+    if (configuredDisplayId) {
+      const matchingSource = sources.find(s => s.display_id === configuredDisplayId)
+      if (matchingSource) {
+        source = matchingSource
+      }
+    }
+
+    const screenshot = source.thumbnail.toDataURL()
+    logApp(`[captureScreenshotFromMain] Captured screenshot from display: ${source.display_id}, size: ${screenshot.length} chars`)
+    return screenshot
+  } catch (error) {
+    logApp('[captureScreenshotFromMain] Failed to capture screenshot:', error)
+    throw error
+  }
+}
+
 export const WINDOWS = new Map<WINDOW_ID, BrowserWindow>()
 
 
@@ -433,7 +473,7 @@ export async function showPanelWindowAndStartRecording(fromButtonClick?: boolean
   getWindowRendererHandlers("panel")?.startRecording.send({ fromButtonClick })
 }
 
-export async function showPanelWindowAndStartMcpRecording(conversationId?: string, sessionId?: string, fromTile?: boolean, fromButtonClick?: boolean) {
+export async function showPanelWindowAndStartMcpRecording(conversationId?: string, sessionId?: string, fromTile?: boolean, fromButtonClick?: boolean, screenshot?: string) {
   // Capture focus before showing panel
   try {
     const focusedApp = await getFocusedAppInfo()
@@ -446,11 +486,25 @@ export async function showPanelWindowAndStartMcpRecording(conversationId?: strin
   state.isRecordingFromButtonClick = fromButtonClick ?? false
   state.isRecordingMcpMode = true
 
+  // Auto-capture screenshot if enabled for voice commands and no screenshot was explicitly passed
+  let effectiveScreenshot = screenshot
+  if (!effectiveScreenshot) {
+    const config = configStore.get()
+    if (config.screenshotForVoiceCommands) {
+      try {
+        effectiveScreenshot = await captureScreenshotFromMain()
+      } catch (error) {
+        // Log but continue - recording can proceed without screenshot
+        logApp('[showPanelWindowAndStartMcpRecording] Screenshot capture failed:', error)
+      }
+    }
+  }
+
   // Ensure consistent sizing by setting mode in main before showing
   setPanelMode("normal")
   showPanelWindow()
-  // Pass fromTile and fromButtonClick flags so panel knows how to behave after recording ends
-  getWindowRendererHandlers("panel")?.startMcpRecording.send({ conversationId, sessionId, fromTile, fromButtonClick })
+  // Pass fromTile, fromButtonClick, and screenshot flags so panel knows how to behave after recording ends
+  getWindowRendererHandlers("panel")?.startMcpRecording.send({ conversationId, sessionId, fromTile, fromButtonClick, screenshot: effectiveScreenshot })
 }
 
 export async function showPanelWindowAndShowTextInput() {
diff --git a/apps/desktop/src/preload/index.d.ts b/apps/desktop/src/preload/index.d.ts
index 8b2bd139d..b4d6400b3 100644
--- a/apps/desktop/src/preload/index.d.ts
+++ b/apps/desktop/src/preload/index.d.ts
@@ -1,5 +1,13 @@
 import { ElectronAPI } from "@electron-toolkit/preload"
 
+interface ScreenSource {
+  id: string
+  name: string
+  thumbnail: string  // Data URL of the source's thumbnail image
+  display_id: string
+  appIcon: string | null  // Data URL of the app icon, or null when the source has none
+}
+
 declare global {
   interface Window {
     electron: ElectronAPI
@@ -9,6 +17,7 @@ declare global {
       getOAuthStatus: (serverName: string) => Promise<{ configured: boolean; authenticated: boolean; tokenExpiry?: number; error?: string }>
       revokeOAuthTokens: (serverName: string) => Promise<{ success: boolean; error?: string }>
       testMCPServer: (serverName: string, config: any) => Promise<{ success: boolean; error?: string }>
+      getScreenSources: (options: { types: string[], thumbnailSize?: { width: number, height: number } }) => Promise<ScreenSource[]>
     }
   }
 }
diff --git a/apps/desktop/src/preload/index.ts b/apps/desktop/src/preload/index.ts
index 2757b1112..026c31b7d 100644
--- a/apps/desktop/src/preload/index.ts
+++ b/apps/desktop/src/preload/index.ts
@@ -10,6 +10,9 @@ const api = {
   getOAuthStatus: (serverName: string) => ipcRenderer.invoke('getOAuthStatus', serverName),
   revokeOAuthTokens: (serverName: string) => ipcRenderer.invoke('revokeOAuthTokens', serverName),
   testMCPServer: (serverName: string, config: any) => ipcRenderer.invoke('testMCPServer', { serverName, config }),
+  // Screenshot API - uses IPC to main process (desktopCapturer is only available in main process in Electron 31+)
+  getScreenSources: (options: { types: string[], thumbnailSize?: { width: number, height: number } }) =>
+    ipcRenderer.invoke('getScreenSources', options)
 }
 
 if (process.contextIsolated) {
diff --git a/apps/desktop/src/renderer/src/components/text-input-panel.tsx b/apps/desktop/src/renderer/src/components/text-input-panel.tsx
index 8fb72b989..854cc2b04 100644
--- a/apps/desktop/src/renderer/src/components/text-input-panel.tsx
+++ b/apps/desktop/src/renderer/src/components/text-input-panel.tsx
@@ -4,9 +4,19 @@ import { cn } from "@renderer/lib/utils"
 import { AgentProcessingView } from "./agent-processing-view"
 import { AgentProgressUpdate } from "../../../shared/types"
 import { useTheme } from "@renderer/contexts/theme-context"
+import { Camera, Eye } from "lucide-react"
+import { useConfigQuery } from "@renderer/lib/query-client"
+import {
+  Dialog,
+  DialogContent,
+  DialogHeader,
+  DialogTitle,
+  DialogDescription,
+} from "./ui/dialog"
+import { Button } from "./ui/button"
 
 interface TextInputPanelProps {
-  onSubmit: (text: string) => void
+  onSubmit: (text: string, screenshot?: string) => void
   onCancel: () => void
   isProcessing?: boolean
   agentProgress?: AgentProgressUpdate | null
@@ -22,10 +32,26 @@ export const TextInputPanel = forwardRef<TextInputPanelRef, TextInputPanelProps>
   isProcessing = false,
   agentProgress,
 }, ref) => {
+  const configQuery = useConfigQuery()
+  const alwaysIncludeScreenshot = configQuery.data?.alwaysIncludeScreenshot ?? false
+
   const [text, setText] = useState("")
+  const [includeScreenshot, setIncludeScreenshot] = useState(alwaysIncludeScreenshot)
+  const [screenshot, setScreenshot] = useState<string | null>(null)
+  const [isCapturingScreenshot, setIsCapturingScreenshot] = useState(false)
+  const [screenshotError, setScreenshotError] = useState<string | null>(null)
+  const [previewOpen, setPreviewOpen] = useState(false)
+  const [previewImage, setPreviewImage] = useState<string | null>(null)
+  const [previewImageInfo, setPreviewImageInfo] = useState<{ width: number; height: number; size: string } | null>(null)
   const textareaRef = useRef<HTMLTextAreaElement>(null)
+  const captureWantedRef = useRef(false)
   const { isDark } = useTheme()
 
+  // Sync includeScreenshot state when config loads or alwaysIncludeScreenshot setting changes
+  useEffect(() => {
+    setIncludeScreenshot(alwaysIncludeScreenshot)
+  }, [alwaysIncludeScreenshot])
+
   useImperativeHandle(ref, () => ({
     focus: () => {
       textareaRef.current?.focus()
@@ -52,13 +78,147 @@ export const TextInputPanel = forwardRef<TextInputPanelRef, TextInputPanelProps>
     return undefined
   }, [isProcessing])
 
+  // Helper to find the correct source based on configured display ID
+  const findSourceByDisplayId = (sources: Array<{ id: string, name: string, thumbnail: string, display_id: string }>, configuredDisplayId: string | undefined) => {
+    if (!configuredDisplayId || configuredDisplayId === '') {
+      // No display ID configured: use the first source returned (assumed to be the primary display - TODO confirm)
+      return sources[0]
+    }
+    // Find source matching the configured display_id
+    const matchingSource = sources.find(s => s.display_id === configuredDisplayId)
+    if (matchingSource) {
+      console.log('[TextInputPanel] Found matching source for display_id:', configuredDisplayId)
+      return matchingSource
+    }
+    // Fall back to first source if configured display not found
+    console.log('[TextInputPanel] Configured display_id not found, falling back to primary:', configuredDisplayId)
+    return sources[0]
+  }
+
+  const captureScreenshot = async () => {
+    console.log('[TextInputPanel] captureScreenshot called')
+    setIsCapturingScreenshot(true)
+    setScreenshotError(null)
+    try {
+      // Use IPC to get screen sources from main process (desktopCapturer is only available in main process in Electron 31+)
+      console.log('[TextInputPanel] Calling getScreenSources...')
+      const sources = await (window as any).electronAPI.getScreenSources({
+        types: ['screen'],
+        thumbnailSize: { width: 1920, height: 1080 }
+      })
+      console.log('[TextInputPanel] Got sources:', sources?.length || 0)
+
+      // Check if screenshot is still wanted after async operation completes
+      if (!captureWantedRef.current) {
+        console.log('[TextInputPanel] Screenshot no longer wanted, discarding')
+        return
+      }
+
+      if (sources && sources.length > 0) {
+        // Get the source matching the configured display, or fallback to primary
+        const configuredDisplayId = configQuery.data?.screenshotDisplayId
+        const source = findSourceByDisplayId(sources, configuredDisplayId)
+        const screenshot = source.thumbnail
+        console.log('[TextInputPanel] Screenshot captured from display_id:', source.display_id, 'length:', screenshot?.length || 0)
+        setScreenshot(screenshot)
+      } else {
+        console.log('[TextInputPanel] No sources returned')
+      }
+    } catch (error: any) {
+      console.error('[TextInputPanel] Failed to capture screenshot:', error)
+      // Show the actual error message if available (e.g., permission error on macOS)
+      const errorMessage = error?.message || 'Failed to capture screenshot'
+      setScreenshotError(errorMessage)
+      setIncludeScreenshot(false)
+    } finally {
+      setIsCapturingScreenshot(false)
+    }
+  }
+
+  const handlePreviewScreenshot = async () => {
+    try {
+      let imageToPreview: string
+
+      // Use existing screenshot if available, otherwise capture a fresh one
+      if (screenshot) {
+        // Use the existing screenshot state - this is what will actually be sent
+        imageToPreview = screenshot
+      } else {
+        // No screenshot exists yet, capture one and update state so preview matches what will be sent
+        const sources = await (window as any).electronAPI.getScreenSources({
+          types: ['screen'],
+          thumbnailSize: { width: 1920, height: 1080 }
+        })
+
+        if (!sources || sources.length === 0) {
+          console.error('[TextInputPanel] No sources available for preview')
+          return
+        }
+
+        // Get the source matching the configured display, or fallback to primary
+        const configuredDisplayId = configQuery.data?.screenshotDisplayId
+        const source = findSourceByDisplayId(sources, configuredDisplayId)
+        imageToPreview = source.thumbnail as string
+
+        // Update the screenshot state so it matches what we're previewing
+        setScreenshot(imageToPreview)
+      }
+
+      setPreviewImage(imageToPreview)
+
+      // Calculate image info
+      const img = new Image()
+      img.onload = () => {
+        // Calculate approximate size of base64 data
+        const base64Length = imageToPreview.length - (imageToPreview.indexOf(',') + 1)
+        const sizeInBytes = Math.ceil(base64Length * 0.75)
+        const sizeInKB = (sizeInBytes / 1024).toFixed(1)
+        const sizeStr = sizeInBytes > 1024 * 1024
+          ? `${(sizeInBytes / (1024 * 1024)).toFixed(2)} MB`
+          : `${sizeInKB} KB`
+
+        setPreviewImageInfo({
+          width: img.naturalWidth,
+          height: img.naturalHeight,
+          size: sizeStr
+        })
+      }
+      img.src = imageToPreview
+
+      setPreviewOpen(true)
+    } catch (error) {
+      console.error('[TextInputPanel] Failed to capture preview screenshot:', error)
+    }
+  }
+
   const handleSubmit = () => {
     if (text.trim() && !isProcessing) {
-      onSubmit(text.trim())
+      // Only include the screenshot if the checkbox is still checked and a capture is available
+      const screenshotToSend = includeScreenshot && screenshot ? screenshot : undefined
+      console.log('[TextInputPanel] handleSubmit called, screenshot:', screenshotToSend ? `${screenshotToSend.length} chars` : 'none')
+      onSubmit(text.trim(), screenshotToSend)
       setText("")
+      setScreenshot(null)
+      setIncludeScreenshot(alwaysIncludeScreenshot)
     }
   }
 
+  // Capture screenshot when checkbox is toggled on, clear when toggled off
+  useEffect(() => {
+    if (includeScreenshot) {
+      captureWantedRef.current = true
+      if (!screenshot) {
+        captureScreenshot()
+      }
+    } else {
+      captureWantedRef.current = false
+      if (screenshot) {
+        // Clear screenshot when user unchecks the box
+        setScreenshot(null)
+      }
+    }
+  }, [includeScreenshot, screenshot])
+
   const handleKeyDown = (e: React.KeyboardEvent) => {
     const isModifierPressed = e.metaKey || e.ctrlKey;
 
@@ -134,6 +294,43 @@ export const TextInputPanel = forwardRef<TextInputPanelRef, TextInputPanelProps>
             disabled={isProcessing}
             aria-label="Message input"
           />
+
+          {/* Screenshot option */}
+          <div className="flex items-center gap-2">
+            <label className="flex items-center gap-2 cursor-pointer text-xs modern-text-muted hover:modern-text-strong transition-colors">
+              <input
+                type="checkbox"
+                checked={includeScreenshot}
+                onChange={(e) => setIncludeScreenshot(e.target.checked)}
+                disabled={isProcessing || isCapturingScreenshot}
+                className="h-3 w-3 rounded border-gray-300"
+              />
+              <Camera className="h-3 w-3" />
+              <span>Include screenshot</span>
+            </label>
+            {includeScreenshot && (
+              <Button
+                variant="ghost"
+                size="sm"
+                onClick={handlePreviewScreenshot}
+                title="Preview screenshot"
+                className="h-6 px-2"
+              >
+                <Eye className="h-3 w-3" />
+              </Button>
+            )}
+            {isCapturingScreenshot && (
+              <span className="text-xs modern-text-muted">Capturing...</span>
+            )}
+            {screenshot && !isCapturingScreenshot && (
+              <span className="text-xs text-green-500">✓ Screenshot captured</span>
+            )}
+            {screenshotError && !isCapturingScreenshot && (
+              <span className="text-xs text-red-500" title={screenshotError}>
+                ✗ {screenshotError.includes('Screen Recording') ? 'Screen Recording permission required' : screenshotError}
+              </span>
+            )}
+          </div>
         </div>
       )}
 
@@ -167,6 +364,32 @@ export const TextInputPanel = forwardRef<TextInputPanelRef, TextInputPanelProps>
           </button>
         </div>
       </div>
+
+      {/* Screenshot Preview Dialog */}
+      <Dialog open={previewOpen} onOpenChange={setPreviewOpen}>
+        <DialogContent className="max-w-[90vw] max-h-[90vh] w-auto">
+          <DialogHeader>
+            <DialogTitle>Screenshot Preview</DialogTitle>
+            <DialogDescription>
+              This is what will be sent with your message
+              {previewImageInfo && (
+                <span className="ml-2 text-xs">
+                  ({previewImageInfo.width} × {previewImageInfo.height}, {previewImageInfo.size})
+                </span>
+              )}
+            </DialogDescription>
+          </DialogHeader>
+          <div className="flex justify-center items-center overflow-auto max-h-[70vh]">
+            {previewImage && (
+              <img
+                src={previewImage}
+                alt="Screenshot preview"
+                className="max-w-full max-h-[65vh] object-contain rounded-md border border-border"
+              />
+            )}
+          </div>
+        </DialogContent>
+      </Dialog>
     </div>
   )
 })
diff --git a/apps/desktop/src/renderer/src/pages/panel.tsx b/apps/desktop/src/renderer/src/pages/panel.tsx
index acbf9dbf1..f156972f3 100644
--- a/apps/desktop/src/renderer/src/pages/panel.tsx
+++ b/apps/desktop/src/renderer/src/pages/panel.tsx
@@ -38,6 +38,7 @@ export function Component() {
   const mcpConversationIdRef = useRef<string | undefined>(undefined)
   const mcpSessionIdRef = useRef<string | undefined>(undefined)
   const fromTileRef = useRef<boolean>(false)
+  const mcpScreenshotRef = useRef<string | undefined>(undefined)
   const [fromButtonClick, setFromButtonClick] = useState(false)
   const { isDark } = useTheme()
   const lastRequestedModeRef = useRef<"normal" | "agent" | "textInput">("normal")
@@ -225,11 +226,13 @@ export function Component() {
       const conversationIdForMcp = mcpConversationIdRef.current ?? currentConversationId
       const sessionIdForMcp = mcpSessionIdRef.current
       const wasFromTile = fromTileRef.current
+      const screenshotForMcp = mcpScreenshotRef.current
 
       // Clear the refs after capturing to avoid reusing stale IDs
       mcpConversationIdRef.current = undefined
       mcpSessionIdRef.current = undefined
       fromTileRef.current = false
+      mcpScreenshotRef.current = undefined
 
       // If recording was from a tile, hide the floating panel immediately
       // The session will continue in the tile view
@@ -251,6 +254,8 @@ export function Component() {
         sessionId: sessionIdForMcp,
         // Pass fromTile so session starts snoozed when recording was from a tile
         fromTile: wasFromTile,
+        // Pass screenshot for multimodal input if provided
+        screenshot: screenshotForMcp,
       })
 
       // NOTE: Do NOT call continueConversation here!
@@ -281,8 +286,8 @@ export function Component() {
   })
 
   const textInputMutation = useMutation({
-    mutationFn: async ({ text }: { text: string }) => {
-      await tipcClient.createTextInput({ text })
+    mutationFn: async ({ text, screenshot }: { text: string; screenshot?: string }) => {
+      await tipcClient.createTextInput({ text, screenshot })
     },
     onError(error) {
       setShowTextInput(false)
@@ -307,11 +312,13 @@ export function Component() {
     mutationFn: async ({
       text,
       conversationId,
+      screenshot,
     }: {
       text: string
       conversationId?: string
+      screenshot?: string
     }) => {
-      const result = await tipcClient.createMcpTextInput({ text, conversationId })
+      const result = await tipcClient.createMcpTextInput({ text, conversationId, screenshot })
 
       // NOTE: Do NOT call continueConversation here!
       // The currentConversationId should only be set through explicit user actions
@@ -506,7 +513,7 @@ export function Component() {
     return unlisten
   }, [])
 
-  const handleTextSubmit = async (text: string) => {
+  const handleTextSubmit = async (text: string, screenshot?: string) => {
     // Capture the conversation ID at submit time - if user explicitly continued a conversation
     // from history, currentConversationId will be set. Otherwise it's null for new inputs.
     const conversationIdForMcp = currentConversationId
@@ -523,13 +530,14 @@ export function Component() {
     // Ensure main process no longer treats panel as textInput mode
     tipcClient.clearTextInputState({})
 
-    // Always use MCP processing
+    // Always use MCP processing with optional screenshot
     mcpTextInputMutation.mutate({
       text,
       // Pass currentConversationId if user explicitly continued from history,
       // otherwise undefined to create a fresh conversation.
       // This prevents message leaking while still supporting explicit continuation.
       conversationId: conversationIdForMcp ?? undefined,
+      screenshot,
     })
   }
 
@@ -538,10 +546,11 @@ export function Component() {
   // MCP handlers
   useEffect(() => {
     const unlisten = rendererHandlers.startMcpRecording.listen((data) => {
-      // Store the conversationId, sessionId, and fromTile flag for use when recording ends
+      // Store the conversationId, sessionId, fromTile flag, and screenshot for use when recording ends
       mcpConversationIdRef.current = data?.conversationId
       mcpSessionIdRef.current = data?.sessionId
       fromTileRef.current = data?.fromTile ?? false
+      mcpScreenshotRef.current = data?.screenshot
       // Track if recording was triggered via UI button click vs keyboard shortcut
       // When true, we show "Enter" as the submit hint instead of "Release keys"
       setFromButtonClick(data?.fromButtonClick ?? false)
@@ -581,9 +590,10 @@ export function Component() {
         isConfirmedRef.current = true
         recorderRef.current?.stopRecording()
       } else {
-        // Store the conversationId and sessionId for use when recording ends
+        // Store the conversationId, sessionId, and screenshot for use when recording ends
         mcpConversationIdRef.current = data?.conversationId
         mcpSessionIdRef.current = data?.sessionId
+        mcpScreenshotRef.current = data?.screenshot
         // Track if recording was triggered via UI button click vs keyboard shortcut
         setFromButtonClick(data?.fromButtonClick ?? false)
         setMcpMode(true)
diff --git a/apps/desktop/src/renderer/src/pages/settings-general.tsx b/apps/desktop/src/renderer/src/pages/settings-general.tsx
index a6159b7e6..8bdaa51dc 100644
--- a/apps/desktop/src/renderer/src/pages/settings-general.tsx
+++ b/apps/desktop/src/renderer/src/pages/settings-general.tsx
@@ -104,6 +104,22 @@ export function Component() {
   const shortcut = (configQuery.data as any)?.shortcut || "hold-ctrl"
   const textInputShortcut = (configQuery.data as any)?.textInputShortcut || "ctrl-t"
 
+  // State for available displays
+  const [displays, setDisplays] = useState<Array<{
+    id: string
+    label: string
+    bounds: { x: number, y: number, width: number, height: number }
+    isPrimary: boolean
+  }>>([])
+
+  // Fetch available displays
+  useEffect(() => {
+    tipcClient.getAvailableDisplays().then((result: typeof displays) => {
+      setDisplays(result || [])
+    }).catch((err: unknown) => {
+      console.error('Failed to get displays:', err)
+    })
+  }, [])
 
   if (!configQuery.data) return null
 
@@ -353,6 +369,19 @@ export function Component() {
                     placeholder="Click to record custom text input shortcut"
                   />
                 )}
+
+              <div className="flex items-center gap-2">
+                <Switch
+                  checked={configQuery.data?.alwaysIncludeScreenshot ?? false}
+                  onCheckedChange={(checked) => {
+                    saveConfig({
+                      alwaysIncludeScreenshot: checked,
+                    })
+                  }}
+                  disabled={!configQuery.data?.textInputEnabled}
+                />
+                <span className="text-sm text-muted-foreground">Always include screenshot with messages</span>
+              </div>
             </div>
           </Control>
 
@@ -729,6 +758,39 @@ export function Component() {
 
         </ControlGroup>
 
+        {/* Screenshot Settings */}
+        <ControlGroup title="Screenshot">
+          <Control label={<ControlLabel label="Screenshot Display" tooltip="Select which display to capture when taking screenshots. Useful for multi-monitor setups." />} className="px-3">
+            <Select
+              value={configQuery.data?.screenshotDisplayId || "primary"}
+              onValueChange={(value) => {
+                saveConfig({
+                  screenshotDisplayId: value === "primary" ? "" : value,
+                })
+              }}
+            >
+              <SelectTrigger className="w-64">
+                <SelectValue placeholder="Primary Display" />
+              </SelectTrigger>
+              <SelectContent>
+                <SelectItem value="primary">Primary Display (default)</SelectItem>
+                {displays.map(d => (
+                  <SelectItem key={d.id} value={d.id}>
+                    {d.label || `Display ${d.id}`} {d.isPrimary ? "(Primary)" : ""} - {d.bounds.width}x{d.bounds.height}
+                  </SelectItem>
+                ))}
+              </SelectContent>
+            </Select>
+          </Control>
+
+          <Control label={<ControlLabel label="Screenshot for Voice Commands" tooltip="Automatically capture a screenshot when using voice commands (agent mode). The screenshot will be sent along with your voice input for context-aware responses." />} className="px-3">
+            <Switch
+              checked={configQuery.data?.screenshotForVoiceCommands ?? false}
+              onCheckedChange={(value) => saveConfig({ screenshotForVoiceCommands: value })}
+            />
+          </Control>
+        </ControlGroup>
+
         {/* Agent Settings */}
         <ControlGroup title="Agent Settings">
           <Control label={<ControlLabel label="Message Queuing" tooltip="Allow queueing messages while the agent is processing. Messages will be processed in order after the current task completes." />} className="px-3">
diff --git a/apps/desktop/src/shared/types.ts b/apps/desktop/src/shared/types.ts
index e4bd1caca..b3e7d1f9e 100644
--- a/apps/desktop/src/shared/types.ts
+++ b/apps/desktop/src/shared/types.ts
@@ -353,6 +353,10 @@ export type Config = {
   textInputShortcut?: "ctrl-t" | "ctrl-shift-t" | "alt-t" | "custom"
   customTextInputShortcut?: string
 
+  // Screenshot Configuration
+  alwaysIncludeScreenshot?: boolean
+  screenshotForVoiceCommands?: boolean // Auto-capture screenshot when using voice commands (MCP/agent mode)
+
   // Settings Window Hotkey Configuration
   settingsHotkeyEnabled?: boolean
   settingsHotkey?: "ctrl-shift-s" | "ctrl-comma" | "ctrl-shift-comma" | "custom"
@@ -468,6 +472,10 @@ export type Config = {
   streamStatusWatcherEnabled?: boolean
   streamStatusFilePath?: string
 
+  // Screenshot Display Configuration
+  // Store the preferred display ID for screenshot capture (empty = primary display)
+  screenshotDisplayId?: string
+
 }