diff --git a/apps/desktop/speakmcp-rs/Cargo.lock b/apps/desktop/speakmcp-rs/Cargo.lock index ce98e1740..7b037d323 100644 --- a/apps/desktop/speakmcp-rs/Cargo.lock +++ b/apps/desktop/speakmcp-rs/Cargo.lock @@ -431,7 +431,7 @@ dependencies = [ [[package]] name = "speakmcp-rs" -version = "1.0.0" +version = "1.1.0" dependencies = [ "enigo", "rdev", diff --git a/apps/desktop/src/main/config.ts b/apps/desktop/src/main/config.ts index 240520bd6..102bd3991 100644 --- a/apps/desktop/src/main/config.ts +++ b/apps/desktop/src/main/config.ts @@ -29,6 +29,8 @@ const getConfig = () => { mcpAutoPasteDelay: 1000, // 1 second delay by default mcpMaxIterations: 10, // Default max iterations for agent mode textInputEnabled: true, + alwaysIncludeScreenshot: false, + screenshotForVoiceCommands: false, // Auto-capture screenshot when using voice commands (MCP/agent mode) // Text input: On Windows, use Ctrl+Shift+T to avoid browser new tab conflict textInputShortcut: isWindows ? "ctrl-shift-t" : "ctrl-t", diff --git a/apps/desktop/src/main/context-budget.ts b/apps/desktop/src/main/context-budget.ts index 7c66e518a..0b72fb56c 100644 --- a/apps/desktop/src/main/context-budget.ts +++ b/apps/desktop/src/main/context-budget.ts @@ -4,7 +4,28 @@ import { makeTextCompletionWithFetch } from "./llm-fetch" import { constructMinimalSystemPrompt } from "./system-prompts" import { agentSessionStateManager } from "./state" -export type LLMMessage = { role: string; content: string } +export type LLMMessage = { role: string; content: string | any[] } + +// Helper function to get content length that handles both string and array +function getContentLength(content: string | any[] | undefined): number { + if (!content) return 0 + if (typeof content === 'string') return content.length + // For multimodal content, sum up text parts only + return content + .filter((part: any) => part.type === 'text') + .reduce((sum: number, part: any) => sum + (part.text?.length || 0), 0) +} + +// Helper function to get content as string for summarization +function getContentAsString(content: string | any[] | undefined): string { + if (!content) return '' + if (typeof content === 'string') return content + // For multimodal content, extract text parts only + return content + .filter((part: any) => part.type === 'text') + .map((part: any) => part.text || '') + .join('\n') +} // Simple in-memory cache for provider/model context windows const contextWindowCache = new Map() @@ -77,7 +98,7 @@ async function getMaxContextTokens(providerId: string, model: string): Promise sum + (m.content?.length || 0), 0) + const totalChars = messages.reduce((sum, m) => sum + getContentLength(m.content), 0) return Math.ceil(totalChars / 4) } @@ -196,14 +217,15 @@ export async function shrinkMessagesForLLM(opts: ShrinkOptions): Promise<{ messa const AGGRESSIVE_TRUNCATE_THRESHOLD = 5000 for (let i = 0; i < messages.length; i++) { const msg = messages[i] - if (msg.role === "user" && msg.content && msg.content.length > AGGRESSIVE_TRUNCATE_THRESHOLD) { + if (msg.role === "user" && getContentLength(msg.content) > AGGRESSIVE_TRUNCATE_THRESHOLD) { // Check if this looks like a tool result (contains JSON arrays/objects) - if (msg.content.includes('"url":') || msg.content.includes('"id":')) { + const contentStr = getContentAsString(msg.content) + if (contentStr.includes('"url":') || contentStr.includes('"id":')) { // Truncate aggressively and add note messages[i] = { ...msg, - content: msg.content.substring(0, AGGRESSIVE_TRUNCATE_THRESHOLD) + - '\n\n... (truncated ' + (msg.content.length - AGGRESSIVE_TRUNCATE_THRESHOLD) + + content: contentStr.substring(0, AGGRESSIVE_TRUNCATE_THRESHOLD) + + '\n\n... (truncated ' + (contentStr.length - AGGRESSIVE_TRUNCATE_THRESHOLD) + ' characters for context management. Key information preserved above.)' } applied.push("aggressive_truncate") @@ -218,7 +240,7 @@ export async function shrinkMessagesForLLM(opts: ShrinkOptions): Promise<{ messa // Tier 1: Summarize large messages (prefer tool outputs or very long entries) const indicesByLength = messages - .map((m, i) => ({ i, len: m.content?.length || 0, role: m.role, content: m.content })) + .map((m, i) => ({ i, len: getContentLength(m.content), role: m.role, content: m.content })) .filter((x) => x.len > summarizeThreshold && x.role !== "system") .sort((a, b) => b.len - a.len) @@ -234,7 +256,8 @@ export async function shrinkMessagesForLLM(opts: ShrinkOptions): Promise<{ messa // Emit progress update before summarization summarizedCount++ if (opts.onSummarizationProgress) { - const messagePreview = item.content!.substring(0, 100).replace(/\n/g, ' ') + const contentStr = getContentAsString(item.content) + const messagePreview = contentStr.substring(0, 100).replace(/\n/g, ' ') opts.onSummarizationProgress( summarizedCount, totalToSummarize, @@ -242,7 +265,7 @@ export async function shrinkMessagesForLLM(opts: ShrinkOptions): Promise<{ messa ) } - const summarized = await summarizeContent(item.content!, opts.sessionId) + const summarized = await summarizeContent(getContentAsString(item.content), opts.sessionId) messages[item.i] = { ...messages[item.i], content: summarized } applied.push("summarize") tokens = estimateTokensFromMessages(messages) diff --git a/apps/desktop/src/main/index.ts b/apps/desktop/src/main/index.ts index 617386b4a..3c886f670 100644 --- a/apps/desktop/src/main/index.ts +++ b/apps/desktop/src/main/index.ts @@ -1,4 +1,4 @@ -import { app, Menu } from "electron" +import { app, Menu, ipcMain, desktopCapturer } from "electron" import { electronApp, optimizer } from "@electron-toolkit/utils" import { createMainWindow, @@ -42,6 +42,54 @@ app.whenReady().then(() => { registerIpcMain(router) logApp("IPC main registered") + // Register desktopCapturer handler (available only in main process in Electron 31+) + ipcMain.handle('getScreenSources', async (_event, options: { types: ('screen' | 'window')[], thumbnailSize?: { width: number, height: number } }) => { + try { + // Validate and sanitize options + const validatedOptions = { + // Only allow 'screen' type for privacy - filter out 'window' + types: (options.types || ['screen']).filter(t => t === 'screen') as ('screen' | 'window')[], + thumbnailSize: { + // Clamp dimensions to reasonable bounds + width: Math.min(Math.max(options.thumbnailSize?.width || 1920, 100), 4096), + height: Math.min(Math.max(options.thumbnailSize?.height || 1080, 100), 4096) + } + } + + // Ensure at least 'screen' type is present + if (validatedOptions.types.length === 0) { + validatedOptions.types = ['screen'] + } + + logApp('[getScreenSources] Capturing screen sources with validated options:', JSON.stringify(validatedOptions)) + const sources = await desktopCapturer.getSources(validatedOptions) + logApp(`[getScreenSources] Got ${sources.length} sources`) + + // On macOS, if Screen Recording permission is not granted, desktopCapturer returns an empty array + // This is a silent failure - no error is thrown + if (sources.length === 0 && process.platform === 'darwin') { + throw new Error('No screen sources available. Please grant Screen Recording permission in System Settings > Privacy & Security > Screen Recording, then restart the app.') + } + + // Serialize the sources - NativeImage thumbnail needs to be converted + const serialized = sources.map(source => { + const thumbnailDataUrl = source.thumbnail.toDataURL() + logApp(`[getScreenSources] Source: ${source.name}, thumbnail size: ${thumbnailDataUrl.length} chars`) + return { + id: source.id, + name: source.name, + thumbnail: thumbnailDataUrl, + display_id: source.display_id, + appIcon: source.appIcon ? source.appIcon.toDataURL() : null + } + }) + return serialized + } catch (error) { + console.error('Failed to get screen sources:', error) + throw error + } + }) + registerServeProtocol() try { diff --git a/apps/desktop/src/main/llm-fetch.ts b/apps/desktop/src/main/llm-fetch.ts index 518f6e6cb..0c6603a39 100644 --- a/apps/desktop/src/main/llm-fetch.ts +++ b/apps/desktop/src/main/llm-fetch.ts @@ -5,6 +5,38 @@ import { isDebugLLM, logLLM } from "./debug" import { state, llmRequestAbortManager, agentSessionStateManager } from "./state" import OpenAI from "openai" +/** + * Helper function to get a string preview from content that may be a string or multimodal array + */ +function getContentPreview(content: string | any[] | undefined, maxLength: number = 100): string { + if (!content) return "(empty)" + if (typeof content === "string") { + return content.length > maxLength ? content.substring(0, maxLength) + "..." : content + } + // It's an array (multimodal content) + const textParts = content + .filter((part: any) => part.type === "text") + .map((part: any) => part.text) + .join(" ") + const hasImage = content.some((part: any) => part.type === "image_url") + const preview = textParts.length > maxLength ? textParts.substring(0, maxLength) + "..." : textParts + return hasImage ? `[image] ${preview}` : preview +} + +/** + * Helper function to get content length from content that may be a string or multimodal array + */ +function getContentLength(content: string | any[] | undefined): number { + if (!content) return 0 + if (typeof content === "string") { + return content.length + } + // It's an array (multimodal content) - sum up text lengths + return content + .filter((part: any) => part.type === "text") + .reduce((sum: number, part: any) => sum + (part.text?.length || 0), 0) +} + /** * Callback for reporting retry progress to the UI */ @@ -658,23 +690,22 @@ async function makeAPICallAttempt( messagesCount: requestBody.messages.length, responseFormat: requestBody.response_format, estimatedTokens, - totalPromptLength: (requestBody.messages as Array<{ role: string; content: string }>).reduce( - (sum: number, msg: { role: string; content: string }) => sum + ((msg.content?.length) || 0), + totalPromptLength: (requestBody.messages as Array<{ role: string; content: string | any[] }>).reduce( + (sum: number, msg: { role: string; content: string | any[] }) => sum + getContentLength(msg.content), 0, ), contextWarning: estimatedTokens > 8000 ? "WARNING: High token count, may exceed context limit" : null }) logLLM("Request Body (truncated)", { ...requestBody, - messages: (requestBody.messages as Array<{ role: string; content: string }>).map( - (msg: { role: string; content: string }) => ({ + messages: (requestBody.messages as Array<{ role: string; content: string | any[] }>).map( + (msg: { role: string; content: string | any[] }) => ({ role: msg.role, - content: msg.content.length > 200 - ? msg.content.substring(0, 200) + "... [" + msg.content.length + " chars]" - : msg.content, + content: getContentPreview(msg.content, 200), }), ) }) + } // Create abort controller and register it so emergency stop can cancel @@ -841,7 +872,7 @@ async function makeAPICallAttempt( * Make a fetch-based LLM call for OpenAI-compatible APIs with structured output fallback */ async function makeOpenAICompatibleCall( - messages: Array<{ role: string; content: string }>, + messages: Array<{ role: string; content: string | any[] }>, providerId: string, useStructuredOutput: boolean = true, sessionId?: string, @@ -861,7 +892,10 @@ async function makeOpenAICompatibleCall( } const model = getModel(providerId, "mcp") - const estimatedTokens = Math.ceil(messages.reduce((sum, msg) => sum + msg.content.length, 0) / 4) + const estimatedTokens = Math.ceil(messages.reduce((sum, msg) => { + const contentLength = typeof msg.content === 'string' ? msg.content.length : JSON.stringify(msg.content).length + return sum + contentLength + }, 0) / 4) const baseRequestBody = { model, @@ -1016,7 +1050,7 @@ async function makeOpenAICompatibleCall( * Make a fetch-based LLM call for Gemini API */ async function makeGeminiCall( - messages: Array<{ role: string; content: string }>, + messages: Array<{ role: string; content: string | any[] }>, sessionId?: string, onRetryProgress?: RetryProgressCallback, ): Promise { @@ -1030,8 +1064,30 @@ async function makeGeminiCall( const baseURL = config.geminiBaseUrl || "https://generativelanguage.googleapis.com" + // Helper to extract text from multimodal content without embedding full image data + const extractTextFromContent = (content: string | any[]): string => { + if (typeof content === 'string') { + return content + } + if (Array.isArray(content)) { + return content.map(part => { + if (part.type === 'text') { + return part.text || '' + } + if (part.type === 'image_url') { + return '[image attached]' + } + return '' + }).filter(Boolean).join(' ') + } + return String(content) + } + // Convert messages to Gemini format - const prompt = messages.map((m) => `${m.role}: ${m.content}`).join("\n\n") + const prompt = messages.map((m) => { + const content = extractTextFromContent(m.content) + return `${m.role}: ${content}` + }).join("\n\n") return apiCallWithRetry(async () => { if (isDebugLLM()) { @@ -1144,7 +1200,7 @@ async function makeGeminiCall( * This is wrapped by makeLLMCallWithFetch with retry logic */ async function makeLLMCallAttempt( - messages: Array<{ role: string; content: string }>, + messages: Array<{ role: string; content: string | any[] }>, chatProviderId: string, onRetryProgress?: RetryProgressCallback, sessionId?: string, @@ -1153,7 +1209,7 @@ async function makeLLMCallAttempt( logLLM("🚀 Starting LLM call attempt", { provider: chatProviderId, messagesCount: messages.length, - lastMessagePreview: messages[messages.length - 1]?.content?.substring(0, 100) + "..." + lastMessagePreview: getContentPreview(messages[messages.length - 1]?.content, 100) }) } @@ -1323,7 +1379,7 @@ async function makeLLMCallAttempt( * Main function to make LLM calls using fetch with automatic retry on empty responses */ export async function makeLLMCallWithFetch( - messages: Array<{ role: string; content: string }>, + messages: Array<{ role: string; content: string | any[] }>, providerId?: string, onRetryProgress?: RetryProgressCallback, sessionId?: string, diff --git a/apps/desktop/src/main/llm.ts b/apps/desktop/src/main/llm.ts index e61eec6ae..5fc88be89 100644 --- a/apps/desktop/src/main/llm.ts +++ b/apps/desktop/src/main/llm.ts @@ -561,6 +561,7 @@ export async function processTranscriptWithAgentMode( conversationId?: string, // Conversation ID for linking to conversation history sessionId?: string, // Session ID for progress routing and isolation onProgress?: (update: AgentProgressUpdate) => void, // Optional callback for external progress consumers (e.g., SSE) + screenshot?: string, // Optional screenshot data URL for multimodal input ): Promise { const config = configStore.get() @@ -778,9 +779,15 @@ export async function processTranscriptWithAgentMode( toolCalls?: MCPToolCall[] toolResults?: MCPToolResult[] timestamp?: number + screenshot?: string }> = [ ...(previousConversationHistory || []), - { role: "user", content: transcript, timestamp: Date.now() }, + { + role: "user", + content: transcript, + timestamp: Date.now(), + screenshot: screenshot + }, ] logLLM(`[llm.ts processTranscriptWithAgentMode] conversationHistory initialized with ${conversationHistory.length} messages, roles: [${conversationHistory.map(m => m.role).join(', ')}]`) @@ -881,7 +888,7 @@ export async function processTranscriptWithAgentMode( // Helper to map conversation history to LLM messages format (filters empty content) const mapConversationToMessages = ( addSummaryPrompt: boolean = false - ): Array<{ role: "user" | "assistant"; content: string }> => { + ): Array<{ role: "user" | "assistant"; content: string | any[] }> => { const mapped = conversationHistory .map((entry) => { if (entry.role === "tool") { @@ -891,9 +898,21 @@ export async function processTranscriptWithAgentMode( } const content = (entry.content || "").trim() if (!content) return null + + // Handle multimodal content (text + screenshot) + if (entry.screenshot) { + return { + role: entry.role as "user" | "assistant", + content: [ + { type: "text", text: entry.content }, + { type: "image_url", image_url: { url: entry.screenshot, detail: "high" } } + ] + } + } + return { role: entry.role as "user" | "assistant", content } }) - .filter(Boolean) as Array<{ role: "user" | "assistant"; content: string }> + .filter(Boolean) as Array<{ role: "user" | "assistant"; content: string | any[] }> // Add summary prompt if last message is from assistant (ensures LLM has something to respond to) if (addSummaryPrompt && mapped.length > 0 && mapped[mapped.length - 1].role === "assistant") { @@ -1178,8 +1197,8 @@ Always use actual resource IDs from the conversation history or create new ones // For assistant messages, ensure non-empty content // Anthropic API requires all messages to have non-empty content // except for the optional final assistant message - let content = entry.content - if (entry.role === "assistant" && !content?.trim()) { + let content: string | any[] = entry.content + if (entry.role === "assistant" && !entry.content?.trim()) { // If assistant message has tool calls but no content, describe the tool calls if (entry.toolCalls && entry.toolCalls.length > 0) { const toolNames = entry.toolCalls.map(tc => tc.name).join(", ") @@ -1189,6 +1208,18 @@ Always use actual resource IDs from the conversation history or create new ones content = "[Processing...]" } } + + // Handle multimodal content (text + screenshot) + if (entry.screenshot) { + return { + role: entry.role as "user" | "assistant", + content: [ + { type: "text", text: entry.content }, + { type: "image_url", image_url: { url: entry.screenshot, detail: "high" } } + ] + } + } + return { role: entry.role as "user" | "assistant", content, diff --git a/apps/desktop/src/main/renderer-handlers.ts b/apps/desktop/src/main/renderer-handlers.ts index 0243f1618..1c1b606b2 100644 --- a/apps/desktop/src/main/renderer-handlers.ts +++ b/apps/desktop/src/main/renderer-handlers.ts @@ -9,9 +9,9 @@ export type RendererHandlers = { startOrFinishRecording: (data?: { fromButtonClick?: boolean }) => void refreshRecordingHistory: () => void - startMcpRecording: (data?: { conversationId?: string; sessionId?: string; fromTile?: boolean; fromButtonClick?: boolean }) => void + startMcpRecording: (data?: { conversationId?: string; sessionId?: string; fromTile?: boolean; fromButtonClick?: boolean; screenshot?: string }) => void finishMcpRecording: () => void - startOrFinishMcpRecording: (data?: { conversationId?: string; sessionId?: string; fromTile?: boolean; fromButtonClick?: boolean }) => void + startOrFinishMcpRecording: (data?: { conversationId?: string; sessionId?: string; fromTile?: boolean; fromButtonClick?: boolean; screenshot?: string }) => void showTextInput: () => void hideTextInput: () => void diff --git a/apps/desktop/src/main/tipc.ts b/apps/desktop/src/main/tipc.ts index 6e37828b6..df33e774e 100644 --- a/apps/desktop/src/main/tipc.ts +++ b/apps/desktop/src/main/tipc.ts @@ -149,9 +149,21 @@ async function processWithAgentMode( conversationId?: string, existingSessionId?: string, // Optional: reuse existing session instead of creating new one startSnoozed: boolean = false, // Whether to start session snoozed (default: false to show panel) + screenshot?: string, // Optional screenshot data URL for multimodal input ): Promise { const config = configStore.get() + // Validate screenshot if provided + if (screenshot) { + if (!screenshot.startsWith('data:image/')) { + throw new Error('Invalid screenshot format: must be a data URL starting with data:image/') + } + const sizeInMB = (screenshot.length * 0.75) / (1024 * 1024) + if (sizeInMB > 10) { + throw new Error(`Screenshot too large: ${sizeInMB.toFixed(1)}MB (maximum 10MB)`) + } + } + // NOTE: Don't clear all agent progress here - we support multiple concurrent sessions // Each session manages its own progress lifecycle independently @@ -295,6 +307,8 @@ async function processWithAgentMode( previousConversationHistory, conversationId, // Pass conversation ID for linking to conversation history sessionId, // Pass session ID for progress routing and isolation + undefined, // onProgress callback (not used here, progress is emitted internally) + screenshot, // Pass screenshot data for multimodal input ) // Mark session as completed @@ -866,13 +880,14 @@ export const router = { }), triggerMcpRecording: t.procedure - .input<{ conversationId?: string; sessionId?: string; fromTile?: boolean }>() + .input<{ conversationId?: string; sessionId?: string; fromTile?: boolean; screenshot?: string }>() .action(async ({ input }) => { const { showPanelWindowAndStartMcpRecording } = await import("./window") // Always show the panel during recording for waveform feedback // The fromTile flag tells the panel to hide after recording ends // fromButtonClick=true indicates this was triggered via UI button (not keyboard shortcut) - await showPanelWindowAndStartMcpRecording(input.conversationId, input.sessionId, input.fromTile, true) + // screenshot is passed through to the renderer for multimodal input + await showPanelWindowAndStartMcpRecording(input.conversationId, input.sessionId, input.fromTile, true, input.screenshot) }), showMainWindow: t.procedure @@ -1019,6 +1034,7 @@ export const router = { createTextInput: t.procedure .input<{ text: string + screenshot?: string }>() .action(async ({ input }) => { const config = configStore.get() @@ -1073,6 +1089,7 @@ export const router = { text: string conversationId?: string fromTile?: boolean // When true, session runs in background (snoozed) - panel won't show + screenshot?: string // Optional screenshot data URL for multimodal input }>() .action(async ({ input }) => { const config = configStore.get() @@ -1128,7 +1145,7 @@ export const router = { // This allows multiple sessions to run concurrently // Pass existingSessionId to reuse the session if found // When fromTile=true, start snoozed so the floating panel doesn't appear - processWithAgentMode(input.text, conversationId, existingSessionId, input.fromTile ?? false) + processWithAgentMode(input.text, conversationId, existingSessionId, input.fromTile ?? false, input.screenshot) .then((finalResponse) => { // Save to history after completion const history = getRecordingHistory() @@ -1161,7 +1178,7 @@ export const router = { } }) .catch((error) => { - logLLM("[createMcpTextInput] Agent processing error:", error) + logApp("[createMcpTextInput] Agent processing error:", error) }) .finally(() => { // Process queued messages after this session completes (success or error) @@ -1182,6 +1199,7 @@ export const router = { conversationId?: string sessionId?: string fromTile?: boolean // When true, session runs in background (snoozed) - panel won't show + screenshot?: string // Optional screenshot data URL for multimodal input }>() .action(async ({ input }) => { fs.mkdirSync(recordingsFolder, { recursive: true }) @@ -1359,7 +1377,8 @@ export const router = { // Fire-and-forget: Start agent processing without blocking // This allows multiple sessions to run concurrently // Pass the sessionId to avoid creating a duplicate session - processWithAgentMode(transcript, conversationId, sessionId) + // Pass startSnoozed for tile behavior and screenshot for multimodal input + processWithAgentMode(transcript, conversationId, sessionId, startSnoozed, input.screenshot) .then((finalResponse) => { // Save to history after completion const history = getRecordingHistory() @@ -2063,6 +2082,19 @@ export const router = { await shell.openPath(conversationsFolder) }), + // Display/Screen endpoints + getAvailableDisplays: t.procedure.action(async () => { + const { screen } = await import('electron') + const displays = screen.getAllDisplays() + const primaryDisplay = screen.getPrimaryDisplay() + return displays.map(d => ({ + id: d.id.toString(), + label: d.label || `Display ${d.id}`, + bounds: d.bounds, + isPrimary: d.id === primaryDisplay.id + })) + }), + // Panel resize endpoints getPanelSize: t.procedure.action(async () => { const win = WINDOWS.get("panel") diff --git a/apps/desktop/src/main/window.ts b/apps/desktop/src/main/window.ts index 0961e4285..31eabc1f3 100644 --- a/apps/desktop/src/main/window.ts +++ b/apps/desktop/src/main/window.ts @@ -4,6 +4,7 @@ import { shell, screen, app, + desktopCapturer, } from "electron" import path from "path" import { getRendererHandlers } from "@egoist/tipc/main" @@ -17,6 +18,45 @@ import { setupConsoleLogger } from "./console-logger" type WINDOW_ID = "main" | "panel" | "setup" +/** + * Capture a screenshot from the configured display (or primary display) + * Returns the screenshot as a data URL, or undefined if capture fails + */ +export async function captureScreenshotFromMain(): Promise { + try { + const config = configStore.get() + const sources = await desktopCapturer.getSources({ + types: ['screen'], + thumbnailSize: { width: 1920, height: 1080 } + }) + + if (sources.length === 0) { + if (process.platform === 'darwin') { + throw new Error('No screen sources available. Please grant Screen Recording permission in System Settings > Privacy & Security > Screen Recording, then restart the app.') + } else { + throw new Error('No screen sources available') + } + } + + // Find the source matching the configured display, or use the first one (primary) + const configuredDisplayId = config.screenshotDisplayId + let source = sources[0] + if (configuredDisplayId) { + const matchingSource = sources.find(s => s.display_id === configuredDisplayId) + if (matchingSource) { + source = matchingSource + } + } + + const screenshot = source.thumbnail.toDataURL() + logApp(`[captureScreenshotFromMain] Captured screenshot from display: ${source.display_id}, size: ${screenshot.length} chars`) + return screenshot + } catch (error) { + logApp('[captureScreenshotFromMain] Failed to capture screenshot:', error) + throw error + } +} + export const WINDOWS = new Map() @@ -433,7 +473,7 @@ export async function showPanelWindowAndStartRecording(fromButtonClick?: boolean getWindowRendererHandlers("panel")?.startRecording.send({ fromButtonClick }) } -export async function showPanelWindowAndStartMcpRecording(conversationId?: string, sessionId?: string, fromTile?: boolean, fromButtonClick?: boolean) { +export async function showPanelWindowAndStartMcpRecording(conversationId?: string, sessionId?: string, fromTile?: boolean, fromButtonClick?: boolean, screenshot?: string) { // Capture focus before showing panel try { const focusedApp = await getFocusedAppInfo() @@ -446,11 +486,25 @@ export async function showPanelWindowAndStartMcpRecording(conversationId?: strin state.isRecordingFromButtonClick = fromButtonClick ?? false state.isRecordingMcpMode = true + // Auto-capture screenshot if enabled for voice commands and no screenshot was explicitly passed + let effectiveScreenshot = screenshot + if (!effectiveScreenshot) { + const config = configStore.get() + if (config.screenshotForVoiceCommands) { + try { + effectiveScreenshot = await captureScreenshotFromMain() + } catch (error) { + // Log but continue - recording can proceed without screenshot + logApp('[showPanelWindowAndStartMcpRecording] Screenshot capture failed:', error) + } + } + } + // Ensure consistent sizing by setting mode in main before showing setPanelMode("normal") showPanelWindow() - // Pass fromTile and fromButtonClick flags so panel knows how to behave after recording ends - getWindowRendererHandlers("panel")?.startMcpRecording.send({ conversationId, sessionId, fromTile, fromButtonClick }) + // Pass fromTile, fromButtonClick, and screenshot flags so panel knows how to behave after recording ends + getWindowRendererHandlers("panel")?.startMcpRecording.send({ conversationId, sessionId, fromTile, fromButtonClick, screenshot: effectiveScreenshot }) } export async function showPanelWindowAndShowTextInput() { diff --git a/apps/desktop/src/preload/index.d.ts b/apps/desktop/src/preload/index.d.ts index 8b2bd139d..b4d6400b3 100644 --- a/apps/desktop/src/preload/index.d.ts +++ b/apps/desktop/src/preload/index.d.ts @@ -1,5 +1,13 @@ import { ElectronAPI } from "@electron-toolkit/preload" +interface ScreenSource { + id: string + name: string + thumbnail: string // Data URL + display_id: string + appIcon: string | null // Data URL or null +} + declare global { interface Window { electron: ElectronAPI @@ -9,6 +17,7 @@ declare global { getOAuthStatus: (serverName: string) => Promise<{ configured: boolean; authenticated: boolean; tokenExpiry?: number; error?: string }> revokeOAuthTokens: (serverName: string) => Promise<{ success: boolean; error?: string }> testMCPServer: (serverName: string, config: any) => Promise<{ success: boolean; error?: string }> + getScreenSources: (options: { types: string[], thumbnailSize?: { width: number, height: number } }) => Promise } } } diff --git a/apps/desktop/src/preload/index.ts b/apps/desktop/src/preload/index.ts index 2757b1112..026c31b7d 100644 --- a/apps/desktop/src/preload/index.ts +++ b/apps/desktop/src/preload/index.ts @@ -10,6 +10,9 @@ const api = { getOAuthStatus: (serverName: string) => ipcRenderer.invoke('getOAuthStatus', serverName), revokeOAuthTokens: (serverName: string) => ipcRenderer.invoke('revokeOAuthTokens', serverName), testMCPServer: (serverName: string, config: any) => ipcRenderer.invoke('testMCPServer', { serverName, config }), + // Screenshot API - uses IPC to main process (desktopCapturer is only available in main process in Electron 31+) + getScreenSources: (options: { types: string[], thumbnailSize?: { width: number, height: number } }) => + ipcRenderer.invoke('getScreenSources', options) } if (process.contextIsolated) { diff --git a/apps/desktop/src/renderer/src/components/text-input-panel.tsx b/apps/desktop/src/renderer/src/components/text-input-panel.tsx index 8fb72b989..854cc2b04 100644 --- a/apps/desktop/src/renderer/src/components/text-input-panel.tsx +++ b/apps/desktop/src/renderer/src/components/text-input-panel.tsx @@ -4,9 +4,19 @@ import { cn } from "@renderer/lib/utils" import { AgentProcessingView } from "./agent-processing-view" import { AgentProgressUpdate } from "../../../shared/types" import { useTheme } from "@renderer/contexts/theme-context" +import { Camera, Eye } from "lucide-react" +import { useConfigQuery } from "@renderer/lib/query-client" +import { + Dialog, + DialogContent, + DialogHeader, + DialogTitle, + DialogDescription, +} from "./ui/dialog" +import { Button } from "./ui/button" interface TextInputPanelProps { - onSubmit: (text: string) => void + onSubmit: (text: string, screenshot?: string) => void onCancel: () => void isProcessing?: boolean agentProgress?: AgentProgressUpdate | null @@ -22,10 +32,26 @@ export const TextInputPanel = forwardRef isProcessing = false, agentProgress, }, ref) => { + const configQuery = useConfigQuery() + const alwaysIncludeScreenshot = configQuery.data?.alwaysIncludeScreenshot ?? false + const [text, setText] = useState("") + const [includeScreenshot, setIncludeScreenshot] = useState(alwaysIncludeScreenshot) + const [screenshot, setScreenshot] = useState(null) + const [isCapturingScreenshot, setIsCapturingScreenshot] = useState(false) + const [screenshotError, setScreenshotError] = useState(null) + const [previewOpen, setPreviewOpen] = useState(false) + const [previewImage, setPreviewImage] = useState(null) + const [previewImageInfo, setPreviewImageInfo] = useState<{ width: number; height: number; size: string } | null>(null) const textareaRef = useRef(null) + const captureWantedRef = useRef(false) const { isDark } = useTheme() + // Sync includeScreenshot state when config loads or alwaysIncludeScreenshot setting changes + useEffect(() => { + setIncludeScreenshot(alwaysIncludeScreenshot) + }, [alwaysIncludeScreenshot]) + useImperativeHandle(ref, () => ({ focus: () => { textareaRef.current?.focus() @@ -52,13 +78,147 @@ export const TextInputPanel = forwardRef return undefined }, [isProcessing]) + // Helper to find the correct source based on configured display ID + const findSourceByDisplayId = (sources: Array<{ id: string, name: string, thumbnail: string, display_id: string }>, configuredDisplayId: string | undefined) => { + if (!configuredDisplayId || configuredDisplayId === '') { + // No configured display ID, use first source (primary display) + return sources[0] + } + // Find source matching the configured display_id + const matchingSource = sources.find(s => s.display_id === configuredDisplayId) + if (matchingSource) { + console.log('[TextInputPanel] Found matching source for display_id:', configuredDisplayId) + return matchingSource + } + // Fall back to first source if configured display not found + console.log('[TextInputPanel] Configured display_id not found, falling back to primary:', configuredDisplayId) + return sources[0] + } + + const captureScreenshot = async () => { + console.log('[TextInputPanel] captureScreenshot called') + setIsCapturingScreenshot(true) + setScreenshotError(null) + try { + // Use IPC to get screen sources from main process (desktopCapturer is only available in main process in Electron 31+) + console.log('[TextInputPanel] Calling getScreenSources...') + const sources = await (window as any).electronAPI.getScreenSources({ + types: ['screen'], + thumbnailSize: { width: 1920, height: 1080 } + }) + console.log('[TextInputPanel] Got sources:', sources?.length || 0) + + // Check if screenshot is still wanted after async operation completes + if (!captureWantedRef.current) { + console.log('[TextInputPanel] Screenshot no longer wanted, discarding') + return + } + + if (sources && sources.length > 0) { + // Get the source matching the configured display, or fallback to primary + const configuredDisplayId = configQuery.data?.screenshotDisplayId + const source = findSourceByDisplayId(sources, configuredDisplayId) + const screenshot = source.thumbnail + console.log('[TextInputPanel] Screenshot captured from display_id:', source.display_id, 'length:', screenshot?.length || 0) + setScreenshot(screenshot) + } else { + console.log('[TextInputPanel] No sources returned') + } + } catch (error: any) { + console.error('[TextInputPanel] Failed to capture screenshot:', error) + // Show the actual error message if available (e.g., permission error on macOS) + const errorMessage = error?.message || 'Failed to capture screenshot' + setScreenshotError(errorMessage) + setIncludeScreenshot(false) + } finally { + setIsCapturingScreenshot(false) + } + } + + const handlePreviewScreenshot = async () => { + try { + let imageToPreview: string + + // Use existing screenshot if available, otherwise capture a fresh one + if (screenshot) { + // Use the existing screenshot state - this is what will actually be sent + imageToPreview = screenshot + } else { + // No screenshot exists yet, capture one and update state so preview matches what will be sent + const sources = await (window as any).electronAPI.getScreenSources({ + types: ['screen'], + thumbnailSize: { width: 1920, height: 1080 } + }) + + if (!sources || sources.length === 0) { + console.error('[TextInputPanel] No sources available for preview') + return + } + + // Get the source matching the configured display, or fallback to primary + const configuredDisplayId = configQuery.data?.screenshotDisplayId + const source = findSourceByDisplayId(sources, configuredDisplayId) + imageToPreview = source.thumbnail as string + + // Update the screenshot state so it matches what we're previewing + setScreenshot(imageToPreview) + } + + setPreviewImage(imageToPreview) + + // Calculate image info + const img = new Image() + img.onload = () => { + // Calculate approximate size of base64 data + const base64Length = imageToPreview.length - (imageToPreview.indexOf(',') + 1) + const sizeInBytes = Math.ceil(base64Length * 0.75) + const sizeInKB = (sizeInBytes / 1024).toFixed(1) + const sizeStr = sizeInBytes > 1024 * 1024 + ? `${(sizeInBytes / (1024 * 1024)).toFixed(2)} MB` + : `${sizeInKB} KB` + + setPreviewImageInfo({ + width: img.naturalWidth, + height: img.naturalHeight, + size: sizeStr + }) + } + img.src = imageToPreview + + setPreviewOpen(true) + } catch (error) { + console.error('[TextInputPanel] Failed to capture preview screenshot:', error) + } + } + const handleSubmit = () => { if (text.trim() && !isProcessing) { - onSubmit(text.trim()) + // Only include screenshot if the checkbox is still checked + const screenshotToSend = includeScreenshot && screenshot ? screenshot : undefined + console.log('[TextInputPanel] handleSubmit called, screenshot:', screenshotToSend ? `${screenshotToSend.length} chars` : 'none') + onSubmit(text.trim(), screenshotToSend) setText("") + setScreenshot(null) + setIncludeScreenshot(alwaysIncludeScreenshot) } } + // Capture screenshot when checkbox is toggled on, clear when toggled off + useEffect(() => { + if (includeScreenshot) { + captureWantedRef.current = true + if (!screenshot) { + captureScreenshot() + } + } else { + captureWantedRef.current = false + if (screenshot) { + // Clear screenshot when user unchecks the box + setScreenshot(null) + } + } + }, [includeScreenshot, screenshot]) + const handleKeyDown = (e: React.KeyboardEvent) => { const isModifierPressed = e.metaKey || e.ctrlKey; @@ -134,6 +294,43 @@ export const TextInputPanel = forwardRef disabled={isProcessing} aria-label="Message input" /> + + {/* Screenshot option */} +
+ + {includeScreenshot && ( + + )} + {isCapturingScreenshot && ( + Capturing... + )} + {screenshot && !isCapturingScreenshot && ( + ✓ Screenshot captured + )} + {screenshotError && !isCapturingScreenshot && ( + + ✗ {screenshotError.includes('Screen Recording') ? 'Screen Recording permission required' : screenshotError} + + )} +
)} @@ -167,6 +364,32 @@ export const TextInputPanel = forwardRef + + {/* Screenshot Preview Dialog */} + + + + Screenshot Preview + + This is what will be sent with your message + {previewImageInfo && ( + + ({previewImageInfo.width} × {previewImageInfo.height}, {previewImageInfo.size}) + + )} + + +
+ {previewImage && ( + Screenshot preview + )} +
+
+
) }) diff --git a/apps/desktop/src/renderer/src/pages/panel.tsx b/apps/desktop/src/renderer/src/pages/panel.tsx index acbf9dbf1..f156972f3 100644 --- a/apps/desktop/src/renderer/src/pages/panel.tsx +++ b/apps/desktop/src/renderer/src/pages/panel.tsx @@ -38,6 +38,7 @@ export function Component() { const mcpConversationIdRef = useRef(undefined) const mcpSessionIdRef = useRef(undefined) const fromTileRef = useRef(false) + const mcpScreenshotRef = useRef(undefined) const [fromButtonClick, setFromButtonClick] = useState(false) const { isDark } = useTheme() const lastRequestedModeRef = useRef<"normal" | "agent" | "textInput">("normal") @@ -225,11 +226,13 @@ export function Component() { const conversationIdForMcp = mcpConversationIdRef.current ?? currentConversationId const sessionIdForMcp = mcpSessionIdRef.current const wasFromTile = fromTileRef.current + const screenshotForMcp = mcpScreenshotRef.current // Clear the refs after capturing to avoid reusing stale IDs mcpConversationIdRef.current = undefined mcpSessionIdRef.current = undefined fromTileRef.current = false + mcpScreenshotRef.current = undefined // If recording was from a tile, hide the floating panel immediately // The session will continue in the tile view @@ -251,6 +254,8 @@ export function Component() { sessionId: sessionIdForMcp, // Pass fromTile so session starts snoozed when recording was from a tile fromTile: wasFromTile, + // Pass screenshot for multimodal input if provided + screenshot: screenshotForMcp, }) // NOTE: Do NOT call continueConversation here! @@ -281,8 +286,8 @@ export function Component() { }) const textInputMutation = useMutation({ - mutationFn: async ({ text }: { text: string }) => { - await tipcClient.createTextInput({ text }) + mutationFn: async ({ text, screenshot }: { text: string; screenshot?: string }) => { + await tipcClient.createTextInput({ text, screenshot }) }, onError(error) { setShowTextInput(false) @@ -307,11 +312,13 @@ export function Component() { mutationFn: async ({ text, conversationId, + screenshot, }: { text: string conversationId?: string + screenshot?: string }) => { - const result = await tipcClient.createMcpTextInput({ text, conversationId }) + const result = await tipcClient.createMcpTextInput({ text, conversationId, screenshot }) // NOTE: Do NOT call continueConversation here! // The currentConversationId should only be set through explicit user actions @@ -506,7 +513,7 @@ export function Component() { return unlisten }, []) - const handleTextSubmit = async (text: string) => { + const handleTextSubmit = async (text: string, screenshot?: string) => { // Capture the conversation ID at submit time - if user explicitly continued a conversation // from history, currentConversationId will be set. Otherwise it's null for new inputs. const conversationIdForMcp = currentConversationId @@ -523,13 +530,14 @@ export function Component() { // Ensure main process no longer treats panel as textInput mode tipcClient.clearTextInputState({}) - // Always use MCP processing + // Always use MCP processing with optional screenshot mcpTextInputMutation.mutate({ text, // Pass currentConversationId if user explicitly continued from history, // otherwise undefined to create a fresh conversation. // This prevents message leaking while still supporting explicit continuation. conversationId: conversationIdForMcp ?? undefined, + screenshot, }) } @@ -538,10 +546,11 @@ export function Component() { // MCP handlers useEffect(() => { const unlisten = rendererHandlers.startMcpRecording.listen((data) => { - // Store the conversationId, sessionId, and fromTile flag for use when recording ends + // Store the conversationId, sessionId, fromTile flag, and screenshot for use when recording ends mcpConversationIdRef.current = data?.conversationId mcpSessionIdRef.current = data?.sessionId fromTileRef.current = data?.fromTile ?? false + mcpScreenshotRef.current = data?.screenshot // Track if recording was triggered via UI button click vs keyboard shortcut // When true, we show "Enter" as the submit hint instead of "Release keys" setFromButtonClick(data?.fromButtonClick ?? false) @@ -581,9 +590,10 @@ export function Component() { isConfirmedRef.current = true recorderRef.current?.stopRecording() } else { - // Store the conversationId and sessionId for use when recording ends + // Store the conversationId, sessionId, and screenshot for use when recording ends mcpConversationIdRef.current = data?.conversationId mcpSessionIdRef.current = data?.sessionId + mcpScreenshotRef.current = data?.screenshot // Track if recording was triggered via UI button click vs keyboard shortcut setFromButtonClick(data?.fromButtonClick ?? false) setMcpMode(true) diff --git a/apps/desktop/src/renderer/src/pages/settings-general.tsx b/apps/desktop/src/renderer/src/pages/settings-general.tsx index a6159b7e6..8bdaa51dc 100644 --- a/apps/desktop/src/renderer/src/pages/settings-general.tsx +++ b/apps/desktop/src/renderer/src/pages/settings-general.tsx @@ -104,6 +104,22 @@ export function Component() { const shortcut = (configQuery.data as any)?.shortcut || "hold-ctrl" const textInputShortcut = (configQuery.data as any)?.textInputShortcut || "ctrl-t" + // State for available displays + const [displays, setDisplays] = useState>([]) + + // Fetch available displays + useEffect(() => { + tipcClient.getAvailableDisplays().then((result: typeof displays) => { + setDisplays(result || []) + }).catch((err: unknown) => { + console.error('Failed to get displays:', err) + }) + }, []) if (!configQuery.data) return null @@ -353,6 +369,19 @@ export function Component() { placeholder="Click to record custom text input shortcut" /> )} + +
+ { + saveConfig({ + alwaysIncludeScreenshot: checked, + }) + }} + disabled={!configQuery.data?.textInputEnabled} + /> + Always include screenshot with messages +
@@ -729,6 +758,39 @@ export function Component() { + {/* Screenshot Settings */} + + } className="px-3"> + + + + } className="px-3"> + saveConfig({ screenshotForVoiceCommands: value })} + /> + + + {/* Agent Settings */} } className="px-3"> diff --git a/apps/desktop/src/shared/types.ts b/apps/desktop/src/shared/types.ts index e4bd1caca..b3e7d1f9e 100644 --- a/apps/desktop/src/shared/types.ts +++ b/apps/desktop/src/shared/types.ts @@ -353,6 +353,10 @@ export type Config = { textInputShortcut?: "ctrl-t" | "ctrl-shift-t" | "alt-t" | "custom" customTextInputShortcut?: string + // Screenshot Configuration + alwaysIncludeScreenshot?: boolean + screenshotForVoiceCommands?: boolean // Auto-capture screenshot when using voice commands (MCP/agent mode) + // Settings Window Hotkey Configuration settingsHotkeyEnabled?: boolean settingsHotkey?: "ctrl-shift-s" | "ctrl-comma" | "ctrl-shift-comma" | "custom" @@ -468,6 +472,10 @@ export type Config = { streamStatusWatcherEnabled?: boolean streamStatusFilePath?: string + // Screenshot Display Configuration + // Store the preferred display ID for screenshot capture (empty = primary display) + screenshotDisplayId?: string + }