Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/desktop/speakmcp-rs/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions apps/desktop/src/main/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ const getConfig = () => {
mcpAutoPasteDelay: 1000, // 1 second delay by default
mcpMaxIterations: 10, // Default max iterations for agent mode
textInputEnabled: true,
alwaysIncludeScreenshot: false,
screenshotForVoiceCommands: false, // Auto-capture screenshot when using voice commands (MCP/agent mode)

// Text input: On Windows, use Ctrl+Shift+T to avoid browser new tab conflict
textInputShortcut: isWindows ? "ctrl-shift-t" : "ctrl-t",
Expand Down
41 changes: 32 additions & 9 deletions apps/desktop/src/main/context-budget.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,28 @@ import { makeTextCompletionWithFetch } from "./llm-fetch"
import { constructMinimalSystemPrompt } from "./system-prompts"
import { agentSessionStateManager } from "./state"

// A chat message sent to LLM providers. `content` is either plain text or a
// multimodal parts array (e.g. [{ type: 'text', text }, { type: 'image_url', ... }])
// as used by OpenAI-compatible chat APIs.
// NOTE: the diff scrape duplicated the old string-only declaration; only the
// multimodal-aware version is kept here.
export type LLMMessage = { role: string; content: string | any[] }

// Helper function to get content length that handles both string and array
// content. For multimodal (array) content only the text parts are counted,
// so base64 image payloads do not inflate token estimates.
function getContentLength(content: string | any[] | undefined): number {
  if (!content) return 0
  if (typeof content === 'string') return content.length
  // For multimodal content, sum up text parts only. Guard `part &&` so a
  // malformed (null/undefined) part is skipped instead of throwing.
  return content
    .filter((part: any) => part && part.type === 'text')
    .reduce((sum: number, part: any) => sum + (part.text?.length || 0), 0)
}

// Helper function to get content as string for summarization.
// Multimodal (array) content is flattened to its text parts, one per line;
// image parts are dropped entirely so their payloads never reach the summarizer.
function getContentAsString(content: string | any[] | undefined): string {
  if (!content) return ''
  if (typeof content === 'string') return content
  // Collect only the text parts of a multimodal array.
  const textParts: string[] = []
  for (const part of content as any[]) {
    if (part.type === 'text') {
      textParts.push(part.text || '')
    }
  }
  return textParts.join('\n')
}

// Simple in-memory cache for provider/model context windows
const contextWindowCache = new Map<string, number>()
Expand Down Expand Up @@ -77,7 +98,7 @@ async function getMaxContextTokens(providerId: string, model: string): Promise<n

// Estimate the total token count of a message list.
// Rough estimate: 4 chars ≈ 1 token; multimodal messages contribute only
// their text parts (via getContentLength), so images don't skew the estimate.
// NOTE: the diff scrape left both the old string-only reduce and the new
// multimodal-aware one in this span; only the latter is kept.
function estimateTokensFromMessages(messages: LLMMessage[]): number {
  const totalChars = messages.reduce((sum, m) => sum + getContentLength(m.content), 0)
  return Math.ceil(totalChars / 4)
}

Expand Down Expand Up @@ -196,14 +217,15 @@ export async function shrinkMessagesForLLM(opts: ShrinkOptions): Promise<{ messa
const AGGRESSIVE_TRUNCATE_THRESHOLD = 5000
for (let i = 0; i < messages.length; i++) {
const msg = messages[i]
if (msg.role === "user" && msg.content && msg.content.length > AGGRESSIVE_TRUNCATE_THRESHOLD) {
if (msg.role === "user" && getContentLength(msg.content) > AGGRESSIVE_TRUNCATE_THRESHOLD) {
// Check if this looks like a tool result (contains JSON arrays/objects)
if (msg.content.includes('"url":') || msg.content.includes('"id":')) {
const contentStr = getContentAsString(msg.content)
if (contentStr.includes('"url":') || contentStr.includes('"id":')) {
// Truncate aggressively and add note
messages[i] = {
...msg,
content: msg.content.substring(0, AGGRESSIVE_TRUNCATE_THRESHOLD) +
'\n\n... (truncated ' + (msg.content.length - AGGRESSIVE_TRUNCATE_THRESHOLD) +
content: contentStr.substring(0, AGGRESSIVE_TRUNCATE_THRESHOLD) +
'\n\n... (truncated ' + (contentStr.length - AGGRESSIVE_TRUNCATE_THRESHOLD) +
' characters for context management. Key information preserved above.)'
}
applied.push("aggressive_truncate")
Expand All @@ -218,7 +240,7 @@ export async function shrinkMessagesForLLM(opts: ShrinkOptions): Promise<{ messa

// Tier 1: Summarize large messages (prefer tool outputs or very long entries)
const indicesByLength = messages
.map((m, i) => ({ i, len: m.content?.length || 0, role: m.role, content: m.content }))
.map((m, i) => ({ i, len: getContentLength(m.content), role: m.role, content: m.content }))
.filter((x) => x.len > summarizeThreshold && x.role !== "system")
.sort((a, b) => b.len - a.len)

Expand All @@ -234,15 +256,16 @@ export async function shrinkMessagesForLLM(opts: ShrinkOptions): Promise<{ messa
// Emit progress update before summarization
summarizedCount++
if (opts.onSummarizationProgress) {
const messagePreview = item.content!.substring(0, 100).replace(/\n/g, ' ')
const contentStr = getContentAsString(item.content)
const messagePreview = contentStr.substring(0, 100).replace(/\n/g, ' ')
opts.onSummarizationProgress(
summarizedCount,
totalToSummarize,
`Summarizing large message ${summarizedCount}/${totalToSummarize} (${item.len} chars): ${messagePreview}...`
)
}

const summarized = await summarizeContent(item.content!, opts.sessionId)
const summarized = await summarizeContent(getContentAsString(item.content), opts.sessionId)
messages[item.i] = { ...messages[item.i], content: summarized }
applied.push("summarize")
tokens = estimateTokensFromMessages(messages)
Expand Down
50 changes: 49 additions & 1 deletion apps/desktop/src/main/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { app, Menu } from "electron"
import { app, Menu, ipcMain, desktopCapturer } from "electron"
import { electronApp, optimizer } from "@electron-toolkit/utils"
import {
createMainWindow,
Expand Down Expand Up @@ -42,6 +42,54 @@ app.whenReady().then(() => {
registerIpcMain(router)
logApp("IPC main registered")

// Register desktopCapturer handler (available only in main process in Electron 31+)
ipcMain.handle('getScreenSources', async (_event, options: { types: ('screen' | 'window')[], thumbnailSize?: { width: number, height: number } }) => {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

getScreenSources forwards renderer-supplied options directly into desktopCapturer.getSources; consider validating/clamping types and thumbnailSize so a renderer bug (or injected script) can’t request unexpected capture modes or extreme sizes. Even if the UI currently uses only screen at 1920×1080, tightening this IPC surface helps prevent accidental DoS/privacy regressions.

Fix This in Augment

🤖 Was this useful? React with 👍 or 👎

try {
// Validate and sanitize options
const validatedOptions = {
// Only allow 'screen' type for privacy - filter out 'window'
types: (options.types || ['screen']).filter(t => t === 'screen') as ('screen' | 'window')[],
thumbnailSize: {
// Clamp dimensions to reasonable bounds
width: Math.min(Math.max(options.thumbnailSize?.width || 1920, 100), 4096),
height: Math.min(Math.max(options.thumbnailSize?.height || 1080, 100), 4096)
}
}

// Ensure at least 'screen' type is present
if (validatedOptions.types.length === 0) {
validatedOptions.types = ['screen']
}

logApp('[getScreenSources] Capturing screen sources with validated options:', JSON.stringify(validatedOptions))
const sources = await desktopCapturer.getSources(validatedOptions)
logApp(`[getScreenSources] Got ${sources.length} sources`)

// On macOS, if Screen Recording permission is not granted, desktopCapturer returns an empty array
// This is a silent failure - no error is thrown
if (sources.length === 0 && process.platform === 'darwin') {
throw new Error('No screen sources available. Please grant Screen Recording permission in System Settings > Privacy & Security > Screen Recording, then restart the app.')
}

// Serialize the sources - NativeImage thumbnail needs to be converted
const serialized = sources.map(source => {
const thumbnailDataUrl = source.thumbnail.toDataURL()
logApp(`[getScreenSources] Source: ${source.name}, thumbnail size: ${thumbnailDataUrl.length} chars`)
return {
id: source.id,
name: source.name,
thumbnail: thumbnailDataUrl,
display_id: source.display_id,
appIcon: source.appIcon ? source.appIcon.toDataURL() : null
}
})
return serialized
} catch (error) {
console.error('Failed to get screen sources:', error)
throw error
}
})

registerServeProtocol()

try {
Expand Down
84 changes: 70 additions & 14 deletions apps/desktop/src/main/llm-fetch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,38 @@ import { isDebugLLM, logLLM } from "./debug"
import { state, llmRequestAbortManager, agentSessionStateManager } from "./state"
import OpenAI from "openai"

/**
 * Helper function to get a string preview from content that may be a string
 * or a multimodal parts array. Array content is reduced to its text parts
 * (joined with spaces) and prefixed with "[image]" when an image part exists.
 */
function getContentPreview(content: string | any[] | undefined, maxLength: number = 100): string {
  if (!content) return "(empty)"
  // Truncate a string to maxLength, marking the cut with an ellipsis.
  const clip = (s: string): string =>
    s.length > maxLength ? s.substring(0, maxLength) + "..." : s
  if (typeof content === "string") return clip(content)
  // Multimodal array: preview only the text parts, flagging attached images.
  const joinedText = content
    .filter((part: any) => part.type === "text")
    .map((part: any) => part.text)
    .join(" ")
  const imagePresent = content.some((part: any) => part.type === "image_url")
  const clipped = clip(joinedText)
  return imagePresent ? `[image] ${clipped}` : clipped
}

/**
 * Helper function to get content length from content that may be a string
 * or multimodal array. Only text parts contribute to the length, so image
 * payloads never inflate size/token calculations.
 */
function getContentLength(content: string | any[] | undefined): number {
  if (!content) return 0
  if (typeof content === "string") return content.length
  // Multimodal array: accumulate the lengths of the text parts only.
  let total = 0
  for (const part of content as any[]) {
    if (part.type === "text") {
      total += part.text?.length || 0
    }
  }
  return total
}

/**
* Callback for reporting retry progress to the UI
*/
Expand Down Expand Up @@ -658,23 +690,22 @@ async function makeAPICallAttempt(
messagesCount: requestBody.messages.length,
responseFormat: requestBody.response_format,
estimatedTokens,
totalPromptLength: (requestBody.messages as Array<{ role: string; content: string }>).reduce(
(sum: number, msg: { role: string; content: string }) => sum + ((msg.content?.length) || 0),
totalPromptLength: (requestBody.messages as Array<{ role: string; content: string | any[] }>).reduce(
(sum: number, msg: { role: string; content: string | any[] }) => sum + getContentLength(msg.content),
0,
),
contextWarning: estimatedTokens > 8000 ? "WARNING: High token count, may exceed context limit" : null
})
logLLM("Request Body (truncated)", {
...requestBody,
messages: (requestBody.messages as Array<{ role: string; content: string }>).map(
(msg: { role: string; content: string }) => ({
messages: (requestBody.messages as Array<{ role: string; content: string | any[] }>).map(
(msg: { role: string; content: string | any[] }) => ({
role: msg.role,
content: msg.content.length > 200
? msg.content.substring(0, 200) + "... [" + msg.content.length + " chars]"
: msg.content,
content: getContentPreview(msg.content, 200),
}),
)
})

}

// Create abort controller and register it so emergency stop can cancel
Expand Down Expand Up @@ -841,7 +872,7 @@ async function makeAPICallAttempt(
* Make a fetch-based LLM call for OpenAI-compatible APIs with structured output fallback
*/
async function makeOpenAICompatibleCall(
messages: Array<{ role: string; content: string }>,
messages: Array<{ role: string; content: string | any[] }>,
providerId: string,
useStructuredOutput: boolean = true,
sessionId?: string,
Expand All @@ -861,7 +892,10 @@ async function makeOpenAICompatibleCall(
}

const model = getModel(providerId, "mcp")
const estimatedTokens = Math.ceil(messages.reduce((sum, msg) => sum + msg.content.length, 0) / 4)
const estimatedTokens = Math.ceil(messages.reduce((sum, msg) => {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With multimodal messages (array content), the request logging in makeAPICallAttempt that truncates message content via substring assumes string content and will throw in debug mode. Consider making the debug logging robust to array content to avoid runtime errors when isDebugLLM() is true.

🤖 React with 👍 or 👎 to let us know if the comment was useful, or 🚀 if it prevented an incident/outage.

const contentLength = typeof msg.content === 'string' ? msg.content.length : JSON.stringify(msg.content).length
return sum + contentLength
}, 0) / 4)

const baseRequestBody = {
model,
Expand Down Expand Up @@ -1016,7 +1050,7 @@ async function makeOpenAICompatibleCall(
* Make a fetch-based LLM call for Gemini API
*/
async function makeGeminiCall(
messages: Array<{ role: string; content: string }>,
messages: Array<{ role: string; content: string | any[] }>,
sessionId?: string,
onRetryProgress?: RetryProgressCallback,
): Promise<any> {
Expand All @@ -1030,8 +1064,30 @@ async function makeGeminiCall(
const baseURL =
config.geminiBaseUrl || "https://generativelanguage.googleapis.com"

// Helper to extract text from multimodal content without embedding full image
// data: text parts are kept as-is, image parts become the "[image attached]"
// marker, and anything else is dropped.
const extractTextFromContent = (content: string | any[]): string => {
  if (typeof content === 'string') {
    return content
  }
  if (Array.isArray(content)) {
    const pieces: string[] = []
    for (const part of content) {
      if (part.type === 'text' && part.text) {
        // Empty/missing text is skipped, matching the truthiness filter below.
        pieces.push(part.text)
      } else if (part.type === 'image_url') {
        pieces.push('[image attached]')
      }
    }
    return pieces.join(' ')
  }
  return String(content)
}

// Convert messages to Gemini format
const prompt = messages.map((m) => `${m.role}: ${m.content}`).join("\n\n")
const prompt = messages.map((m) => {
const content = extractTextFromContent(m.content)
return `${m.role}: ${content}`
}).join("\n\n")

return apiCallWithRetry(async () => {
if (isDebugLLM()) {
Expand Down Expand Up @@ -1144,7 +1200,7 @@ async function makeGeminiCall(
* This is wrapped by makeLLMCallWithFetch with retry logic
*/
async function makeLLMCallAttempt(
messages: Array<{ role: string; content: string }>,
messages: Array<{ role: string; content: string | any[] }>,
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makeLLMCallAttempt now allows content to be an array, but the debug log inside this function still does messages[...].content.substring(...) which will throw when isDebugLLM() is enabled and a screenshot is attached. The debug/logging path should tolerate non-string content values.

Fix This in Augment

🤖 Was this useful? React with 👍 or 👎

chatProviderId: string,
onRetryProgress?: RetryProgressCallback,
sessionId?: string,
Expand All @@ -1153,7 +1209,7 @@ async function makeLLMCallAttempt(
logLLM("🚀 Starting LLM call attempt", {
provider: chatProviderId,
messagesCount: messages.length,
lastMessagePreview: messages[messages.length - 1]?.content?.substring(0, 100) + "..."
lastMessagePreview: getContentPreview(messages[messages.length - 1]?.content, 100)
})
}

Expand Down Expand Up @@ -1323,7 +1379,7 @@ async function makeLLMCallAttempt(
* Main function to make LLM calls using fetch with automatic retry on empty responses
*/
export async function makeLLMCallWithFetch(
messages: Array<{ role: string; content: string }>,
messages: Array<{ role: string; content: string | any[] }>,
providerId?: string,
onRetryProgress?: RetryProgressCallback,
sessionId?: string,
Expand Down
Loading