feat: enhance MCP tool response handling to support images

shivangag · shivangag · commit a05620b7cdf5 · 2025-09-09T03:31:23.000+05:30
**Changes:**
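- Updated `processToolContent` to return both text and images from tool results; image items with a supported MIME type (PNG, JPEG, GIF, WebP) are converted to base64 data URLs, and invalid or unsupported ones are skipped with a warning.
- Updated `formatResponse.toolResult` to return only image blocks when the accompanying text is empty or whitespace-only.
- Modified `executeToolAndProcessResult` to pass the extracted images to both the `mcp_server_response` message and the formatted tool result.
- Adjusted `combineCommandSequences` to preserve images from MCP server responses and to omit the `response` field when every response text is empty.
- Updated UI components to display images alongside text responses.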

**Testing:**
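- Added tests to verify correct handling of tool results that contain both text and images.
- Added a test covering image-only tool results (no text content).
- Added a test verifying that `combineCommandSequences` preserves images from `mcp_server_response` messages.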

**Files Modified:**
- `src/core/prompts/responses.ts`
- `src/core/tools/useMcpToolTool.ts`
- `src/shared/combineCommandSequences.ts`
- `webview-ui/src/components/chat/McpExecution.tsx`
- `webview-ui/src/components/chat/ChatRow.tsx`
- `src/core/tools/__tests__/useMcpToolTool.spec.ts`
- `src/shared/__tests__/combineCommandSequences.spec.ts`
diff --git a/src/core/prompts/responses.ts b/src/core/prompts/responses.ts
@@ -87,10 +87,16 @@ Otherwise, if you have not completed the task and do not need additional informa
 		images?: string[],
 	): string | Array<Anthropic.TextBlockParam | Anthropic.ImageBlockParam> => {
 		if (images && images.length > 0) {
-			const textBlock: Anthropic.TextBlockParam = { type: "text", text }
 			const imageBlocks: Anthropic.ImageBlockParam[] = formatImagesIntoBlocks(images)
-			// Placing images after text leads to better results
-			return [textBlock, ...imageBlocks]
+
+			if (text.trim()) {
+				const textBlock: Anthropic.TextBlockParam = { type: "text", text }
+				// Placing images after text leads to better results
+				return [textBlock, ...imageBlocks]
+			} else {
+				// For image-only responses, return only image blocks
+				return imageBlocks
+			}
 		} else {
 			return text
 		}
diff --git a/src/core/tools/__tests__/useMcpToolTool.spec.ts b/src/core/tools/__tests__/useMcpToolTool.spec.ts
@@ -7,7 +7,12 @@ import { ToolUse } from "../../../shared/tools"
 // Mock dependencies
 vi.mock("../../prompts/responses", () => ({
 	formatResponse: {
-		toolResult: vi.fn((result: string) => `Tool result: ${result}`),
+		toolResult: vi.fn((result: string, images?: string[]) => {
+			if (images && images.length > 0) {
+				return `Tool result: ${result} (with ${images.length} images)`
+			}
+			return `Tool result: ${result}`
+		}),
 		toolError: vi.fn((error: string) => `Tool error: ${error}`),
 		invalidMcpToolArgumentError: vi.fn((server: string, tool: string) => `Invalid args for ${server}:${tool}`),
 		unknownMcpToolError: vi.fn((server: string, tool: string, availableTools: string[]) => {
@@ -223,10 +228,111 @@ describe("useMcpToolTool", () => {
 			expect(mockTask.consecutiveMistakeCount).toBe(0)
 			expect(mockAskApproval).toHaveBeenCalled()
 			expect(mockTask.say).toHaveBeenCalledWith("mcp_server_request_started")
-			expect(mockTask.say).toHaveBeenCalledWith("mcp_server_response", "Tool executed successfully")
+			expect(mockTask.say).toHaveBeenCalledWith("mcp_server_response", "Tool executed successfully", [])
 			expect(mockPushToolResult).toHaveBeenCalledWith("Tool result: Tool executed successfully")
 		})
 
+		it("should handle tool result with text and images", async () => {
+			const block: ToolUse = {
+				type: "tool_use",
+				name: "use_mcp_tool",
+				params: {
+					server_name: "test_server",
+					tool_name: "test_tool",
+					arguments: '{"param": "value"}',
+				},
+				partial: false,
+			}
+
+			mockAskApproval.mockResolvedValue(true)
+
+			const mockToolResult = {
+				content: [
+					{ type: "text", text: "Generated image:" },
+					{
+						type: "image",
+						data: "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU",
+						mimeType: "image/png",
+					},
+				],
+				isError: false,
+			}
+
+			mockProviderRef.deref.mockReturnValue({
+				getMcpHub: () => ({
+					callTool: vi.fn().mockResolvedValue(mockToolResult),
+				}),
+				postMessageToWebview: vi.fn(),
+			})
+
+			await useMcpToolTool(
+				mockTask as Task,
+				block,
+				mockAskApproval,
+				mockHandleError,
+				mockPushToolResult,
+				mockRemoveClosingTag,
+			)
+
+			expect(mockTask.consecutiveMistakeCount).toBe(0)
+			expect(mockAskApproval).toHaveBeenCalled()
+			expect(mockTask.say).toHaveBeenCalledWith("mcp_server_request_started")
+			expect(mockTask.say).toHaveBeenCalledWith("mcp_server_response", "Generated image:", [
+				"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU",
+			])
+			expect(mockPushToolResult).toHaveBeenCalledWith("Tool result: Generated image: (with 1 images)")
+		})
+
+		it("should handle tool result with only images (no text)", async () => {
+			const block: ToolUse = {
+				type: "tool_use",
+				name: "use_mcp_tool",
+				params: {
+					server_name: "test_server",
+					tool_name: "test_tool",
+					arguments: '{"param": "value"}',
+				},
+				partial: false,
+			}
+
+			mockAskApproval.mockResolvedValue(true)
+
+			const mockToolResult = {
+				content: [
+					{
+						type: "image",
+						data: "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU",
+						mimeType: "image/png",
+					},
+				],
+				isError: false,
+			}
+
+			mockProviderRef.deref.mockReturnValue({
+				getMcpHub: () => ({
+					callTool: vi.fn().mockResolvedValue(mockToolResult),
+				}),
+				postMessageToWebview: vi.fn(),
+			})
+
+			await useMcpToolTool(
+				mockTask as Task,
+				block,
+				mockAskApproval,
+				mockHandleError,
+				mockPushToolResult,
+				mockRemoveClosingTag,
+			)
+
+			expect(mockTask.consecutiveMistakeCount).toBe(0)
+			expect(mockAskApproval).toHaveBeenCalled()
+			expect(mockTask.say).toHaveBeenCalledWith("mcp_server_request_started")
+			expect(mockTask.say).toHaveBeenCalledWith("mcp_server_response", "", [
+				"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU",
+			])
+			expect(mockPushToolResult).toHaveBeenCalledWith("Tool result:  (with 1 images)")
+		})
+
 		it("should handle user rejection", async () => {
 			const block: ToolUse = {
 				type: "tool_use",
diff --git a/src/core/tools/useMcpToolTool.ts b/src/core/tools/useMcpToolTool.ts
@@ -195,24 +195,39 @@ async function sendExecutionStatus(cline: Task, status: McpExecutionStatus): Pro
 	})
 }
 
-function processToolContent(toolResult: any): string {
+function processToolContent(toolResult: any): { text: string; images: string[] } {
 	if (!toolResult?.content || toolResult.content.length === 0) {
-		return ""
+		return { text: "", images: [] }
 	}
 
-	return toolResult.content
-		.map((item: any) => {
-			if (item.type === "text") {
-				return item.text
+	const textParts: string[] = []
+	const images: string[] = []
+
+	toolResult.content.forEach((item: any) => {
+		if (item.type === "text") {
+			textParts.push(item.text)
+		} else if (item.type === "image") {
+			if (item.data && item.mimeType) {
+				const validImageTypes = ["image/png", "image/jpeg", "image/gif", "image/webp"]
+				if (validImageTypes.includes(item.mimeType)) {
+					const dataUrl = `data:${item.mimeType};base64,${item.data}`
+					images.push(dataUrl)
+				} else {
+					console.warn(`Unsupported image MIME type: ${item.mimeType}`)
+				}
+			} else {
+				console.warn("Invalid MCP ImageContent: missing data or mimeType")
 			}
-			if (item.type === "resource") {
-				const { blob: _, ...rest } = item.resource
-				return JSON.stringify(rest, null, 2)
-			}
-			return ""
-		})
-		.filter(Boolean)
-		.join("\n\n")
+		} else if (item.type === "resource") {
+			const { blob: _, ...rest } = item.resource
+			textParts.push(JSON.stringify(rest, null, 2))
+		}
+	})
+
+	return {
+		text: textParts.filter(Boolean).join("\n\n"),
+		images,
+	}
 }
 
 async function executeToolAndProcessResult(
@@ -236,11 +251,13 @@ async function executeToolAndProcessResult(
 	const toolResult = await cline.providerRef.deref()?.getMcpHub()?.callTool(serverName, toolName, parsedArguments)
 
 	let toolResultPretty = "(No response)"
+	let images: string[] = []
 
 	if (toolResult) {
-		const outputText = processToolContent(toolResult)
+		const { text: outputText, images: outputImages } = processToolContent(toolResult)
+		images = outputImages
 
-		if (outputText) {
+		if (outputText || images.length > 0) {
 			await sendExecutionStatus(cline, {
 				executionId,
 				status: "output",
@@ -266,8 +283,8 @@ async function executeToolAndProcessResult(
 		})
 	}
 
-	await cline.say("mcp_server_response", toolResultPretty)
-	pushToolResult(formatResponse.toolResult(toolResultPretty))
+	await cline.say("mcp_server_response", toolResultPretty, images)
+	pushToolResult(formatResponse.toolResult(toolResultPretty, images))
 }
 
 export async function useMcpToolTool(
diff --git a/src/shared/__tests__/combineCommandSequences.spec.ts b/src/shared/__tests__/combineCommandSequences.spec.ts
@@ -89,6 +89,48 @@ describe("combineCommandSequences", () => {
 			})
 		})
 
+		it("should preserve images from mcp_server_response messages", () => {
+			const messages: ClineMessage[] = [
+				{
+					type: "ask",
+					ask: "use_mcp_server",
+					text: JSON.stringify({
+						serverName: "test-server",
+						toolName: "test-tool",
+						arguments: { param: "value" },
+					}),
+					ts: 1625097600000,
+				},
+				{
+					type: "say",
+					say: "mcp_server_response",
+					text: "Generated 1 image",
+					images: [
+						"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU",
+					],
+					ts: 1625097601000,
+				},
+			]
+
+			const result = combineCommandSequences(messages)
+
+			expect(result).toHaveLength(1)
+			expect(result[0]).toEqual({
+				type: "ask",
+				ask: "use_mcp_server",
+				text: JSON.stringify({
+					serverName: "test-server",
+					toolName: "test-tool",
+					arguments: { param: "value" },
+					response: "Generated 1 image",
+				}),
+				images: [
+					"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU",
+				],
+				ts: 1625097600000,
+			})
+		})
+
 		it("should handle multiple MCP server requests", () => {
 			const messages: ClineMessage[] = [
 				{
diff --git a/src/shared/combineCommandSequences.ts b/src/shared/combineCommandSequences.ts
@@ -38,11 +38,16 @@ export function combineCommandSequences(messages: ClineMessage[]): ClineMessage[
 		if (msg.type === "ask" && msg.ask === "use_mcp_server") {
 			// Look ahead for MCP responses
 			let responses: string[] = []
+			let allImages: string[] = []
 			let j = i + 1
 
 			while (j < messages.length) {
 				if (messages[j].say === "mcp_server_response") {
 					responses.push(messages[j].text || "")
+					// Collect images from MCP server responses
+					if (messages[j].images && Array.isArray(messages[j].images) && messages[j].images!.length > 0) {
+						allImages.push(...messages[j].images!)
+					}
 					processedIndices.add(j)
 					j++
 				} else if (messages[j].type === "ask" && messages[j].ask === "use_mcp_server") {
@@ -57,13 +62,22 @@ export function combineCommandSequences(messages: ClineMessage[]): ClineMessage[
 				// Parse the JSON from the message text
 				const jsonObj = safeJsonParse<any>(msg.text || "{}", {})
 
-				// Add the response to the JSON object
-				jsonObj.response = responses.join("\n")
+				// Only add non-empty responses
+				const nonEmptyResponses = responses.filter((response) => response.trim())
+				if (nonEmptyResponses.length > 0) {
+					jsonObj.response = nonEmptyResponses.join("\n")
+				}
 
 				// Stringify the updated JSON object
 				const combinedText = JSON.stringify(jsonObj)
 
-				combinedMessages.set(msg.ts, { ...msg, text: combinedText })
+				// Preserve images in the combined message
+				const combinedMessage = { ...msg, text: combinedText }
+				if (allImages.length > 0) {
+					combinedMessage.images = allImages
+				}
+
+				combinedMessages.set(msg.ts, combinedMessage)
 			} else {
 				// If there's no response, just keep the original message
 				combinedMessages.set(msg.ts, { ...msg })
diff --git a/webview-ui/src/components/chat/ChatRow.tsx b/webview-ui/src/components/chat/ChatRow.tsx
@@ -1491,6 +1491,7 @@ export const ChatRowContent = ({
 										server={server}
 										useMcpServer={useMcpServer}
 										alwaysAllowMcp={alwaysAllowMcp}
+										images={message.images}
 									/>
 								)}
 							</div>
diff --git a/webview-ui/src/components/chat/McpExecution.tsx b/webview-ui/src/components/chat/McpExecution.tsx