
Commit 61fc391

fix: preserve user images in native tool call results (#9401)
1 parent 18c4d1a commit 61fc391
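
The change in a nutshell: with the native tool protocol, a tool_result block's content is sent as a plain string, so user-supplied images used to be flattened into the placeholder "(image content)" and lost. The fix splits the result into text (joined into the tool_result string) and images (re-appended as standalone blocks). Below is a minimal sketch of that logic, assuming the Anthropic SDK block-param types used in the diff; toNativeToolResult is a hypothetical helper for illustration, not repo code.

import type { Anthropic } from "@anthropic-ai/sdk"

// Hypothetical helper (illustration only): shape a mixed tool result for the
// native protocol — a string-only tool_result, then images as separate blocks.
function toNativeToolResult(
	toolUseId: string,
	content: string | Array<Anthropic.TextBlockParam | Anthropic.ImageBlockParam>,
): (Anthropic.ToolResultBlockParam | Anthropic.ImageBlockParam)[] {
	if (typeof content === "string") {
		return [{ type: "tool_result", tool_use_id: toolUseId, content: content || "(tool did not return anything)" }]
	}
	const textBlocks = content.filter((item): item is Anthropic.TextBlockParam => item.type === "text")
	const imageBlocks = content.filter((item): item is Anthropic.ImageBlockParam => item.type === "image")
	return [
		{
			type: "tool_result",
			tool_use_id: toolUseId,
			// Native protocol: the tool_result payload stays a plain string
			content: textBlocks.map((item) => item.text).join("\n") || "(tool did not return anything)",
		},
		// The fix: user images survive as their own blocks after the tool_result
		...imageBlocks,
	]
}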

2 files changed: +223 −13 lines
src/core/assistant-message/__tests__/presentAssistantMessage-images.spec.ts

Lines changed: 205 additions & 0 deletions

@@ -0,0 +1,205 @@
// npx vitest src/core/assistant-message/__tests__/presentAssistantMessage-images.spec.ts

import { describe, it, expect, beforeEach, vi } from "vitest"
import { Anthropic } from "@anthropic-ai/sdk"
import { presentAssistantMessage } from "../presentAssistantMessage"
import { Task } from "../../task/Task"
import { TOOL_PROTOCOL } from "@roo-code/types"

// Mock dependencies
vi.mock("../../task/Task")
vi.mock("../../tools/validateToolUse", () => ({
	validateToolUse: vi.fn(),
}))
vi.mock("@roo-code/telemetry", () => ({
	TelemetryService: {
		instance: {
			captureToolUsage: vi.fn(),
			captureConsecutiveMistakeError: vi.fn(),
		},
	},
}))

describe("presentAssistantMessage - Image Handling in Native Tool Calls", () => {
	let mockTask: any

	beforeEach(() => {
		// Create a mock Task with minimal properties needed for testing
		mockTask = {
			taskId: "test-task-id",
			instanceId: "test-instance",
			abort: false,
			presentAssistantMessageLocked: false,
			presentAssistantMessageHasPendingUpdates: false,
			currentStreamingContentIndex: 0,
			assistantMessageContent: [],
			userMessageContent: [],
			didCompleteReadingStream: false,
			didRejectTool: false,
			didAlreadyUseTool: false,
			diffEnabled: false,
			consecutiveMistakeCount: 0,
			api: {
				getModel: () => ({ id: "test-model", info: {} }),
			},
			browserSession: {
				closeBrowser: vi.fn().mockResolvedValue(undefined),
			},
			recordToolUsage: vi.fn(),
			toolRepetitionDetector: {
				check: vi.fn().mockReturnValue({ allowExecution: true }),
			},
			providerRef: {
				deref: () => ({
					getState: vi.fn().mockResolvedValue({
						mode: "code",
						customModes: [],
					}),
				}),
			},
			say: vi.fn().mockResolvedValue(undefined),
			ask: vi.fn().mockResolvedValue({ response: "yesButtonClicked" }),
		}
	})

	it("should preserve images in tool_result for native protocol", async () => {
		// Set up a tool_use block with an ID (indicates native protocol)
		const toolCallId = "tool_call_123"
		mockTask.assistantMessageContent = [
			{
				type: "tool_use",
				id: toolCallId, // ID indicates native protocol
				name: "ask_followup_question",
				params: { question: "What do you see?" },
			},
		]

		// Create a mock askApproval that includes images in the response
		const imageBlock: Anthropic.ImageBlockParam = {
			type: "image",
			source: {
				type: "base64",
				media_type: "image/png",
				data: "base64ImageData",
			},
		}

		mockTask.ask = vi.fn().mockResolvedValue({
			response: "yesButtonClicked",
			text: "I see a cat",
			images: ["data:image/png;base64,base64ImageData"],
		})

		// Execute presentAssistantMessage
		await presentAssistantMessage(mockTask)

		// Verify that userMessageContent was populated
		expect(mockTask.userMessageContent.length).toBeGreaterThan(0)

		// Find the tool_result block
		const toolResult = mockTask.userMessageContent.find(
			(item: any) => item.type === "tool_result" && item.tool_use_id === toolCallId,
		)

		expect(toolResult).toBeDefined()
		expect(toolResult.tool_use_id).toBe(toolCallId)

		// For native protocol, tool_result content should be a string (text only)
		expect(typeof toolResult.content).toBe("string")
		expect(toolResult.content).toContain("I see a cat")

		// Images should be added as separate blocks AFTER the tool_result
		const imageBlocks = mockTask.userMessageContent.filter((item: any) => item.type === "image")
		expect(imageBlocks.length).toBeGreaterThan(0)
		expect(imageBlocks[0].source.data).toBe("base64ImageData")
	})

	it("should convert to string when no images are present (native protocol)", async () => {
		// Set up a tool_use block with an ID (indicates native protocol)
		const toolCallId = "tool_call_456"
		mockTask.assistantMessageContent = [
			{
				type: "tool_use",
				id: toolCallId,
				name: "ask_followup_question",
				params: { question: "What is your name?" },
			},
		]

		// Response with text but NO images
		mockTask.ask = vi.fn().mockResolvedValue({
			response: "yesButtonClicked",
			text: "My name is Alice",
			images: undefined,
		})

		await presentAssistantMessage(mockTask)

		const toolResult = mockTask.userMessageContent.find(
			(item: any) => item.type === "tool_result" && item.tool_use_id === toolCallId,
		)

		expect(toolResult).toBeDefined()

		// When no images, content should be a string
		expect(typeof toolResult.content).toBe("string")
	})

	it("should preserve images in content array for XML protocol (existing behavior)", async () => {
		// Set up a tool_use block WITHOUT an ID (indicates XML protocol)
		mockTask.assistantMessageContent = [
			{
				type: "tool_use",
				// No ID = XML protocol
				name: "ask_followup_question",
				params: { question: "What do you see?" },
			},
		]

		mockTask.ask = vi.fn().mockResolvedValue({
			response: "yesButtonClicked",
			text: "I see a dog",
			images: ["data:image/png;base64,dogImageData"],
		})

		await presentAssistantMessage(mockTask)

		// For XML protocol, content is added as separate blocks
		// Check that both text and image blocks were added
		const hasTextBlock = mockTask.userMessageContent.some((item: any) => item.type === "text")
		const hasImageBlock = mockTask.userMessageContent.some((item: any) => item.type === "image")

		expect(hasTextBlock).toBe(true)
		// XML protocol preserves images as separate blocks in userMessageContent
		expect(hasImageBlock).toBe(true)
	})

	it("should handle empty tool result gracefully", async () => {
		const toolCallId = "tool_call_789"
		mockTask.assistantMessageContent = [
			{
				type: "tool_use",
				id: toolCallId,
				name: "attempt_completion",
				params: { result: "Task completed" },
			},
		]

		// Empty response
		mockTask.ask = vi.fn().mockResolvedValue({
			response: "yesButtonClicked",
			text: undefined,
			images: undefined,
		})

		await presentAssistantMessage(mockTask)

		const toolResult = mockTask.userMessageContent.find(
			(item: any) => item.type === "tool_result" && item.tool_use_id === toolCallId,
		)

		expect(toolResult).toBeDefined()
		// Should have fallback text
		expect(toolResult.content).toBeTruthy()
	})
})

src/core/assistant-message/presentAssistantMessage.ts

Lines changed: 18 additions & 13 deletions
@@ -296,31 +296,36 @@ export async function presentAssistantMessage(cline: Task) {
 					return
 				}

-				// For native protocol, add as tool_result block
+				// For native protocol, tool_result content must be a string
+				// Images are added as separate blocks in the user message
 				let resultContent: string
+				let imageBlocks: Anthropic.ImageBlockParam[] = []
+
 				if (typeof content === "string") {
 					resultContent = content || "(tool did not return anything)"
 				} else {
-					// Convert array of content blocks to string for tool result
-					// Tool results in OpenAI format only support strings
-					resultContent = content
-						.map((item) => {
-							if (item.type === "text") {
-								return item.text
-							} else if (item.type === "image") {
-								return "(image content)"
-							}
-							return ""
-						})
-						.join("\n")
+					// Separate text and image blocks
+					const textBlocks = content.filter((item) => item.type === "text")
+					imageBlocks = content.filter((item) => item.type === "image") as Anthropic.ImageBlockParam[]
+
+					// Convert text blocks to string for tool_result
+					resultContent =
+						textBlocks.map((item) => (item as Anthropic.TextBlockParam).text).join("\n") ||
+						"(tool did not return anything)"
 				}

+				// Add tool_result with text content only
 				cline.userMessageContent.push({
 					type: "tool_result",
 					tool_use_id: toolCallId,
 					content: resultContent,
 				} as Anthropic.ToolResultBlockParam)

+				// Add image blocks separately after tool_result
+				if (imageBlocks.length > 0) {
+					cline.userMessageContent.push(...imageBlocks)
+				}
+
 				hasToolResult = true
 			} else {
 				// For XML protocol, add as text blocks (legacy behavior)
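
For reference, given the values in the first test above, the fixed code leaves userMessageContent in roughly this shape (an illustrative literal, not captured output):

const userMessageContent: (Anthropic.ToolResultBlockParam | Anthropic.ImageBlockParam)[] = [
	{
		type: "tool_result",
		tool_use_id: "tool_call_123",
		content: "I see a cat", // string only, as the native protocol requires
	},
	{
		// previously flattened to "(image content)"; now preserved as its own block
		type: "image",
		source: { type: "base64", media_type: "image/png", data: "base64ImageData" },
	},
]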
