Skip to content

Commit 2f67908

Browse files
committed
add image upload tool to AI Chat
1 parent 5370ae1 commit 2f67908

File tree

7 files changed

+284
-9
lines changed

7 files changed

+284
-9
lines changed

apps/web/client/src/app/project/[id]/_components/right-panel/chat-tab/chat-messages/message-content/tool-call-display.tsx

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ import {
88
TERMINAL_COMMAND_TOOL_NAME, TERMINAL_COMMAND_TOOL_PARAMETERS, TODO_WRITE_TOOL_NAME,
99
type TODO_WRITE_TOOL_PARAMETERS,
1010
TYPECHECK_TOOL_NAME,
11+
UPLOAD_IMAGE_TOOL_NAME,
12+
type UPLOAD_IMAGE_TOOL_PARAMETERS,
1113
WEB_SEARCH_TOOL_NAME,
1214
type WEB_SEARCH_TOOL_PARAMETERS,
1315
WRITE_FILE_TOOL_NAME,
@@ -89,6 +91,39 @@ export const ToolCallDisplay = ({
8991
}
9092
}
9193

94+
if (toolName === UPLOAD_IMAGE_TOOL_NAME) {
95+
const args = toolPart.input as z.infer<typeof UPLOAD_IMAGE_TOOL_PARAMETERS> | null;
96+
const result = toolPart.output as string | null;
97+
return (
98+
<div className="flex flex-col gap-2 p-3 border rounded-lg bg-background-secondary">
99+
<div className="flex items-center gap-2">
100+
<Icons.Image className="w-4 h-4" />
101+
<span className="text-sm font-medium">Upload Image</span>
102+
</div>
103+
{args?.image_reference && (
104+
<div className="text-xs text-foreground-secondary">
105+
Image: {args.image_reference}
106+
</div>
107+
)}
108+
{args?.destination_path && (
109+
<div className="text-xs text-foreground-secondary">
110+
Destination: {args.destination_path}
111+
</div>
112+
)}
113+
{args?.filename && (
114+
<div className="text-xs text-foreground-secondary">
115+
Filename: {args.filename}
116+
</div>
117+
)}
118+
{result && (
119+
<div className="text-xs text-green-600 mt-1">
120+
{result}
121+
</div>
122+
)}
123+
</div>
124+
);
125+
}
126+
92127
if (toolName === WRITE_FILE_TOOL_NAME) {
93128
const args = toolPart.input as z.infer<typeof WRITE_FILE_TOOL_PARAMETERS> | null;
94129
const filePath = args?.file_path;

apps/web/client/src/app/project/[id]/_components/right-panel/chat-tab/chat-messages/message-content/tool-call-simple.tsx

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ import {
2828
TODO_WRITE_TOOL_NAME,
2929
type TODO_WRITE_TOOL_PARAMETERS,
3030
TYPECHECK_TOOL_NAME,
31+
UPLOAD_IMAGE_TOOL_NAME,
32+
type UPLOAD_IMAGE_TOOL_PARAMETERS,
3133
WEB_SEARCH_TOOL_NAME,
3234
type WEB_SEARCH_TOOL_PARAMETERS,
3335
WRITE_FILE_TOOL_NAME,
@@ -60,6 +62,7 @@ const TOOL_ICONS: Record<string, any> = {
6062
[TYPECHECK_TOOL_NAME]: Icons.MagnifyingGlass,
6163
[LIST_BRANCHES_TOOL_NAME]: Icons.Branch,
6264
[GLOB_TOOL_NAME]: Icons.MagnifyingGlass,
65+
[UPLOAD_IMAGE_TOOL_NAME]: Icons.Image,
6366
} as const;
6467

6568
export function ToolCallSimple({
@@ -217,6 +220,14 @@ const getLabel = (toolName: string, toolPart: ToolUIPart) => {
217220
return 'Reading Onlook instructions';
218221
case TYPECHECK_TOOL_NAME:
219222
return 'Checking types';
223+
case UPLOAD_IMAGE_TOOL_NAME: {
224+
const params = toolPart.input as z.infer<typeof UPLOAD_IMAGE_TOOL_PARAMETERS>;
225+
if (params?.image_reference) {
226+
return 'Uploading image ' + truncateString(params.image_reference);
227+
} else {
228+
return 'Uploading image';
229+
}
230+
}
220231
default:
221232
return toolName?.replace(/[-_]/g, ' ').replace(/\b\w/g, c => c.toUpperCase());
222233
}

apps/web/client/src/components/tools/handlers/edit.ts

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,11 @@ import {
44
FUZZY_EDIT_FILE_TOOL_PARAMETERS,
55
SEARCH_REPLACE_EDIT_FILE_TOOL_PARAMETERS,
66
SEARCH_REPLACE_MULTI_EDIT_FILE_TOOL_PARAMETERS,
7+
UPLOAD_IMAGE_TOOL_PARAMETERS,
8+
VIEW_IMAGE_TOOL_PARAMETERS,
79
WRITE_FILE_TOOL_PARAMETERS
810
} from '@onlook/ai';
11+
import { MessageContextType } from '@onlook/models';
912
import { z } from 'zod';
1013

1114
export async function handleSearchReplaceEditFileTool(args: z.infer<typeof SEARCH_REPLACE_EDIT_FILE_TOOL_PARAMETERS>, editorEngine: EditorEngine): Promise<string> {
@@ -167,3 +170,168 @@ export async function handleFuzzyEditFileTool(
167170
}
168171
return 'File edited!';
169172
}
173+
174+
/**
 * Resolves an image attached to the chat context and returns its data so it can be analyzed.
 *
 * `args.image_reference` is matched case-insensitively (surrounding whitespace ignored) in
 * this order, so that a purely numeric reference is never hijacked by a display name that
 * merely contains that digit:
 *   1. exact display-name match;
 *   2. 1-based index into the attached images, when the reference is purely numeric;
 *   3. substring match in either direction between the reference and a display name.
 *
 * @param args - Validated `view_image` tool input.
 * @param editorEngine - Engine whose chat context holds the attached images.
 * @returns The matched image's MIME type and unmodified `content`, plus a status message.
 * @throws Error prefixed with "Cannot view image:" when no attached image matches.
 */
export async function handleViewImageTool(args: z.infer<typeof VIEW_IMAGE_TOOL_PARAMETERS>, editorEngine: EditorEngine): Promise<{ image: { mimeType: string; data: string }; message: string }> {
    try {
        // Collect the attached images once; numeric references index into this list.
        const images = editorEngine.chat.context.context.filter(
            (ctx) => ctx.type === MessageContextType.IMAGE,
        );
        const ref = args.image_reference.trim().toLowerCase();

        // 1. Exact display-name match (skipped for an empty reference).
        let match = ref
            ? images.find((ctx) => ctx.displayName.toLowerCase() === ref)
            : undefined;

        // 2. 1-based index, e.g. "2" means the second attached image.
        if (!match && /^\d+$/.test(ref)) {
            match = images[parseInt(ref, 10) - 1];
        }

        // 3. Fuzzy substring match. Empty strings are excluded because `includes('')`
        //    is always true and would match every image.
        if (!match && ref) {
            match = images.find((ctx) => {
                const name = ctx.displayName.toLowerCase();
                return name.length > 0 && (name.includes(ref) || ref.includes(name));
            });
        }

        if (!match || match.type !== MessageContextType.IMAGE) {
            throw new Error(`No image found matching reference: ${args.image_reference}`);
        }

        return {
            image: {
                mimeType: match.mimeType,
                data: match.content,
            },
            message: `Retrieved image "${match.displayName}" for analysis.`,
        };
    } catch (error) {
        throw new Error(`Cannot view image: ${error}`);
    }
}
223+
224+
/**
 * Writes an image attached to the chat context into the sandbox of the given branch.
 *
 * `args.image_reference` is matched case-insensitively (surrounding whitespace ignored):
 * exact display name first, then a 1-based index for purely numeric references, then a
 * substring match in either direction. When nothing matches, the most recently attached
 * image is used and a warning is logged.
 *
 * The file is written to `<destination_path>/<filename>.<ext>`, where the destination
 * defaults to `public/assets/images`, the filename defaults to a generated UUID, and the
 * extension is derived from the image's MIME type. After writing, the editor's image scan
 * is re-run.
 *
 * @param args - Validated `upload_image` tool input.
 * @param editorEngine - Engine providing the branch sandbox, chat context and image scanner.
 * @returns A success message containing the image's display name and the written path.
 * @throws Error prefixed with "Cannot upload image:" when the sandbox is missing, no image
 *         is attached, or decoding/writing fails.
 */
export async function handleUploadImageTool(args: z.infer<typeof UPLOAD_IMAGE_TOOL_PARAMETERS>, editorEngine: EditorEngine): Promise<string> {
    try {
        const sandbox = editorEngine.branches.getSandboxById(args.branchId);
        if (!sandbox) {
            throw new Error(`Sandbox not found for branch ID: ${args.branchId}`);
        }

        // Collect the attached images once; numeric references index into this list.
        const images = editorEngine.chat.context.context.filter(
            (ctx) => ctx.type === MessageContextType.IMAGE,
        );
        if (images.length === 0) {
            throw new Error(`No image found matching reference: ${args.image_reference}`);
        }

        const ref = args.image_reference.trim().toLowerCase();

        // 1. Exact display-name match (skipped for an empty reference).
        let image = ref
            ? images.find((ctx) => ctx.displayName.toLowerCase() === ref)
            : undefined;

        // 2. 1-based index, as advertised by the tool's parameter description.
        if (!image && /^\d+$/.test(ref)) {
            image = images[parseInt(ref, 10) - 1];
        }

        // 3. Fuzzy substring match. Empty strings are excluded because `includes('')`
        //    is always true and would match every image.
        if (!image && ref) {
            image = images.find((ctx) => {
                const name = ctx.displayName.toLowerCase();
                return name.length > 0 && (name.includes(ref) || ref.includes(name));
            });
        }

        // 4. Best-effort fallback: the most recently attached image.
        if (!image) {
            image = images[images.length - 1];
            if (image) {
                console.warn(`No exact match for "${args.image_reference}", using most recent image: ${image.displayName}`);
            }
        }

        if (!image || image.type !== MessageContextType.IMAGE) {
            throw new Error(`No image found matching reference: ${args.image_reference}`);
        }

        const extension = getExtensionFromMimeType(image.mimeType);
        const filename = `${args.filename || generateUUID()}.${extension}`;

        // `||` is intentional: an empty destination falls back to the default directory.
        // Trailing slashes are dropped so the joined path never contains "//".
        const destinationPath = (args.destination_path || 'public/assets/images').replace(/\/+$/, '');
        const fullPath = `${destinationPath}/${filename}`;

        // Strip a data-URL header if present. The media type may contain characters such
        // as "+", "-" or "." (e.g. image/svg+xml), so do not restrict it to [a-z].
        const base64Data = image.content.replace(/^data:[^,]*;base64,/i, '');
        const binaryData = base64ToUint8Array(base64Data);

        await sandbox.writeBinaryFile(fullPath, binaryData);

        // Refresh image scanning to update the UI.
        await editorEngine.image.scanImages();

        return `Image "${image.displayName}" uploaded successfully to ${fullPath}`;
    } catch (error) {
        throw new Error(`Cannot upload image: ${error}`);
    }
}
307+
308+
/** Known image MIME types mapped to the file extension used when saving them. */
const MIME_TO_EXTENSION: ReadonlyMap<string, string> = new Map([
    ['image/jpeg', 'jpg'],
    ['image/jpg', 'jpg'],
    ['image/png', 'png'],
    ['image/gif', 'gif'],
    ['image/webp', 'webp'],
    ['image/svg+xml', 'svg'],
    ['image/bmp', 'bmp'],
    ['image/tiff', 'tiff'],
]);

/**
 * Maps an image MIME type to a file extension (without the leading dot).
 *
 * The lookup is case-insensitive and ignores MIME parameters and surrounding whitespace,
 * so `"IMAGE/SVG+XML; charset=utf-8"` resolves to `"svg"`. A `Map` is used instead of a
 * plain object so that inputs like `"constructor"` cannot resolve to inherited
 * `Object.prototype` members.
 *
 * @param mimeType - MIME type string, e.g. `"image/png"`.
 * @returns The matching extension, or `"png"` when the type is not recognised.
 */
function getExtensionFromMimeType(mimeType: string): string {
    // Keep only the "type/subtype" part; anything after ";" is a parameter.
    const essence = (mimeType.split(';', 1)[0] ?? '').trim().toLowerCase();
    return MIME_TO_EXTENSION.get(essence) ?? 'png';
}
321+
322+
/**
 * Decodes a base64 string into raw bytes.
 *
 * Both the standard alphabet (`+`, `/`) and the URL-safe alphabet (`-`, `_`) are accepted;
 * URL-safe characters are mapped to their standard equivalents before decoding.
 *
 * @param base64 - Base64 payload without any `data:` URL header.
 * @returns The decoded bytes.
 * @throws Whatever `atob` throws when the input is not valid base64.
 */
function base64ToUint8Array(base64: string): Uint8Array {
    // `atob` only understands the standard alphabet, so normalise base64url first.
    const normalized = base64.replace(/-/g, '+').replace(/_/g, '/');
    // `atob` yields one character per byte (code units 0-255).
    const binaryString = atob(normalized);
    return Uint8Array.from(binaryString, (ch) => ch.charCodeAt(0));
}
330+
331+
/**
 * Generates a version-4 UUID string (`xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx`, lowercase hex).
 *
 * Prefers the Web Crypto API: `crypto.randomUUID()` when present, otherwise 16 bytes from
 * `crypto.getRandomValues()` with the version and variant bits set by hand. Only when no
 * Web Crypto implementation is available does it fall back to `Math.random()`, which is
 * not cryptographically strong.
 *
 * @returns A freshly generated UUID.
 */
function generateUUID(): string {
    const webCrypto = typeof globalThis.crypto === 'undefined' ? undefined : globalThis.crypto;

    if (typeof webCrypto?.randomUUID === 'function') {
        return webCrypto.randomUUID();
    }

    if (typeof webCrypto?.getRandomValues === 'function') {
        const bytes = webCrypto.getRandomValues(new Uint8Array(16));
        const hex = Array.from(bytes, (byte, index) => {
            let value = byte;
            if (index === 6) {
                value = (byte & 0x0f) | 0x40; // version 4
            } else if (index === 8) {
                value = (byte & 0x3f) | 0x80; // RFC 4122 variant
            }
            return value.toString(16).padStart(2, '0');
        });
        return [
            hex.slice(0, 4).join(''),
            hex.slice(4, 6).join(''),
            hex.slice(6, 8).join(''),
            hex.slice(8, 10).join(''),
            hex.slice(10).join(''),
        ].join('-');
    }

    // Last resort for environments without Web Crypto.
    return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, (c) => {
        const r = (Math.random() * 16) | 0;
        const v = c === 'x' ? r : (r & 0x3) | 0x8;
        return v.toString(16);
    });
}

apps/web/client/src/components/tools/tools.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ import {
3232
TERMINAL_COMMAND_TOOL_PARAMETERS,
3333
TYPECHECK_TOOL_NAME,
3434
TYPECHECK_TOOL_PARAMETERS,
35+
UPLOAD_IMAGE_TOOL_NAME,
36+
UPLOAD_IMAGE_TOOL_PARAMETERS,
37+
VIEW_IMAGE_TOOL_NAME,
38+
VIEW_IMAGE_TOOL_PARAMETERS,
3539
WEB_SEARCH_TOOL_NAME,
3640
WEB_SEARCH_TOOL_PARAMETERS,
3741
WRITE_FILE_TOOL_NAME,
@@ -56,6 +60,8 @@ import {
5660
handleSearchReplaceMultiEditFileTool,
5761
handleTerminalCommandTool,
5862
handleTypecheckTool,
63+
handleUploadImageTool,
64+
handleViewImageTool,
5965
handleWebSearchTool,
6066
handleWriteFileTool
6167
} from './handlers';
@@ -181,6 +187,18 @@ const TOOL_HANDLERS: ClientToolMap = {
181187
handler: async (args: z.infer<typeof CHECK_ERRORS_TOOL_PARAMETERS>, editorEngine: EditorEngine) =>
182188
handleCheckErrors(args, editorEngine),
183189
},
190+
[VIEW_IMAGE_TOOL_NAME]: {
191+
name: VIEW_IMAGE_TOOL_NAME,
192+
inputSchema: VIEW_IMAGE_TOOL_PARAMETERS,
193+
handler: async (args: z.infer<typeof VIEW_IMAGE_TOOL_PARAMETERS>, editorEngine: EditorEngine) =>
194+
handleViewImageTool(args, editorEngine),
195+
},
196+
[UPLOAD_IMAGE_TOOL_NAME]: {
197+
name: UPLOAD_IMAGE_TOOL_NAME,
198+
inputSchema: UPLOAD_IMAGE_TOOL_PARAMETERS,
199+
handler: async (args: z.infer<typeof UPLOAD_IMAGE_TOOL_PARAMETERS>, editorEngine: EditorEngine) =>
200+
handleUploadImageTool(args, editorEngine),
201+
},
184202
};
185203

186204
export async function handleToolCall(toolCall: ToolCall<string, unknown>, editorEngine: EditorEngine, addToolResult: (toolResult: { tool: string, toolCallId: string, output: any }) => Promise<void>) {

packages/ai/src/prompt/provider.ts

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import type {
66
MessageContext,
77
ProjectMessageContext,
88
} from '@onlook/models';
9-
import type { FileUIPart } from 'ai';
109
import { ASK_MODE_SYSTEM_PROMPT } from './ask';
1110
import { CONTEXT_PROMPTS } from './context';
1211
import { CREATE_NEW_PAGE_SYSTEM_PROMPT } from './create';
@@ -111,17 +110,18 @@ export function getHydratedUserMessage(
111110
.join('\n');
112111
prompt += wrapXml('instruction', textContent);
113112

114-
userParts.push({ type: 'text', text: prompt });
115-
113+
// Add image references to prompt (but don't send image data yet)
114+
// AI will decide whether to view or upload them using tools
116115
if (images.length > 0) {
117-
const attachments: FileUIPart[] = images.map((i) => ({
118-
type: 'file',
119-
mediaType: i.mimeType,
120-
url: i.content,
121-
}));
122-
userParts = userParts.concat(attachments);
116+
const imageList = images
117+
.map((img, idx) => `${idx + 1}. "${img.displayName}" (${img.mimeType})`)
118+
.join('\n');
119+
const imagesPrompt = `The user has attached ${images.length} image(s) to this message:\n${imageList}\n\nYou can:\n- Use the "view_image" tool to analyze the image content\n- Use the "upload_image" tool to save it to the project\n\nDetermine the appropriate action based on the user's request.`;
120+
prompt += wrapXml('available-images', imagesPrompt);
123121
}
124122

123+
userParts.push({ type: 'text', text: prompt });
124+
125125
return {
126126
id,
127127
role: 'user',

packages/ai/src/tools/tools/edit.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,40 @@ export const fuzzyEditFileTool = tool({
7373
'Edit the contents of a file with fuzzy matching instead of search and replace. This should be used as a fallback when the search and replace tool fails. It calls another agent to do the actual editing.',
7474
inputSchema: FUZZY_EDIT_FILE_TOOL_PARAMETERS,
7575
});
76+
77+
/** Identifier of the tool that lets the model look at an image attached to the chat. */
export const VIEW_IMAGE_TOOL_NAME = 'view_image';

// The single input field: a free-form reference to one of the attached images.
const viewImageReference = z
    .string()
    .describe('Reference to an image in the chat context (use the display name or index number)');

/** Input schema for the `view_image` tool. */
export const VIEW_IMAGE_TOOL_PARAMETERS = z.object({
    image_reference: viewImageReference,
});

/** Tool definition for `view_image`: description shown to the model plus its input schema. */
export const viewImageTool = tool({
    inputSchema: VIEW_IMAGE_TOOL_PARAMETERS,
    description:
        "Retrieves and views an image from the chat context for analysis. Use this tool when the user asks you to analyze, describe, or work with an image they've attached. The image data will be returned so you can see and analyze its contents. This does NOT save the image to the project.",
});
90+
91+
/** Identifier of the tool that saves an image attached to the chat into the project. */
export const UPLOAD_IMAGE_TOOL_NAME = 'upload_image';

// Field schemas, declared separately so each description sits next to its field.
const uploadImageReference = z
    .string()
    .describe('Reference to an image in the chat context (use the display name or index number)');
const uploadDestinationPath = z
    .string()
    .optional()
    .describe('Destination path within the project (default: "public/assets/images")');
const uploadFilename = z
    .string()
    .optional()
    .describe('Custom filename (without extension). If not provided, a UUID will be generated');

/** Input schema for the `upload_image` tool. */
export const UPLOAD_IMAGE_TOOL_PARAMETERS = z.object({
    image_reference: uploadImageReference,
    destination_path: uploadDestinationPath,
    filename: uploadFilename,
    branchId: BRANCH_ID_SCHEMA,
});

/** Tool definition for `upload_image`: description shown to the model plus its input schema. */
export const uploadImageTool = tool({
    inputSchema: UPLOAD_IMAGE_TOOL_PARAMETERS,
    description:
        "Uploads an image from the chat context to the project's file system. Use this tool when the user asks you to save, add, or upload an image to their project. The image will be stored in the project's public directory and can be referenced in code. After uploading, you can use the file path in your code changes.",
});

packages/ai/src/tools/toolset.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ import {
3333
terminalCommandTool,
3434
TYPECHECK_TOOL_NAME,
3535
typecheckTool,
36+
UPLOAD_IMAGE_TOOL_NAME,
37+
uploadImageTool,
38+
VIEW_IMAGE_TOOL_NAME,
39+
viewImageTool,
3640
WEB_SEARCH_TOOL_NAME,
3741
webSearchTool,
3842
WRITE_FILE_TOOL_NAME,
@@ -62,6 +66,8 @@ export const BUILD_TOOL_SET: ToolSet = {
6266
[SANDBOX_TOOL_NAME]: sandboxTool,
6367
[TERMINAL_COMMAND_TOOL_NAME]: terminalCommandTool,
6468
[TYPECHECK_TOOL_NAME]: typecheckTool,
69+
[VIEW_IMAGE_TOOL_NAME]: viewImageTool,
70+
[UPLOAD_IMAGE_TOOL_NAME]: uploadImageTool,
6571
};
6672

6773
export type ChatTools = InferUITools<typeof BUILD_TOOL_SET>;

0 commit comments

Comments
 (0)