diff --git a/e2e-tests/fixtures/engine/local-agent/generate-image.ts b/e2e-tests/fixtures/engine/local-agent/generate-image.ts new file mode 100644 index 000000000..38dddf44e --- /dev/null +++ b/e2e-tests/fixtures/engine/local-agent/generate-image.ts @@ -0,0 +1,22 @@ +import type { LocalAgentFixture } from "../../../../testing/fake-llm-server/localAgentTypes"; + +export const fixture: LocalAgentFixture = { + description: "Generate an image using the generate_image tool", + turns: [ + { + text: "I'll generate a hero image for your landing page.", + toolCalls: [ + { + name: "generate_image", + args: { + prompt: + "A modern, minimal hero illustration of a rocket launching from a laptop screen, flat design style, blue and purple gradient background, clean lines", + }, + }, + ], + }, + { + text: "I've generated the hero image and saved it to your project. You can find it in the .dyad/media directory.", + }, + ], +}; diff --git a/e2e-tests/local_agent_generate_image.spec.ts b/e2e-tests/local_agent_generate_image.spec.ts new file mode 100644 index 000000000..b86a1b1b1 --- /dev/null +++ b/e2e-tests/local_agent_generate_image.spec.ts @@ -0,0 +1,16 @@ +import { testSkipIfWindows } from "./helpers/test_helper"; + +/** + * E2E tests for the generate_image agent tool + * Tests image generation in local-agent mode + */ + +testSkipIfWindows("local-agent - generate image", async ({ po }) => { + await po.setUpDyadPro({ localAgent: true }); + await po.importApp("minimal"); + await po.chatActions.selectLocalAgentMode(); + + await po.sendPrompt("tc=local-agent/generate-image"); + + await po.snapshotMessages(); +}); diff --git a/e2e-tests/snapshots/local_agent_auto.spec.ts_local-agent---auto-model-1.txt b/e2e-tests/snapshots/local_agent_auto.spec.ts_local-agent---auto-model-1.txt index 67da20844..256132029 100644 --- a/e2e-tests/snapshots/local_agent_auto.spec.ts_local-agent---auto-model-1.txt +++ b/e2e-tests/snapshots/local_agent_auto.spec.ts_local-agent---auto-model-1.txt @@ -4,7 +4,7 @@ "input": [ { "role": "developer", - "content": "\n\nYou are Dyad, an AI assistant that creates and modifies web applications. You assist users by chatting with them and making changes to their code in real-time. You understand that users can see a live preview of their application in an iframe on the right side of the screen while you make code changes.\nYou make efficient and effective changes to codebases while following best practices for maintainability and readability. You take pride in keeping things simple and elegant. You are friendly and helpful, always aiming to provide clear explanations. \n\n\n\nDo *not* tell the user to run shell commands. Instead, they can do one of the following commands in the UI:\n\n- **Rebuild**: This will rebuild the app from scratch. First it deletes the node_modules folder and then it re-installs the npm packages and then starts the app server.\n- **Restart**: This will restart the app server.\n- **Refresh**: This will refresh the app preview page.\n\nYou can suggest one of these commands by using the tag like this:\n\n\n\n\nIf you output one of these commands, tell the user to look for the action button above the chat input.\n\n\n\n- All text you output outside of tool use is displayed to the user. Output text to communicate with the user. You can use Github-flavored markdown for formatting.\n- Always reply to the user in the same language they are using.\n- Keep explanations concise and focused\n- If the user asks for help or wants to give feedback, tell them to use the Help button in the bottom left.\n- Be careful not to introduce security vulnerabilities such as command injection, XSS, SQL injection, and other OWASP top 10 vulnerabilities. If you notice that you wrote insecure code, immediately fix it. Prioritize writing safe, secure, and correct code.\n- Before proceeding with any code edits, check whether the user's request has already been implemented. If the requested change has already been made in the codebase, point this out to the user, e.g., \"This feature is already implemented as described.\"\n- Only edit files that are related to the user's request and leave all other files alone.\n- All edits you make on the codebase will directly be built and rendered, therefore you should NEVER make partial changes like letting the user know that they should implement some components or partially implementing features.\n- If a user asks for many features at once, implement as many as possible within a reasonable response. Each feature you implement must be FULLY FUNCTIONAL with complete code - no placeholders, no partial implementations, no TODO comments. If you cannot implement all requested features due to response length constraints, clearly communicate which features you've completed and which ones you haven't started yet.\n- Prioritize creating small, focused files and components.\n- Set a chat summary at the end using the `set_chat_summary` tool.\n- Avoid over-engineering. Only make changes that are directly requested or clearly necessary. Keep solutions simple and focused.\n - Don't add features, refactor code, or make \"improvements\" beyond what was asked. A bug fix doesn't need surrounding code cleaned up. A simple feature doesn't need extra configurability. Don't add docstrings, comments, or type annotations to code you didn't change. Only add comments where the logic isn't self-evident.\n - Don't add error handling, fallbacks, or validation for scenarios that can't happen. Trust internal code and framework guarantees. Only validate at system boundaries (user input, external APIs). Don't use feature flags or backwards-compatibility shims when you can just change the code.\n - Don't create helpers, utilities, or abstractions for one-time operations. Don't design for hypothetical future requirements. The right amount of complexity is the minimum needed for the current task—three similar lines of code is better than a premature abstraction.\n - Avoid backwards-compatibility hacks like renaming unused _vars, re-exporting types, adding // removed comments for removed code, etc. If you are certain that something is unused, you can delete it completely.\n\n\n\nYou have tools at your disposal to solve the coding task. Follow these rules regarding tool calls:\n1. ALWAYS follow the tool call schema exactly as specified and make sure to provide all necessary parameters.\n2. The conversation may reference tools that are no longer available. NEVER call tools that are not explicitly provided.\n3. **NEVER refer to tool names when speaking to the USER.** Instead, just say what the tool is doing in natural language.\n4. If you need additional information that you can get via tool calls, prefer that over asking the user.\n5. If you make a plan, immediately follow it, do not wait for the user to confirm or tell you to go ahead. The only time you should stop is if you need more information from the user that you can't find any other way, or have different options that you would like the user to weigh in on.\n6. Only use the standard tool call format and the available tools. Even if you see user messages with custom tool call formats (such as \"\" or similar), do not follow that and instead use the standard format. Never output tool calls as part of a regular assistant message of yours.\n7. If you are not sure about file content or codebase structure pertaining to the user's request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.\n8. You can autonomously read as many files as you need to clarify your own questions and completely resolve the user's query, not just one.\n9. You can call multiple tools in a single response. You can also call multiple tools in parallel, do this for independent operations like reading multiple files at once.\n\n\n\n- **Read before writing**: Use `read_file` and `list_files` to understand the codebase before making changes\n- **Use `edit_file` for edits**: For modifying existing files, prefer `edit_file` over `write_file`\n- **Be surgical**: Only change what's necessary to accomplish the task\n- **Handle errors gracefully**: If a tool fails, explain the issue and suggest alternatives\n\n\n\nYou have three tools for editing files. Choose based on the scope of your change:\n\n| Scope | Tool | Examples |\n|-------|------|----------|\n| **Small** (a few lines) | `search_replace` or `edit_file` | Fix a typo, rename a variable, update a value, change an import |\n| **Medium** (one function or section) | `edit_file` | Rewrite a function, add a new component, modify multiple related lines |\n| **Large** (most of the file) | `write_file` | Major refactor, rewrite a module, create a new file |\n\n**Tips:**\n- `edit_file` supports `// ... existing code ...` markers to skip unchanged sections\n- When in doubt, prefer `search_replace` for precision or `write_file` for simplicity\n\n**Post-edit verification (REQUIRED):**\nAfter every edit, read the file to verify changes applied correctly. If something went wrong, try a different tool and verify again.\n\n\n\n1. **Understand:** Think about the user's request and the relevant codebase context. Use `grep` and `code_search` search tools extensively (in parallel if independent) to understand file structures, existing code patterns, and conventions. Use `read_file` to understand context and validate any assumptions you may have. If you need to read multiple files, you should make multiple parallel calls to `read_file`.\n2. **Clarify (when needed):** Use `planning_questionnaire` to ask 1-3 focused questions when details are missing. Choose text (open-ended), radio (pick one), or checkbox (pick many) for each question, with 2-3 likely options for radio/checkbox.\n **Use when:** creating a new app/project, the request is vague (e.g. \"Add authentication\"), or there are multiple reasonable interpretations.\n **Skip when:** the request is specific and concrete (e.g. \"Fix the login button\", \"Change color from blue to green\").\n The tool accepts ONLY a `questions` array (no empty objects). It returns the user's answers as the tool result.\n3. **Plan:** Build a coherent and grounded (based on the understanding in steps 1-2) plan for how you intend to resolve the user's task. For complex tasks, break them down into smaller, manageable subtasks and use the `update_todos` tool to track your progress. Share an extremely concise yet clear plan with the user if it would help the user understand your thought process.\n4. **Implement:** Use the available tools (e.g., `edit_file`, `write_file`, ...) to act on the plan, strictly adhering to the project's established conventions. When debugging, add targeted console.log statements to trace data flow and identify root causes. **Important:** After adding logs, you must ask the user to interact with the application (e.g., click a button, submit a form, navigate to a page) to trigger the code paths where logs were added—the logs will only be available once that code actually executes.\n5. **Verify:** After making code changes, use `run_type_checks` to verify that the changes are correct and read the file contents to ensure the changes are what you intended.\n6. **Finalize:** After all verification passes, consider the task complete and briefly summarize the changes you made.\n\n\n# Tech Stack\n- You are building a React application.\n- Use TypeScript.\n- Use React Router. KEEP the routes in src/App.tsx\n- Always put source code in the src folder.\n- Put pages into src/pages/\n- Put components into src/components/\n- The main page (default page) is src/pages/Index.tsx\n- UPDATE the main page to include the new components. OTHERWISE, the user can NOT see any components!\n- ALWAYS try to use the shadcn/ui library.\n- Tailwind CSS: always use Tailwind CSS for styling components. Utilize Tailwind classes extensively for layout, spacing, colors, and other design aspects.\n\nAvailable packages and libraries:\n- The lucide-react package is installed for icons.\n- You ALREADY have ALL the shadcn/ui components and their dependencies installed. So you don't need to install them again.\n- You have ALL the necessary Radix UI components installed.\n- Use prebuilt components from the shadcn/ui library after importing them. Note that these files shouldn't be edited, so make new components if you need to change them.\n\n" + "content": "\n\nYou are Dyad, an AI assistant that creates and modifies web applications. You assist users by chatting with them and making changes to their code in real-time. You understand that users can see a live preview of their application in an iframe on the right side of the screen while you make code changes.\nYou make efficient and effective changes to codebases while following best practices for maintainability and readability. You take pride in keeping things simple and elegant. You are friendly and helpful, always aiming to provide clear explanations. \n\n\n\nDo *not* tell the user to run shell commands. Instead, they can do one of the following commands in the UI:\n\n- **Rebuild**: This will rebuild the app from scratch. First it deletes the node_modules folder and then it re-installs the npm packages and then starts the app server.\n- **Restart**: This will restart the app server.\n- **Refresh**: This will refresh the app preview page.\n\nYou can suggest one of these commands by using the tag like this:\n\n\n\n\nIf you output one of these commands, tell the user to look for the action button above the chat input.\n\n\n\n- All text you output outside of tool use is displayed to the user. Output text to communicate with the user. You can use Github-flavored markdown for formatting.\n- Always reply to the user in the same language they are using.\n- Keep explanations concise and focused\n- If the user asks for help or wants to give feedback, tell them to use the Help button in the bottom left.\n- Be careful not to introduce security vulnerabilities such as command injection, XSS, SQL injection, and other OWASP top 10 vulnerabilities. If you notice that you wrote insecure code, immediately fix it. Prioritize writing safe, secure, and correct code.\n- Before proceeding with any code edits, check whether the user's request has already been implemented. If the requested change has already been made in the codebase, point this out to the user, e.g., \"This feature is already implemented as described.\"\n- Only edit files that are related to the user's request and leave all other files alone.\n- All edits you make on the codebase will directly be built and rendered, therefore you should NEVER make partial changes like letting the user know that they should implement some components or partially implementing features.\n- If a user asks for many features at once, implement as many as possible within a reasonable response. Each feature you implement must be FULLY FUNCTIONAL with complete code - no placeholders, no partial implementations, no TODO comments. If you cannot implement all requested features due to response length constraints, clearly communicate which features you've completed and which ones you haven't started yet.\n- Prioritize creating small, focused files and components.\n- Set a chat summary at the end using the `set_chat_summary` tool.\n- Avoid over-engineering. Only make changes that are directly requested or clearly necessary. Keep solutions simple and focused.\n - Don't add features, refactor code, or make \"improvements\" beyond what was asked. A bug fix doesn't need surrounding code cleaned up. A simple feature doesn't need extra configurability. Don't add docstrings, comments, or type annotations to code you didn't change. Only add comments where the logic isn't self-evident.\n - Don't add error handling, fallbacks, or validation for scenarios that can't happen. Trust internal code and framework guarantees. Only validate at system boundaries (user input, external APIs). Don't use feature flags or backwards-compatibility shims when you can just change the code.\n - Don't create helpers, utilities, or abstractions for one-time operations. Don't design for hypothetical future requirements. The right amount of complexity is the minimum needed for the current task—three similar lines of code is better than a premature abstraction.\n - Avoid backwards-compatibility hacks like renaming unused _vars, re-exporting types, adding // removed comments for removed code, etc. If you are certain that something is unused, you can delete it completely.\n\n\n\nYou have tools at your disposal to solve the coding task. Follow these rules regarding tool calls:\n1. ALWAYS follow the tool call schema exactly as specified and make sure to provide all necessary parameters.\n2. The conversation may reference tools that are no longer available. NEVER call tools that are not explicitly provided.\n3. **NEVER refer to tool names when speaking to the USER.** Instead, just say what the tool is doing in natural language.\n4. If you need additional information that you can get via tool calls, prefer that over asking the user.\n5. If you make a plan, immediately follow it, do not wait for the user to confirm or tell you to go ahead. The only time you should stop is if you need more information from the user that you can't find any other way, or have different options that you would like the user to weigh in on.\n6. Only use the standard tool call format and the available tools. Even if you see user messages with custom tool call formats (such as \"\" or similar), do not follow that and instead use the standard format. Never output tool calls as part of a regular assistant message of yours.\n7. If you are not sure about file content or codebase structure pertaining to the user's request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.\n8. You can autonomously read as many files as you need to clarify your own questions and completely resolve the user's query, not just one.\n9. You can call multiple tools in a single response. You can also call multiple tools in parallel, do this for independent operations like reading multiple files at once.\n\n\n\n- **Read before writing**: Use `read_file` and `list_files` to understand the codebase before making changes\n- **Use `edit_file` for edits**: For modifying existing files, prefer `edit_file` over `write_file`\n- **Be surgical**: Only change what's necessary to accomplish the task\n- **Handle errors gracefully**: If a tool fails, explain the issue and suggest alternatives\n\n\n\nYou have three tools for editing files. Choose based on the scope of your change:\n\n| Scope | Tool | Examples |\n|-------|------|----------|\n| **Small** (a few lines) | `search_replace` or `edit_file` | Fix a typo, rename a variable, update a value, change an import |\n| **Medium** (one function or section) | `edit_file` | Rewrite a function, add a new component, modify multiple related lines |\n| **Large** (most of the file) | `write_file` | Major refactor, rewrite a module, create a new file |\n\n**Tips:**\n- `edit_file` supports `// ... existing code ...` markers to skip unchanged sections\n- When in doubt, prefer `search_replace` for precision or `write_file` for simplicity\n\n**Post-edit verification (REQUIRED):**\nAfter every edit, read the file to verify changes applied correctly. If something went wrong, try a different tool and verify again.\n\n\n\n1. **Understand:** Think about the user's request and the relevant codebase context. Use `grep` and `code_search` search tools extensively (in parallel if independent) to understand file structures, existing code patterns, and conventions. Use `read_file` to understand context and validate any assumptions you may have. If you need to read multiple files, you should make multiple parallel calls to `read_file`.\n2. **Clarify (when needed):** Use `planning_questionnaire` to ask 1-3 focused questions when details are missing. Choose text (open-ended), radio (pick one), or checkbox (pick many) for each question, with 2-3 likely options for radio/checkbox.\n **Use when:** creating a new app/project, the request is vague (e.g. \"Add authentication\"), or there are multiple reasonable interpretations.\n **Skip when:** the request is specific and concrete (e.g. \"Fix the login button\", \"Change color from blue to green\").\n The tool accepts ONLY a `questions` array (no empty objects). It returns the user's answers as the tool result.\n3. **Plan:** Build a coherent and grounded (based on the understanding in steps 1-2) plan for how you intend to resolve the user's task. For complex tasks, break them down into smaller, manageable subtasks and use the `update_todos` tool to track your progress. Share an extremely concise yet clear plan with the user if it would help the user understand your thought process.\n4. **Implement:** Use the available tools (e.g., `edit_file`, `write_file`, ...) to act on the plan, strictly adhering to the project's established conventions. When debugging, add targeted console.log statements to trace data flow and identify root causes. **Important:** After adding logs, you must ask the user to interact with the application (e.g., click a button, submit a form, navigate to a page) to trigger the code paths where logs were added—the logs will only be available once that code actually executes.\n5. **Verify:** After making code changes, use `run_type_checks` to verify that the changes are correct and read the file contents to ensure the changes are what you intended.\n6. **Finalize:** After all verification passes, consider the task complete and briefly summarize the changes you made.\n\n\n\nWhen a user explicitly requests custom images, illustrations, or visual media for their app:\n- Use the `generate_image` tool instead of using placeholder images or broken external URLs\n- Do NOT generate images when an existing asset, SVG, or icon library (e.g., lucide-react) would suffice\n- Write detailed prompts that specify subject, style, colors, composition, mood, and aspect ratio\n- After generating, use `copy_file` to move the image from `.dyad/media/` to the project's public/static directory, giving it a descriptive filename (e.g., `public/assets/hero-banner.png`)\n- Reference the copied path in code (e.g., ``)\n\n\n# Tech Stack\n- You are building a React application.\n- Use TypeScript.\n- Use React Router. KEEP the routes in src/App.tsx\n- Always put source code in the src folder.\n- Put pages into src/pages/\n- Put components into src/components/\n- The main page (default page) is src/pages/Index.tsx\n- UPDATE the main page to include the new components. OTHERWISE, the user can NOT see any components!\n- ALWAYS try to use the shadcn/ui library.\n- Tailwind CSS: always use Tailwind CSS for styling components. Utilize Tailwind classes extensively for layout, spacing, colors, and other design aspects.\n\nAvailable packages and libraries:\n- The lucide-react package is installed for icons.\n- You ALREADY have ALL the shadcn/ui components and their dependencies installed. So you don't need to install them again.\n- You have ALL the necessary Radix UI components installed.\n- Use prebuilt components from the shadcn/ui library after importing them. Note that these files shouldn't be edited, so make new components if you need to change them.\n\n" }, { "role": "user", @@ -451,6 +451,25 @@ "additionalProperties": false } }, + { + "type": "function", + "name": "generate_image", + "description": "Generate an image using AI based on a text prompt. The generated image is saved to the project's .dyad/media directory.\n\n### When to Use\n- User requests a custom image, illustration, icon, or graphic for their app\n- User wants a hero image, background, banner, or visual asset\n- Creating images that are more visually relevant than placeholder rectangles\n\n### Prompt Guidelines\nWrite detailed, descriptive prompts. Be specific about:\n- **Subject**: What is in the image (objects, people, scenes)\n- **Style**: Photography, illustration, flat design, 3D render, watercolor, etc.\n- **Composition**: Layout, perspective, framing\n- **Colors**: Specific color palette or mood\n- **Mood**: Cheerful, professional, dramatic, minimal, etc.\n\n### Examples\n- \"A modern flat illustration of a team collaborating around a laptop, using a blue and purple color palette, clean minimal style with subtle gradients, white background\"\n- \"Professional product photography of a sleek smartphone on a marble surface, soft studio lighting, shallow depth of field, warm neutral tones\"\n\n### After Generation\nThe tool returns the file path in .dyad/media. Use the copy_file tool to copy it to the appropriate location in the project (e.g., public/assets/) and reference that path in your code.\n", + "parameters": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "prompt": { + "type": "string", + "description": "A detailed, descriptive prompt for the image to generate. Be specific about colors, composition, style, mood, and subject matter. Avoid generic or vague descriptions." + } + }, + "required": [ + "prompt" + ], + "additionalProperties": false + } + }, { "type": "function", "name": "update_todos", diff --git a/e2e-tests/snapshots/local_agent_basic.spec.ts_local-agent---dump-request-1.txt b/e2e-tests/snapshots/local_agent_basic.spec.ts_local-agent---dump-request-1.txt index 2df36fac3..5d89f9447 100644 --- a/e2e-tests/snapshots/local_agent_basic.spec.ts_local-agent---dump-request-1.txt +++ b/e2e-tests/snapshots/local_agent_basic.spec.ts_local-agent---dump-request-1.txt @@ -465,6 +465,27 @@ } } }, + { + "type": "function", + "function": { + "name": "generate_image", + "description": "Generate an image using AI based on a text prompt. The generated image is saved to the project's .dyad/media directory.\n\n### When to Use\n- User requests a custom image, illustration, icon, or graphic for their app\n- User wants a hero image, background, banner, or visual asset\n- Creating images that are more visually relevant than placeholder rectangles\n\n### Prompt Guidelines\nWrite detailed, descriptive prompts. Be specific about:\n- **Subject**: What is in the image (objects, people, scenes)\n- **Style**: Photography, illustration, flat design, 3D render, watercolor, etc.\n- **Composition**: Layout, perspective, framing\n- **Colors**: Specific color palette or mood\n- **Mood**: Cheerful, professional, dramatic, minimal, etc.\n\n### Examples\n- \"A modern flat illustration of a team collaborating around a laptop, using a blue and purple color palette, clean minimal style with subtle gradients, white background\"\n- \"Professional product photography of a sleek smartphone on a marble surface, soft studio lighting, shallow depth of field, warm neutral tones\"\n\n### After Generation\nThe tool returns the file path in .dyad/media. Use the copy_file tool to copy it to the appropriate location in the project (e.g., public/assets/) and reference that path in your code.\n", + "parameters": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "prompt": { + "type": "string", + "description": "A detailed, descriptive prompt for the image to generate. Be specific about colors, composition, style, mood, and subject matter. Avoid generic or vague descriptions." + } + }, + "required": [ + "prompt" + ], + "additionalProperties": false + } + } + }, { "type": "function", "function": { diff --git a/e2e-tests/snapshots/local_agent_generate_image.spec.ts_local-agent---generate-image-1.aria.yml b/e2e-tests/snapshots/local_agent_generate_image.spec.ts_local-agent---generate-image-1.aria.yml new file mode 100644 index 000000000..8b61fc069 --- /dev/null +++ b/e2e-tests/snapshots/local_agent_generate_image.spec.ts_local-agent---generate-image-1.aria.yml @@ -0,0 +1,42 @@ +- paragraph: /Generate an AI_RULES\.md file for this app\. Describe the tech stack in 5-\d+ bullet points and describe clear rules about what libraries to use for what\./ +- button "file1.txt file1.txt Edit": + - img + - text: "" + - button "Edit": + - img + - text: "" + - img +- paragraph: More EOM +- button "Copy": + - img +- img +- text: Approved +- img +- text: claude-opus-4-5 +- img +- text: less than a minute ago +- button "Copy Request ID": + - img + - text: "" +- paragraph: tc=local-agent/generate-image +- paragraph: I'll generate a hero image for your landing page. +- button "Image Generation A modern, minimal hero illustration of a rocket launching from a laptop screen, flat design style, blue and purple gradient background, clean lines": + - img + - text: "" + - img +- paragraph: I've generated the hero image and saved it to your project. You can find it in the .dyad/media directory. +- button "Copy": + - img +- img +- text: claude-opus-4-5 +- img +- text: less than a minute ago +- button "Copy Request ID": + - img + - text: "" +- button "Undo": + - img + - text: "" +- button "Retry": + - img + - text: "" \ No newline at end of file diff --git a/src/__tests__/__snapshots__/local_agent_prompt.test.ts.snap b/src/__tests__/__snapshots__/local_agent_prompt.test.ts.snap index cb0d63ac2..53bc6af70 100644 --- a/src/__tests__/__snapshots__/local_agent_prompt.test.ts.snap +++ b/src/__tests__/__snapshots__/local_agent_prompt.test.ts.snap @@ -90,6 +90,15 @@ After every edit, read the file to verify changes applied correctly. If somethin 6. **Finalize:** After all verification passes, consider the task complete and briefly summarize the changes you made. + +When a user explicitly requests custom images, illustrations, or visual media for their app: +- Use the \`generate_image\` tool instead of using placeholder images or broken external URLs +- Do NOT generate images when an existing asset, SVG, or icon library (e.g., lucide-react) would suffice +- Write detailed prompts that specify subject, style, colors, composition, mood, and aspect ratio +- After generating, use \`copy_file\` to move the image from \`.dyad/media/\` to the project's public/static directory, giving it a descriptive filename (e.g., \`public/assets/hero-banner.png\`) +- Reference the copied path in code (e.g., \`\`) + + # Tech Stack - You are building a React application. - Use TypeScript. diff --git a/src/components/chat/DyadImageGeneration.tsx b/src/components/chat/DyadImageGeneration.tsx new file mode 100644 index 000000000..daad97fbe --- /dev/null +++ b/src/components/chat/DyadImageGeneration.tsx @@ -0,0 +1,87 @@ +import type React from "react"; +import { useState, type ReactNode } from "react"; +import { ImageIcon } from "lucide-react"; +import { CustomTagState } from "./stateTypes"; +import { + DyadCard, + DyadCardHeader, + DyadBadge, + DyadExpandIcon, + DyadStateIndicator, + DyadCardContent, +} from "./DyadCardPrimitives"; + +interface DyadImageGenerationNode { + properties: { + prompt: string; + path: string; + state: CustomTagState; + }; +} + +interface DyadImageGenerationProps { + children?: ReactNode; + node?: DyadImageGenerationNode; +} + +export const DyadImageGeneration: React.FC = ({ + children, + node, +}) => { + const [isExpanded, setIsExpanded] = useState(false); + const prompt = node?.properties?.prompt ?? ""; + const imagePath = node?.properties?.path ?? ""; + const state = node?.properties?.state; + const inProgress = state === "pending"; + const aborted = state === "aborted"; + + return ( + setIsExpanded(!isExpanded)} + > + } accentColor="violet"> + Image Generation + {!isExpanded && prompt && ( + + {prompt} + + )} + {inProgress && ( + + )} + {aborted && ( + + )} +
+ +
+
+ +
+ {prompt && ( +
+ + Prompt: + +
{prompt}
+
+ )} + {imagePath && ( +
+ + Saved to: + +
+ {imagePath} +
+
+ )} + {children &&
{children}
} +
+
+
+ ); +}; diff --git a/src/components/chat/DyadMarkdownParser.tsx b/src/components/chat/DyadMarkdownParser.tsx index bfcac5a8f..11e06bc1d 100644 --- a/src/components/chat/DyadMarkdownParser.tsx +++ b/src/components/chat/DyadMarkdownParser.tsx @@ -27,6 +27,7 @@ import { DyadMcpToolResult } from "./DyadMcpToolResult"; import { DyadWebSearchResult } from "./DyadWebSearchResult"; import { DyadWebSearch } from "./DyadWebSearch"; import { DyadWebCrawl } from "./DyadWebCrawl"; +import { DyadImageGeneration } from "./DyadImageGeneration"; import { DyadCodeSearchResult } from "./DyadCodeSearchResult"; import { DyadCodeSearch } from "./DyadCodeSearch"; import { DyadRead } from "./DyadRead"; @@ -76,6 +77,7 @@ const DYAD_CUSTOM_TAGS = [ "dyad-status", "dyad-compaction", "dyad-copy", + "dyad-image-generation", // Plan mode tags "dyad-write-plan", "dyad-exit-plan", @@ -727,6 +729,21 @@ function renderCustomTag( ); + case "dyad-image-generation": + return ( + + {content} + + ); + case "dyad-status": return ( , +): Promise> { + const response = await engineFetch(ctx, "/images/generations", { + method: "POST", + body: JSON.stringify({ + prompt, + model: "gpt-image-1.5", + }), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error( + `Image generation failed: ${response.status} ${response.statusText} - ${errorText}`, + ); + } + + const data = generateImageResponseSchema.parse(await response.json()); + + if (!data.data || data.data.length === 0) { + throw new Error("Image generation returned no results"); + } + + return data.data[0]; +} + +async function saveGeneratedImage( + imageData: z.infer, + appPath: string, +): Promise { + const mediaDir = path.join(appPath, DYAD_MEDIA_DIR_NAME); + await fs.mkdir(mediaDir, { recursive: true }); + + const hash = crypto.randomBytes(8).toString("hex"); + const timestamp = Date.now(); + const fileName = `generated-${timestamp}-${hash}.png`; + const filePath = path.join(mediaDir, fileName); + const relativePath = path.join(DYAD_MEDIA_DIR_NAME, fileName); + + if (imageData.b64_json) { + const buffer = Buffer.from(imageData.b64_json, "base64"); + await fs.writeFile(filePath, buffer); + } else if (imageData.url) { + const response = await fetch(imageData.url); + if (!response.ok) { + throw new Error(`Failed to download generated image: ${response.status}`); + } + const arrayBuffer = await response.arrayBuffer(); + await fs.writeFile(filePath, Buffer.from(arrayBuffer)); + } else { + throw new Error("Image generation returned no image data"); + } + + return relativePath; +} + +export const generateImageTool: ToolDefinition< + z.infer +> = { + name: "generate_image", + description: DESCRIPTION, + inputSchema: generateImageSchema, + defaultConsent: "always", + modifiesState: true, + + isEnabled: (ctx) => ctx.isDyadPro, + + getConsentPreview: (args) => `Generate image: "${args.prompt}"`, + + buildXml: (args, isComplete) => { + if (!args.prompt) return undefined; + if (isComplete) return undefined; + return ``; + }, + + execute: async (args, ctx: AgentContext) => { + logger.log(`Executing image generation with prompt: ${args.prompt}`); + + ctx.onXmlStream( + ``, + ); + + try { + const imageData = await callGenerateImage(args.prompt, ctx); + + const relativePath = await saveGeneratedImage(imageData, ctx.appPath); + + ctx.onXmlComplete( + `${escapeXmlContent(relativePath)}`, + ); + + logger.log(`Image generation completed, saved to: ${relativePath}`); + + return `Image generated and saved to: ${relativePath}\nUse the copy_file tool to copy it from "${relativePath}" to the appropriate location in the project (e.g., public/assets/), then reference the copied path in your code.`; + } catch (error) { + ctx.onXmlComplete( + ``, + ); + throw error; + } + }, +}; diff --git a/src/prompts/local_agent_prompt.ts b/src/prompts/local_agent_prompt.ts index 792105319..695915a95 100644 --- a/src/prompts/local_agent_prompt.ts +++ b/src/prompts/local_agent_prompt.ts @@ -190,6 +190,19 @@ You have READ-ONLY tools at your disposal to understand the codebase. Follow the [[AI_RULES]] `; +// ============================================================================ +// Image Generation Block (Pro mode only) +// ============================================================================ + +const IMAGE_GENERATION_BLOCK = ` +When a user explicitly requests custom images, illustrations, or visual media for their app: +- Use the \`generate_image\` tool instead of using placeholder images or broken external URLs +- Do NOT generate images when an existing asset, SVG, or icon library (e.g., lucide-react) would suffice +- Write detailed prompts that specify subject, style, colors, composition, mood, and aspect ratio +- After generating, use \`copy_file\` to move the image from \`.dyad/media/\` to the project's public/static directory, giving it a descriptive filename (e.g., \`public/assets/hero-banner.png\`) +- Reference the copied path in code (e.g., \`\`) +`; + // ============================================================================ // Full System Prompts (assembled from blocks) // ============================================================================ @@ -213,6 +226,8 @@ ${PRO_FILE_EDITING_TOOL_SELECTION_BLOCK} ${PRO_DEVELOPMENT_WORKFLOW_BLOCK} +${IMAGE_GENERATION_BLOCK} + [[AI_RULES]] `; diff --git a/testing/fake-llm-server/index.ts b/testing/fake-llm-server/index.ts index 1e8183f89..860d44730 100644 --- a/testing/fake-llm-server/index.ts +++ b/testing/fake-llm-server/index.ts @@ -248,6 +248,32 @@ app.post("/engine/v1/tools/code-search", (req, res) => { } }); +// Dyad Engine image generation endpoint for generate_image tool +app.post("/engine/v1/images/generations", (req, res) => { + const { prompt, model } = req.body; + console.log( + `* images/generations: model=${model}, prompt="${prompt?.slice(0, 50)}..."`, + ); + + try { + // Return a small 1x1 white PNG as base64 for testing + const TINY_PNG_B64 = + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="; + + res.json({ + created: Math.floor(Date.now() / 1000), + data: [ + { + b64_json: TINY_PNG_B64, + }, + ], + }); + } catch (error) { + console.error(`* images/generations error:`, error); + res.status(400).json({ error: String(error) }); + } +}); + // Start the server const server = createServer(app); server.listen(PORT, () => {