diff --git a/packages/vscode-extension/package.json b/packages/vscode-extension/package.json
index 417d1dbc4..3fd4bb476 100644
--- a/packages/vscode-extension/package.json
+++ b/packages/vscode-extension/package.json
@@ -288,6 +288,18 @@
         "command": "RNIDE.removeLicense",
         "title": "Remove license",
         "category": "Radon IDE"
+      },
+      {
+        "enablement": "!RNIDE.MCPToolTestsRunning",
+        "command": "RNIDE.testChatToolUsage",
+        "title": "Test AI tool usage",
+        "category": "Radon IDE"
+      },
+      {
+        "enablement": "RNIDE.MCPToolTestsRunning",
+        "command": "RNIDE.terminateChatToolTest",
+        "title": "Terminate MCP tool tests",
+        "category": "Radon IDE"
       }
     ],
     "keybindings": [
diff --git a/packages/vscode-extension/src/ai/tests/aiChatTester.ts b/packages/vscode-extension/src/ai/tests/aiChatTester.ts
new file mode 100644
index 000000000..dfa64047a
--- /dev/null
+++ b/packages/vscode-extension/src/ai/tests/aiChatTester.ts
@@ -0,0 +1,261 @@
+import { randomBytes } from "crypto";
+import { readFileSync } from "fs";
+import { mkdtemp, rm } from "fs/promises";
+import { tmpdir } from "os";
+import path from "path";
+import { window, commands, Uri, workspace, StatusBarAlignment, ThemeColor } from "vscode";
+import { Logger } from "../../Logger";
+import { exec } from "../../utilities/subprocess";
+import { Platform } from "../../utilities/platform";
+import { IDE } from "../../project/ide";
+import { testCases } from "./chatTestCases";
+import { Response, ToolCallResponse, ChatTestResult, ChatTestCase, ChatData } from "./models";
+
+/** Name of the git executable, selected per platform. */
+export const GIT_PATH = Platform.select({
+  macos: "git",
+  windows: "git.exe",
+  linux: "git",
+});
+
+/** Time given to the chat to answer a single prompt, in milliseconds. */
+const TEST_TIMEOUT_MS = 10_000;
+
+/**
+ * Type guard selecting the tool invocations among the entries of an exported chat response.
+ * Comparing `kind` inline does not narrow the union, because the `kind` of `UnknownResponse`
+ * is `unknown` - which is why this utility function is necessary.
+ */
+function isToolCallResponse(response: Response): response is ToolCallResponse {
+  return response.kind === "toolInvocationSerialized";
+}
+
+/**
+ * Discards whatever the previous prompt left behind: stops the pending response, accepts the
+ * pending chat edits and reverts the first workspace folder with `git restore`.
+ */
+async function clearEdits() {
+  // Stop previous response - prevents pop-ups on `workbench.action.chat.newChat`.
+  await commands.executeCommand("workbench.action.chat.cancel");
+
+  // Move cursor to input - REQUIRED for `chatEditing.acceptAllFiles`.
+  await commands.executeCommand("workbench.panel.chat.view.copilot.focus");
+
+  // Rejection requires user confirmation, acceptance does not.
+  await commands.executeCommand("chatEditing.acceptAllFiles");
+
+  const gitUri = workspace.workspaceFolders?.[0]?.uri;
+
+  if (!gitUri) {
+    // This case should never occur when a test app is loaded.
+    return;
+  }
+
+  // Revert all changes via git - we CANNOT use `commands.executeCommand`, as it requires user confirmation.
+  await exec(GIT_PATH, ["-C", gitUri.fsPath, "restore", "."]);
+}
+
+/** Sets the `RNIDE.MCPToolTestsRunning` context key, which gates both test commands. */
+async function setGlobalTestsRunning(areTestsRunning: boolean) {
+  await commands.executeCommand("setContext", "RNIDE.MCPToolTestsRunning", areTestsRunning);
+}
+
+/**
+ * Waits until the tests are terminated or `testTimeout` milliseconds pass, whichever is first.
+ * @returns `false` when the tests were terminated, `true` when the timeout elapsed.
+ */
+function awaitTestTerminationOrTimeout(ideInstance: IDE, testTimeout: number): Promise<boolean> {
+  return new Promise((resolve) => {
+    const disposable = ideInstance.onStateChanged(() => {
+      // Using partial state here is much more cumbersome and less readable.
+      ideInstance.getState().then((state) => {
+        const testsRunning = state.workspaceConfiguration.radonAI.areMCPTestsRunning;
+        if (testsRunning === false) {
+          disposable.dispose();
+          clearTimeout(timeout);
+          resolve(false);
+        }
+      });
+    });
+
+    const timeout = setTimeout(() => {
+      disposable.dispose();
+      resolve(true);
+    }, testTimeout);
+  });
+}
+
+/** Publishes the running flag to both the context key and the IDE state. */
+async function setTestStatus(areTestsRunning: boolean, ideInstance: IDE) {
+  await setGlobalTestsRunning(areTestsRunning);
+  await ideInstance.updateState({
+    workspaceConfiguration: {
+      radonAI: {
+        areMCPTestsRunning: areTestsRunning,
+      },
+    },
+  });
+}
+
+/**
+ * Returns the existing IDE instance.
+ * @throws if no IDE instance exists.
+ */
+function getIdeInstance() {
+  const ide = IDE.getInstanceIfExists();
+
+  if (!ide) {
+    throw new Error("IDE instance is not initialized. Ensure the Radon IDE panel is open.");
+  }
+
+  return ide;
+}
+
+/**
+ * Renders the report logged after a run: one line per executed prompt, then a summary line.
+ */
+function formatTestReport(results: ChatTestResult[]): string {
+  const lines = results.map((result) => {
+    const status = result.success ? " OK " : "FAIL";
+    const cause = result.cause !== null ? ` | Error: ${result.cause}` : "";
+    return `${status} | ${result.prompt}${cause}`;
+  });
+
+  const totalCount = results.length;
+  const correctCount = results.filter((result) => result.success).length;
+  // Guard the division - an empty run would otherwise be reported as `NaN%`.
+  const correctPercent = totalCount === 0 ? "0.0" : ((correctCount / totalCount) * 100).toFixed(1);
+
+  const summary = `# TOTAL CORRECT: ${correctCount}/${totalCount} (${correctPercent}%)`;
+  return `\n=== AI TEST RESULTS ===\n${lines.join("\n")}\n${summary}`;
+}
+
+/**
+ * Executor for `RNIDE.terminateChatToolTest` VSCode command.
+ * Terminates ongoing MCP tool tests, which were initiated by `RNIDE.testChatToolUsage` VSCode command.
+ */
+export async function terminateChatToolTest() {
+  const ideInstance = getIdeInstance();
+  await setTestStatus(false, ideInstance);
+}
+
+/**
+ * Executor for `RNIDE.testChatToolUsage` VSCode command.
+ * Temporarily takes control over the AI chat tab, testing its responses to various prompts.
+ * Running this command may interfere with other VSCode functionalities as well.
+ */
+export async function testChatToolUsage(): Promise<void> {
+  const ideInstance = getIdeInstance();
+  const runStatus: ChatTestResult[] = [];
+
+  const fail = (testCase: ChatTestCase, cause: string) => {
+    runStatus.push({
+      cause,
+      success: false,
+      prompt: testCase.prompt,
+    });
+  };
+
+  const success = (testCase: ChatTestCase) => {
+    runStatus.push({
+      cause: null,
+      success: true,
+      prompt: testCase.prompt,
+    });
+  };
+
+  // - `showInformationMessage` cannot be programmatically dismissed
+  // - `showQuickPick` is a list-selection - does not look right
+  // - `createStatusBarItem` looks good, and can be dismissed both programmatically and by the user
+  const statusBar = window.createStatusBarItem(StatusBarAlignment.Left, 0);
+  statusBar.command = "RNIDE.terminateChatToolTest";
+  statusBar.text = "$(debug-stop) MCP tests running — Terminate";
+  statusBar.tooltip = "Click to terminate running E2E tests";
+  statusBar.color = new ThemeColor("statusBar.foreground");
+  statusBar.backgroundColor = new ThemeColor("statusBarItem.errorBackground");
+
+  let dir: string | undefined;
+
+  // Everything that can throw runs inside `try`, so that a failure cannot leave the tests marked
+  // as running, the status bar item visible or the temporary directory behind.
+  try {
+    await setTestStatus(true, ideInstance);
+    statusBar.show();
+
+    dir = await mkdtemp(path.join(tmpdir(), "radon-chat-exports-"));
+
+    for (const testCase of testCases) {
+      await clearEdits();
+
+      await commands.executeCommand("workbench.action.chat.newChat");
+      await commands.executeCommand("workbench.action.chat.openagent", testCase.prompt);
+
+      const shouldContinue = await awaitTestTerminationOrTimeout(ideInstance, TEST_TIMEOUT_MS);
+
+      if (!shouldContinue) {
+        fail(testCase, "User input: Test was terminated early.");
+        break;
+      }
+
+      const filepath = path.join(dir, randomBytes(8).toString("hex") + ".json");
+
+      // A filesystem path must go through `Uri.file` - `Uri.parse` expects a full URI string.
+      await commands.executeCommand("workbench.action.chat.export", Uri.file(filepath));
+
+      let chatData: ChatData;
+      try {
+        const exportedText = readFileSync(filepath).toString();
+        chatData = JSON.parse(exportedText) as ChatData;
+      } catch {
+        fail(testCase, "Internal error: `workbench.action.chat.export` did not work.");
+        continue;
+      }
+
+      if (chatData.requests.length === 0) {
+        fail(testCase, "Internal error: `workbench.action.chat.openagent` did not work.");
+        continue;
+      }
+
+      if (chatData.requests.length > 1) {
+        fail(testCase, "Internal error: `workbench.action.chat.newChat` did not work.");
+        continue;
+      }
+
+      const responses = chatData.requests[0].response;
+      const toolCalls = responses.filter(isToolCallResponse);
+
+      if (toolCalls.length === 0) {
+        fail(testCase, "No tools were called.");
+        continue;
+      }
+
+      const calledToolIds = toolCalls.map((toolCall) => toolCall.toolId);
+      const wasExpectedToolCalled = calledToolIds.some((toolId) =>
+        testCase.allowedToolIds.includes(toolId)
+      );
+
+      if (wasExpectedToolCalled) {
+        success(testCase);
+        continue;
+      }
+
+      const expected = `Expected: ${testCase.allowedToolIds.join(" | ")}`;
+      const received = `Received: ${calledToolIds.join(", ")}`;
+      fail(testCase, `${expected}. ${received}`);
+    }
+  } finally {
+    statusBar.hide();
+    statusBar.dispose();
+
+    if (dir !== undefined) {
+      rm(dir, { recursive: true, force: true }).catch((_e) => {
+        // silence the errors, it's fine
+      });
+    }
+
+    await setTestStatus(false, ideInstance);
+    await clearEdits();
+  }
+
+  Logger.log(formatTestReport(runStatus));
+}
diff --git a/packages/vscode-extension/src/ai/tests/chatTestCases.ts b/packages/vscode-extension/src/ai/tests/chatTestCases.ts
new file mode 100644
index 000000000..3d1451b03
--- /dev/null
+++ b/packages/vscode-extension/src/ai/tests/chatTestCases.ts
@@ -0,0 +1,98 @@
+import { ChatTestCase } from "./models";
+
+/**
+ * Prompts sent to the AI chat by the `RNIDE.testChatToolUsage` command. A test case passes when
+ * the chat calls at least one of its `allowedToolIds`.
+ */
+export const testCases: ChatTestCase[] = [
+  {
+    prompt: "How to use Shared Element Transitions in Reanimated 4?",
+    allowedToolIds: ["query_documentation"],
+  },
+  {
+    prompt: "How to use SETs in Reanimated?",
+    allowedToolIds: ["query_documentation"],
+  },
+  {
+    prompt: "Implement an example interaction with a local LLM in my app.",
+    allowedToolIds: ["query_documentation"],
+  },
+  {
+    prompt: "Add LLM chat to my app.",
+    allowedToolIds: ["query_documentation"],
+  },
+
+  {
+    prompt: "My button in the center of the screen is malformed.",
+    allowedToolIds: ["view_component_tree", "view_screenshot"],
+  },
+  {
+    prompt: "The orange button is ugly. Fix it.",
+    allowedToolIds: ["view_component_tree", "view_screenshot"],
+  },
+
+  {
+    prompt: "Restart the app.",
+    allowedToolIds: ["reload_application"],
+  },
+  {
+    prompt: "The app is frozen. Can you reset it?",
+    allowedToolIds: ["reload_application"],
+  },
+
+  {
+    prompt: "Why did the app just crash?",
+    allowedToolIds: ["view_application_logs"],
+  },
+  {
+    prompt: "Are there any errors in the logs?",
+    allowedToolIds: ["view_application_logs"],
+  },
+  {
+    prompt: "Debug the error thrown when I clicked the login button.",
+    allowedToolIds: ["view_application_logs", "view_component_tree"],
+  },
+
+  {
+    prompt: "Does the layout look broken to you?",
+    allowedToolIds: ["view_screenshot"],
+  },
+  {
+    prompt: "I think the text is being cut off on the right side.",
+    allowedToolIds: ["view_screenshot"],
+  },
+  {
+    prompt: "Verify if the dark mode colors are applied correctly.",
+    allowedToolIds: ["view_screenshot"],
+  },
+  {
+    prompt: "Take a look at the current screen.",
+    allowedToolIds: ["view_screenshot"],
+  },
+
+  {
+    prompt: "What is the hierarchy of the current screen?",
+    allowedToolIds: ["view_component_tree"],
+  },
+  {
+    prompt: "Show me the props passed to the Header component.",
+    allowedToolIds: ["view_component_tree"],
+  },
+  {
+    prompt: "Is the 'Submit' button currently inside a SafeAreaView?",
+    allowedToolIds: ["view_component_tree"],
+  },
+  {
+    prompt: "Find the component ID for the bottom navigation bar.",
+    allowedToolIds: ["view_component_tree"],
+  },
+
+  {
+    prompt: "Why is the banner not showing up?",
+    allowedToolIds: ["view_component_tree", "view_application_logs", "view_screenshot"],
+  },
+  {
+    prompt: "Inspect the padding on the user profile card.",
+    allowedToolIds: ["view_component_tree", "view_screenshot"],
+  },
+];
diff --git a/packages/vscode-extension/src/ai/tests/models.ts b/packages/vscode-extension/src/ai/tests/models.ts
new file mode 100644
index 000000000..7ec2100f2
--- /dev/null
+++ b/packages/vscode-extension/src/ai/tests/models.ts
@@ -0,0 +1,44 @@
+/** The part of an exported chat (`workbench.action.chat.export`) that the tests read. */
+export interface ChatData {
+  requests: Request[];
+}
+
+/** A single prompt of an exported chat together with the entries of its response. */
+export interface Request {
+  response: Response[];
+}
+
+/** Entry of a response: either a tool invocation, or anything else (ignored by the tests). */
+export type Response = ToolCallResponse | UnknownResponse;
+
+export interface UnknownResponse {
+  // `Exclude<string, "toolInvocationSerialized">` resolves to `string` (does not work)
+  kind: unknown;
+}
+
+/** Identifiers of the tools which the test cases may expect to be called. */
+export type AllowedToolId =
+  | "query_documentation"
+  | "view_screenshot"
+  | "view_component_tree"
+  | "view_application_logs"
+  | "reload_application";
+
+/** Response entry recording a tool invocation. */
+export interface ToolCallResponse {
+  kind: "toolInvocationSerialized";
+  toolId: AllowedToolId;
+}
+
+/** A prompt sent to the chat and the tools of which at least one has to be called in response. */
+export interface ChatTestCase {
+  prompt: string;
+  allowedToolIds: AllowedToolId[];
+}
+
+/** Outcome of a single test case; `cause` is `null` if and only if the test succeeded. */
+export interface ChatTestResult {
+  prompt: string;
+  success: boolean;
+  cause: string | null;
+}
diff --git a/packages/vscode-extension/src/common/State.ts b/packages/vscode-extension/src/common/State.ts
index 7e0456af7..9f82da74b 100644
--- a/packages/vscode-extension/src/common/State.ts
+++ b/packages/vscode-extension/src/common/State.ts
@@ -92,6 +92,7 @@ export type GeneralSettings = {
 
 export type RadonAISettings = {
   enableRadonAI: boolean;
+  areMCPTestsRunning: boolean;
 };
 
 export type UserInterfaceSettings = {
@@ -624,6 +625,7 @@ export const initialState: State = {
     },
     radonAI: {
       enableRadonAI: true,
+      areMCPTestsRunning: false,
     },
     userInterface: {
       panelLocation: "tab",
diff --git a/packages/vscode-extension/src/extension.ts b/packages/vscode-extension/src/extension.ts
index b1424dea2..86090b23b 100644
--- a/packages/vscode-extension/src/extension.ts
+++ b/packages/vscode-extension/src/extension.ts
@@ -40,6 +40,7 @@ import { AdminRestrictedFunctionalityError, PaywalledFunctionalityError } from "
 import { registerRadonAI } from "./ai/mcp/RadonMcpController";
 import { MaestroCodeLensProvider } from "./providers/MaestroCodeLensProvider";
 import { removeLicense } from "./utilities/license";
+import { terminateChatToolTest, testChatToolUsage } from "./ai/tests/aiChatTester";
 import { getTelemetryReporter } from "./utilities/telemetry";
 import { getEditorType } from "./utilities/editorType";
 
@@ -325,6 +326,15 @@ export async function activate(context: ExtensionContext) {
   context.subscriptions.push(
     commands.registerCommand("RNIDE.removeLicense", removeLicenseWithConfirmation)
   );
+
+  context.subscriptions.push(
+    commands.registerCommand("RNIDE.testChatToolUsage", testChatToolUsage)
+  );
+
+  context.subscriptions.push(
+    commands.registerCommand("RNIDE.terminateChatToolTest", terminateChatToolTest)
+  );
+
   // Debug adapter used by custom launch configuration, we register it in case someone tries to run the IDE configuration
   // The current workflow is that people shouldn't run it, but since it is listed under launch options it might happen
   // When it does happen, we open the IDE panel and restart the app.
diff --git a/packages/vscode-extension/src/utilities/workspaceConfiguration.ts b/packages/vscode-extension/src/utilities/workspaceConfiguration.ts
index d07fc92ee..29cfb0d86 100644
--- a/packages/vscode-extension/src/utilities/workspaceConfiguration.ts
+++ b/packages/vscode-extension/src/utilities/workspaceConfiguration.ts
@@ -19,7 +19,10 @@ const WorkspaceConfigurationKeyMap = {
     enableExperimentalElementInspector: "general.enableExperimentalElementInspector",
     inspectorExcludePattern: "general.inspectorExcludePattern",
   },
-  radonAI: { enableRadonAI: "radonAI.enabledBoolean" },
+  radonAI: {
+    enableRadonAI: "radonAI.enabledBoolean",
+    areMCPTestsRunning: "radonAI.areMCPTestsRunning",
+  },
   userInterface: {
     panelLocation: "userInterface.panelLocation",
     showDeviceFrame: "userInterface.showDeviceFrame",
@@ -58,6 +61,8 @@ export function getCurrentWorkspaceConfiguration(config: WorkspaceConfiguration)
     radonAI: {
       enableRadonAI:
         config.get<boolean>(WorkspaceConfigurationKeyMap.radonAI.enableRadonAI) ?? true,
+      areMCPTestsRunning:
+        config.get<boolean>(WorkspaceConfigurationKeyMap.radonAI.areMCPTestsRunning) ?? false,
     },
     userInterface: {
       panelLocation: