Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
542f3b1
Add initial testing code
latekvo Dec 10, 2025
35aac32
better typing, result guards
latekvo Dec 10, 2025
d7ecdf5
add allowed tools check
latekvo Dec 10, 2025
3e29cf1
add basic error reporting, cleanup
latekvo Dec 10, 2025
66aa875
register command for running tests
latekvo Dec 10, 2025
acc153d
more metadata in status reports
latekvo Dec 10, 2025
7f0734c
Merge branch 'main' into @latekvo/create-ai-tests
latekvo Dec 10, 2025
51d77e4
stylistic fix
latekvo Dec 10, 2025
557ccf0
Merge branch '@latekvo/create-ai-tests' of https://github.com/softwar…
latekvo Dec 10, 2025
9b8f3d8
initial file-reading impl (syncing work, branch switch)
latekvo Dec 10, 2025
4da6725
fix workaround for not awaiting completion
latekvo Dec 10, 2025
a36f7ee
system-agnostic random path
latekvo Dec 10, 2025
4b6b795
add undo before new chat
latekvo Dec 10, 2025
898d595
ensure agent mode is used
latekvo Dec 10, 2025
e3eed04
remove all popups requiring human input, add todos
latekvo Dec 11, 2025
f68ddad
Merge remote-tracking branch 'origin/main' into @latekvo/create-ai-tests
latekvo Dec 11, 2025
2b7df4d
sync
latekvo Dec 12, 2025
e559400
prevent more popups
latekvo Dec 12, 2025
317d18a
pretty result printing
latekvo Dec 12, 2025
b661880
add git restore on each run
latekvo Dec 15, 2025
df1c643
Merge remote-tracking branch 'origin/main' into @latekvo/create-ai-tests
latekvo Dec 15, 2025
106dfb7
cleanup
latekvo Dec 15, 2025
e3b278c
replace sleep with known util
latekvo Dec 15, 2025
f0b8865
add initial termination implementation
latekvo Dec 16, 2025
204d571
hook up the state manager
latekvo Dec 18, 2025
c8021e0
remove resolved todo
latekvo Dec 18, 2025
b9889e2
fix: use correct status update command
latekvo Dec 18, 2025
54595ba
fix invalid command name
latekvo Dec 18, 2025
45a66a9
fix type errors, fix typecasting ide as state manager
latekvo Dec 18, 2025
104548e
use partial state instead
latekvo Dec 18, 2025
4bdb9f6
fix naming
latekvo Dec 18, 2025
066f377
await test state setting
latekvo Dec 19, 2025
3ceb9e0
minor comment change
latekvo Dec 19, 2025
1f25e8a
prevent multiple launches of the tool tests
latekvo Dec 19, 2025
5608e80
add docstrings for the command executors
latekvo Dec 19, 2025
9ea5386
await global context setting
latekvo Dec 19, 2025
c924ef8
Merge remote-tracking branch 'origin/main' into @latekvo/create-ai-tests
latekvo Dec 30, 2025
a24bfab
add more test cases
latekvo Jan 7, 2026
d606541
simplify timeout and termination code, remove unwanted test case
latekvo Jan 7, 2026
cac02bd
improve output formatting, data
latekvo Jan 9, 2026
efc9fcf
fix bool inversion
latekvo Jan 12, 2026
d38f74b
finally implement early termination
latekvo Jan 12, 2026
ea4cf4a
fix a lot of typos and minor issues
latekvo Jan 12, 2026
51bcee9
cleanup transcript directory
latekvo Jan 12, 2026
336c1c0
await final edit clear
latekvo Jan 12, 2026
097eb1c
fix tests failing on success
latekvo Jan 12, 2026
5a5fa17
add basic Observable
latekvo Jan 12, 2026
a05d0f4
invoke listeners on observable set
latekvo Jan 12, 2026
9ac3b45
revert Observable after further consideration
latekvo Jan 12, 2026
179136b
move radonAI global position, revert Observable again
latekvo Jan 12, 2026
5706a86
simplify state checking logic
latekvo Jan 12, 2026
28f6573
fix workspace configs
latekvo Jan 12, 2026
b53a0cf
isolate types and test cases
latekvo Jan 12, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions packages/vscode-extension/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,18 @@
"command": "RNIDE.removeLicense",
"title": "Remove license",
"category": "Radon IDE"
},
{
"enablement": "!RNIDE.MCPToolTestsRunning",
"command": "RNIDE.testChatToolUsage",
"title": "Test AI tool usage",
"category": "Radon IDE"
},
{
"enablement": "RNIDE.MCPToolTestsRunning",
"command": "RNIDE.terminateChatToolTest",
"title": "Terminate MCP tool tests",
"category": "Radon IDE"
}
],
"keybindings": [
Expand Down
231 changes: 231 additions & 0 deletions packages/vscode-extension/src/ai/tests/aiChatTester.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
import { randomBytes } from "crypto";
import { readFileSync } from "fs";
import { mkdtemp, rm } from "fs/promises";
import { tmpdir } from "os";
import path from "path";
import { window, commands, Uri, workspace, StatusBarAlignment, ThemeColor } from "vscode";
import { Logger } from "../../Logger";
import { exec } from "../../utilities/subprocess";
import { Platform } from "../../utilities/platform";
import { IDE } from "../../project/ide";
import { testCases } from "./chatTestCases";
import { Response, ToolCallResponse, ChatTestResult, ChatTestCase, ChatData } from "./models";

/**
 * Name of the git executable handed to `exec`, chosen per platform through `Platform.select`.
 * Windows uses the `.exe` suffix; macOS and Linux use the bare `git` command.
 */
export const GIT_PATH = Platform.select({
  windows: "git.exe",
  linux: "git",
  macos: "git",
});

/**
 * Type guard that tells serialized tool invocations apart from every other response entry.
 *
 * A dedicated guard is needed because the non-tool variant cannot be typed as
 * `Exclude<string, "literal">` (that resolves to plain `string`), so the union
 * cannot be narrowed by the compiler on its own.
 *
 * @param response A single entry of a request's `response` list.
 * @returns `true` when `response.kind` is exactly `"toolInvocationSerialized"`.
 */
function isToolCallResponse(response: Response): response is ToolCallResponse {
  const { kind } = response;
  return kind === "toolInvocationSerialized";
}

/**
 * Brings the chat and the workspace back to a clean state between test cases.
 *
 * Cancels the current chat response, accepts all pending chat edits and then runs
 * `git restore .` inside the first workspace folder, which discards uncommitted
 * changes to tracked files there. Does nothing past the chat commands when no
 * workspace folder is available.
 */
async function clearEdits() {
  // Stop previous response - prevents pop-ups on `workbench.action.chat.newChat`.
  await commands.executeCommand("workbench.action.chat.cancel");

  // Move cursor to input - REQUIRED for `chatEditing.acceptAllFiles`.
  await commands.executeCommand("workbench.panel.chat.view.copilot.focus");

  // Rejection requires user confirmation, acceptance does not.
  await commands.executeCommand("chatEditing.acceptAllFiles");

  // `workspaceFolders` can be `undefined` or hold no folders at all, so the
  // element access has to be guarded as well as the list itself.
  const gitUri = workspace.workspaceFolders?.[0]?.uri;

  if (!gitUri) {
    // This case should never occur when a test app is loaded.
    return;
  }

  // Revert all changes via git - we CANNOT use `commands.executeCommand`, as it requires user confirmation.
  await exec(GIT_PATH, ["-C", gitUri.fsPath, "restore", "."]);
}

/**
 * Publishes the running flag as the `RNIDE.MCPToolTestsRunning` context key
 * through the `setContext` command.
 *
 * @param areTestsRunning Value to store under the context key.
 */
async function setGlobalTestsRunning(areTestsRunning: boolean) {
  const contextKey = "RNIDE.MCPToolTestsRunning";
  await commands.executeCommand("setContext", contextKey, areTestsRunning);
}

/**
 * Waits until either the tests are terminated or `testTimeout` milliseconds pass.
 *
 * Termination is detected through `radonAI.areMCPTestsRunning` being `false` in the
 * IDE state. The state is inspected once right after subscribing, so a termination
 * that happened before this function was called is not missed, and again on every
 * state change.
 *
 * @param ideInstance IDE instance whose state is observed.
 * @param testTimeout Maximum time to wait, in milliseconds.
 * @returns `true` when the timeout elapsed (the run should continue),
 *          `false` when the tests were terminated.
 */
function awaitTestTerminationOrTimeout(ideInstance: IDE, testTimeout: number): Promise<boolean> {
  return new Promise((resolve) => {
    let settled = false;

    // Single exit path: releases the listener and the timer exactly once.
    const finish = (timedOut: boolean) => {
      if (settled) {
        return;
      }
      settled = true;
      clearTimeout(timeout);
      disposable.dispose();
      resolve(timedOut);
    };

    // Using partial state here is much more cumbersome and less readable.
    const checkForTermination = () => {
      ideInstance.getState().then((state) => {
        const testsRunning = state.workspaceConfiguration.radonAI.areMCPTestsRunning;
        if (testsRunning === false) {
          finish(false);
        }
      });
    };

    const timeout = setTimeout(() => finish(true), testTimeout);
    const disposable = ideInstance.onStateChanged(checkForTermination);

    // Catch a termination that was requested before the listener was registered.
    checkForTermination();
  });
}

/**
 * Records whether the MCP tool tests are running in both places that track it:
 * the `RNIDE.MCPToolTestsRunning` context key first, then the IDE state
 * (`workspaceConfiguration.radonAI.areMCPTestsRunning`).
 *
 * @param areTestsRunning New value of the flag.
 * @param ideInstance IDE instance whose state is updated.
 */
async function setTestStatus(areTestsRunning: boolean, ideInstance: IDE) {
  await setGlobalTestsRunning(areTestsRunning);

  const radonAI = { areMCPTestsRunning: areTestsRunning };
  await ideInstance.updateState({ workspaceConfiguration: { radonAI } });
}

/**
 * Returns the existing IDE instance.
 *
 * @throws Error when `IDE.getInstanceIfExists()` yields nothing.
 */
function getIdeInstance() {
  const existingIde = IDE.getInstanceIfExists();

  if (existingIde) {
    return existingIde;
  }

  throw new Error("IDE instance is not initialized. Ensure the Radon IDE panel is open.");
}

/**
 * Executor for the `RNIDE.terminateChatToolTest` VSCode command.
 * Marks the MCP tool tests as no longer running, which stops a run started
 * by the `RNIDE.testChatToolUsage` VSCode command.
 */
export async function terminateChatToolTest() {
  await setTestStatus(false, getIdeInstance());
}

/**
 * Renders the collected results as the report that is written to the log.
 * An empty run is reported as `0/0 (0.0%)` rather than throwing or printing `NaN`.
 */
function formatChatTestReport(runStatus: ChatTestResult[]): string {
  const resultLines = runStatus
    .map((v) => `${v.success ? " OK " : "FAIL"}${v.cause !== null ? ` | Error: ${v.cause}` : ""}`)
    .join("\n");

  const totalCount = runStatus.length;
  const correctCount = runStatus.filter((v) => v.success).length;
  const correctPercent = (totalCount === 0 ? 0 : (correctCount / totalCount) * 100).toFixed(1);

  return `\n=== AI TEST RESULTS ===\n${resultLines}\n# TOTAL CORRECT: ${correctCount}/${totalCount} (${correctPercent}%)`;
}

/**
 * Executor for `RNIDE.testChatToolUsage` VSCode command.
 * Temporarily takes control over the AI chat tab, testing its responses to various prompts.
 * Running this command may interfere with other VSCode functionalities as well.
 *
 * For every test case the prompt is sent to a fresh agent chat, the chat is exported to a
 * temporary JSON file after the wait period, and the test passes when at least one of the
 * invoked tools is listed in the case's `allowedToolIds`. The summary is written to the log.
 *
 * The running flag, the status bar item and the temporary export directory are released
 * even when a step throws, so a failed run cannot leave the command permanently disabled.
 *
 * @throws Error when the IDE instance does not exist, or when any step of the run throws.
 */
export async function testChatToolUsage(): Promise<void> {
  const ideInstance = getIdeInstance();
  const runStatus: ChatTestResult[] = [];

  const fail = (testCase: ChatTestCase, cause: string) => {
    runStatus.push({
      cause,
      success: false,
      prompt: testCase.prompt,
    });
  };

  const success = (testCase: ChatTestCase) => {
    runStatus.push({
      cause: null,
      success: true,
      prompt: testCase.prompt,
    });
  };

  // - `showInformationMessage` cannot be programmatically dismissed
  // - `showQuickPick` is a list-selection - does not look right
  // - `createStatusBarItem` looks good, and can be dismissed both programmatically and by the user
  const statusBar = window.createStatusBarItem(StatusBarAlignment.Left, 0);
  statusBar.command = "RNIDE.terminateChatToolTest";
  statusBar.text = "$(debug-stop) MCP tests running — Terminate";
  statusBar.tooltip = "Click to terminate running E2E tests";
  statusBar.color = new ThemeColor("statusBar.foreground");
  statusBar.backgroundColor = new ThemeColor("statusBarItem.errorBackground");

  let dir: string | undefined;

  try {
    await setTestStatus(true, ideInstance);
    statusBar.show();

    dir = await mkdtemp(path.join(tmpdir(), "radon-chat-exports-"));

    for (const testCase of testCases) {
      await clearEdits();

      await commands.executeCommand("workbench.action.chat.newChat");
      await commands.executeCommand("workbench.action.chat.openagent", testCase.prompt);

      // `true` means the wait timed out without the user terminating the run.
      const shouldContinue = await awaitTestTerminationOrTimeout(ideInstance, 10_000);

      if (!shouldContinue) {
        fail(testCase, "User input: Test was terminated early.");
        break;
      }

      const filepath = path.join(dir, randomBytes(8).toString("hex") + ".json");

      // `Uri.file` is required for file system paths: `Uri.parse` would treat a
      // Windows drive letter as the URI scheme.
      await commands.executeCommand("workbench.action.chat.export", Uri.file(filepath));

      let chatData: ChatData;
      try {
        const exportedText = readFileSync(filepath).toString();
        chatData = JSON.parse(exportedText) as ChatData;
      } catch {
        fail(testCase, "Internal error: `workbench.action.chat.export` did not work.");
        continue;
      }

      if (chatData.requests.length === 0) {
        fail(testCase, "Internal error: `workbench.action.chat.openagent` did not work.");
        continue;
      }

      if (chatData.requests.length > 1) {
        fail(testCase, "Internal error: `workbench.action.chat.newChat` did not work.");
        continue;
      }

      const responses = chatData.requests[0].response;

      // Passing the guard directly narrows the element type on every TypeScript version.
      const toolCalls = responses.filter(isToolCallResponse);

      if (toolCalls.length === 0) {
        fail(testCase, "No tools were called.");
        continue;
      }

      const calledToolIds = toolCalls.map((toolCall) => toolCall.toolId);

      if (calledToolIds.some((toolId) => testCase.allowedToolIds.includes(toolId))) {
        success(testCase);
        continue;
      }

      const expected = `Expected: ${testCase.allowedToolIds.join(" | ")}`;
      const received = `Received: ${calledToolIds.join(", ")}`;
      fail(testCase, `${expected}. ${received}`);
    }
  } finally {
    statusBar.hide();
    statusBar.dispose();

    if (dir !== undefined) {
      rm(dir, { recursive: true }).catch((_e) => {
        // silence the errors, it's fine
      });
    }

    // Reset the flag before touching the chat again, so the start command is
    // re-enabled even if the final cleanup fails.
    await setTestStatus(false, ideInstance);
    await clearEdits();
  }

  Logger.log(formatChatTestReport(runStatus));
}
94 changes: 94 additions & 0 deletions packages/vscode-extension/src/ai/tests/chatTestCases.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import { ChatTestCase } from "./models";

/** Builds a single test case from a prompt and the tool IDs that count as a correct call. */
const chatTestCase = (
  prompt: string,
  ...allowedToolIds: ChatTestCase["allowedToolIds"]
): ChatTestCase => ({ prompt, allowedToolIds });

/**
 * Prompts sent to the AI chat, each paired with the tools of which at least one
 * is expected to be invoked in response.
 */
export const testCases: ChatTestCase[] = [
  // Documentation queries
  chatTestCase("How to use Shared Element Transitions in Reanimated 4?", "query_documentation"),
  chatTestCase("How to use SETs in Reanimated?", "query_documentation"),
  chatTestCase(
    "Implement an example interaction with a local LLM in my app.",
    "query_documentation"
  ),
  chatTestCase("Add LLM chat to my app.", "query_documentation"),

  // Visual problems with a specific component
  chatTestCase(
    "My button in the center of the screen is malformed.",
    "view_component_tree",
    "view_screenshot"
  ),
  chatTestCase("The orange button is ugly. Fix it.", "view_component_tree", "view_screenshot"),

  // Reloading the application
  chatTestCase("Restart the app.", "reload_application"),
  chatTestCase("The app is frozen. Can you reset it?", "reload_application"),

  // Application logs
  chatTestCase("Why did the app just crash?", "view_application_logs"),
  chatTestCase("Are there any errors in the logs?", "view_application_logs"),
  chatTestCase(
    "Debug the error thrown when I clicked the login button.",
    "view_application_logs",
    "view_component_tree"
  ),

  // Screenshots
  chatTestCase("Does the layout look broken to you?", "view_screenshot"),
  chatTestCase("I think the text is being cut off on the right side.", "view_screenshot"),
  chatTestCase("Verify if the dark mode colors are applied correctly.", "view_screenshot"),
  chatTestCase("Take a look at the current screen.", "view_screenshot"),

  // Component tree
  chatTestCase("What is the hierarchy of the current screen?", "view_component_tree"),
  chatTestCase("Show me the props passed to the Header component.", "view_component_tree"),
  chatTestCase(
    "Is the 'Submit' button currently inside a SafeAreaView?",
    "view_component_tree"
  ),
  chatTestCase("Find the component ID for the bottom navigation bar.", "view_component_tree"),

  // Prompts where several tools are acceptable
  chatTestCase(
    "Why is the banner not showing up?",
    "view_component_tree",
    "view_application_logs",
    "view_screenshot"
  ),
  chatTestCase(
    "Inspect the padding on the user profile card.",
    "view_component_tree",
    "view_screenshot"
  ),
];
37 changes: 37 additions & 0 deletions packages/vscode-extension/src/ai/tests/models.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/**
 * Shape the chat tester assumes for the JSON file written by `workbench.action.chat.export`.
 * Declares only `requests`; anything else in the exported file is left untyped.
 */
export interface ChatData {
// Requests recorded in the exported chat; the tester expects exactly one per test case.
requests: Request[];
}

/** A single request of an exported chat. */
export interface Request {
// Response entries recorded for this request; the tester filters them for tool invocations.
response: Response[];
}

/** One response entry: either a serialized tool invocation or an entry of any other kind. */
export type Response = ToolCallResponse | UnknownResponse;

/** Any response entry that is not a serialized tool invocation. */
export interface UnknownResponse {
// `Exclude<string, "literal">` resolves to `string` (does not work)
kind: unknown;
}

/** Tool IDs that a test case may list as acceptable for its prompt. */
export type AllowedToolId =
| "query_documentation"
| "view_screenshot"
| "view_component_tree"
| "view_application_logs"
| "reload_application";

/** Response entry describing a tool invocation, identified by its `kind` literal. */
export interface ToolCallResponse {
kind: "toolInvocationSerialized";
// NOTE(review): typed as `AllowedToolId`, but the value comes from the exported chat file,
// so other tool IDs can presumably appear here at runtime - confirm.
toolId: AllowedToolId;
}

/** A prompt to send to the chat, together with the tools accepted as a correct reaction. */
export interface ChatTestCase {
// Text sent to the chat as the user's message.
prompt: string;
// The test passes when at least one invoked tool is in this list.
allowedToolIds: AllowedToolId[];
}

/** Outcome of running a single test case. */
export interface ChatTestResult {
// Prompt of the test case this result belongs to.
prompt: string;
success: boolean;
// Description of the failure; `null` when the test case succeeded.
cause: string | null;
}
2 changes: 2 additions & 0 deletions packages/vscode-extension/src/common/State.ts
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ export type GeneralSettings = {

export type RadonAISettings = {
enableRadonAI: boolean;
// Set to `true` while the MCP tool tests started by `RNIDE.testChatToolUsage` are running,
// and back to `false` when they finish or are terminated.
areMCPTestsRunning: boolean;
};

export type UserInterfaceSettings = {
Expand Down Expand Up @@ -624,6 +625,7 @@ export const initialState: State = {
},
radonAI: {
enableRadonAI: true,
areMCPTestsRunning: false,
},
userInterface: {
panelLocation: "tab",
Expand Down
Loading
Loading