replay evals (RooCodeInc#4140)

0xToshii · web-flow · commit 336eb46547fe · 2025-06-10T17:11:06.000-07:00
diff --git a/evals/cli/src/commands/runDiffEval.ts b/evals/cli/src/commands/runDiffEval.ts
@@ -13,6 +13,7 @@ interface RunDiffEvalOptions {
 	verbose: boolean
 	testPath: string
 	outputPath: string
+	replay: boolean
 }
 
 export async function runDiffEvalHandler(options: RunDiffEvalOptions) {
@@ -50,6 +51,10 @@ export async function runDiffEvalHandler(options: RunDiffEvalOptions) {
 		args.push("--parallel")
 	}
 
+	if (options.replay) {
+		args.push("--replay")
+	}
+
 	if (options.verbose) {
 		args.push("--verbose")
 	}
diff --git a/evals/cli/src/index.ts b/evals/cli/src/index.ts
@@ -91,6 +91,7 @@ program
 	.option("--diff-edit-function <name>", "The diff editing function to use", "constructNewFileContentV2")
 	.option("--thinking-budget <tokens>", "Set the thinking tokens budget", "0")
 	.option("--parallel", "Run tests in parallel", false)
+	.option("--replay", "Run evaluation from a pre-recorded LLM output, skipping the API call", false)
 	.option("-v, --verbose", "Enable verbose logging", false)
 	.action(async (options) => {
 		try {
diff --git a/evals/diff_editing/ClineWrapper.ts b/evals/diff_editing/ClineWrapper.ts
@@ -8,7 +8,7 @@ import {
 	parseAssistantMessageV3,
 	AssistantMessageContent,
 } from "./parsing/parse-assistant-message-06-06-25" // "../../src/core/assistant-message"
-import { constructNewFileContentV2 } from "./diff-apply/diff-06-06-25"
+import { constructNewFileContent as constructNewFileContentV1, constructNewFileContentV2 } from "./diff-apply/diff-06-06-25"
 import { constructNewFileContent as constructNewFileContentV3 } from "../../src/core/assistant-message/diff" // this defaults to the new v1 when called
 
 type ParseAssistantMessageFn = (message: string) => AssistantMessageContent[]
@@ -21,6 +21,7 @@ const parsingFunctions: Record<string, ParseAssistantMessageFn> = {
 }
 
 const diffEditingFunctions: Record<string, ConstructNewFileContentFn> = {
+	constructNewFileContentV1: constructNewFileContentV1,
 	constructNewFileContentV2: constructNewFileContentV2,
 	constructNewFileContentV3: constructNewFileContentV3, // position invariant diff
 }
@@ -114,10 +115,10 @@ export async function runSingleEvaluation(input: TestInput): Promise<TestResult>
 			parsingFunction,
 			diffEditFunction,
 			thinkingBudgetTokens,
+			originalDiffEditToolCallMessage,
 		} = input
 
 		const requiredParams = {
-			apiKey,
 			systemPrompt,
 			messages,
 			modelId,
@@ -163,17 +164,26 @@ export async function runSingleEvaluation(input: TestInput): Promise<TestResult>
 			},
 		}
 
-		const openRouterHandler = new OpenRouterHandler(options)
-
 		// Get the output of streaming output of this llm call
 		let streamResult: StreamResult
-		try {
-			streamResult = await processStream(openRouterHandler, systemPrompt, messages)
-		} catch (error: any) {
-			return {
-				success: false,
-				error: "llm_stream_error",
-				errorString: error.message || error.toString(),
+		if (originalDiffEditToolCallMessage !== undefined) {
+			// Replay mode: mock the stream result
+			streamResult = {
+				assistantMessage: originalDiffEditToolCallMessage,
+				reasoningMessage: "",
+				usage: { inputTokens: 0, outputTokens: 0, cacheWriteTokens: 0, cacheReadTokens: 0, totalCost: 0 },
+			}
+		} else {
+			// Live mode: existing API call logic
+			try {
+				const openRouterHandler = new OpenRouterHandler(options)
+				streamResult = await processStream(openRouterHandler, systemPrompt, messages)
+			} catch (error: any) {
+				return {
+					success: false,
+					error: "llm_stream_error",
+					errorString: error.message || error.toString(),
+				}
 			}
 		}
 
diff --git a/evals/diff_editing/TestRunner.ts b/evals/diff_editing/TestRunner.ts
@@ -22,12 +22,14 @@ const systemPromptGeneratorLookup: Record<string, ConstructSystemPromptFn> = {
 type TestResultSet = { [test_id: string]: (TestResult & { test_id?: string })[] }
 
 class NodeTestRunner {
-	private apiKey: string
+	private apiKey: string | undefined
 
-	constructor() {
-		this.apiKey = process.env.OPENROUTER_API_KEY!
-		if (!this.apiKey) {
-			throw new Error("OPENROUTER_API_KEY environment variable not set")
+	constructor(isReplay: boolean) {
+		if (!isReplay) {
+			this.apiKey = process.env.OPENROUTER_API_KEY
+			if (!this.apiKey) {
+				throw new Error("OPENROUTER_API_KEY environment variable not set for a non-replay run.")
+			}
 		}
 	}
 
@@ -125,6 +127,14 @@ class NodeTestRunner {
 	 * Run a single test example
 	 */
 	async runSingleTest(testCase: ProcessedTestCase, testConfig: TestConfig): Promise<TestResult> {
+		if (testConfig.replay && !testCase.original_diff_edit_tool_call_message) {
+			return {
+				success: false,
+				error: "missing_original_diff_edit_tool_call_message",
+				errorString: `Test case ${testCase.test_id} is missing 'original_diff_edit_tool_call_message' for replay.`,
+			}
+		}
+
 		const customSystemPrompt = this.constructSystemPrompt(testCase.system_prompt_details, testConfig.system_prompt_name)
 
 		// messages don't include system prompt and are everything up to the first replace_in_file tool call which results in a diff edit error
@@ -138,6 +148,7 @@ class NodeTestRunner {
 			parsingFunction: testConfig.parsing_function,
 			diffEditFunction: testConfig.diff_edit_function,
 			thinkingBudgetTokens: testConfig.thinking_tokens_budget,
+			originalDiffEditToolCallMessage: testConfig.replay ? testCase.original_diff_edit_tool_call_message : undefined,
 		}
 
 		return await runSingleEvaluation(input)
@@ -320,6 +331,7 @@ async function main() {
 		.option("--diff-edit-function <name>", "The diff editing function to use", "constructNewFileContentV2")
 		.option("--thinking-budget <tokens>", "Set the thinking tokens budget", "0")
 		.option("--parallel", "Run tests in parallel", false)
+		.option("--replay", "Run evaluation from a pre-recorded LLM output, skipping the API call", false)
 		.option("-v, --verbose", "Enable verbose logging", false)
 
 	program.parse(process.argv)
@@ -336,12 +348,13 @@ async function main() {
 		parsing_function: options.parsingFunction,
 		diff_edit_function: options.diffEditFunction,
 		thinking_tokens_budget: parseInt(options.thinkingBudget, 10),
+		replay: options.replay,
 	}
 
 	try {
 		const startTime = Date.now()
 
-		const runner = new NodeTestRunner()
+		const runner = new NodeTestRunner(testConfig.replay)
 		const testCases = runner.loadTestCases(testPath)
 
 		const processedTestCases: ProcessedTestCase[] = testCases.map((tc) => ({
@@ -351,6 +364,9 @@ async function main() {
 
 		log(isVerbose, `-Loaded ${testCases.length} test cases.`)
 		log(isVerbose, `-Executing ${testConfig.number_of_runs} run(s) per test case.`)
+		if (testConfig.replay) {
+			log(isVerbose, `-Running in REPLAY mode. No API calls will be made.`)
+		}
 		log(isVerbose, "Starting tests...\n")
 
 		const results = options.parallel
diff --git a/evals/diff_editing/diff-apply/diff-06-06-25.ts b/evals/diff_editing/diff-apply/diff-06-06-25.ts
@@ -211,7 +211,7 @@ export async function constructNewFileContent(
 	diffContent: string,
 	originalContent: string,
 	isFinal: boolean,
-	version: "v1" | "v2" = "v2",
+	version: "v1" | "v2" = "v1",
 ): Promise<string> {
 	const constructor = constructNewFileContentVersionMapping[version]
 	if (!constructor) {
diff --git a/evals/diff_editing/types.ts b/evals/diff_editing/types.ts
@@ -13,6 +13,7 @@ export interface ProcessedTestCase {
 	file_contents: string
 	file_path: string
 	system_prompt_details: SystemPromptDetails
+	original_diff_edit_tool_call_message: string
 }
 
 export interface TestCase {
@@ -21,6 +22,7 @@ export interface TestCase {
 	file_contents: string
 	file_path: string
 	system_prompt_details: SystemPromptDetails
+	original_diff_edit_tool_call_message: string
 }
 
 export interface TestConfig {
@@ -30,6 +32,7 @@ export interface TestConfig {
 	parsing_function: string
 	diff_edit_function: string
 	thinking_tokens_budget: number
+	replay: boolean
 }
 
 export interface SystemPromptDetails {
@@ -72,7 +75,7 @@ export interface ExtractedToolCall {
 }
 
 export interface TestInput {
-	apiKey: string
+	apiKey?: string
 	systemPrompt: string
 	messages: Anthropic.Messages.MessageParam[]
 	modelId: string
@@ -81,4 +84,5 @@ export interface TestInput {
 	parsingFunction: string
 	diffEditFunction: string
 	thinkingBudgetTokens: number
+	originalDiffEditToolCallMessage?: string
 }

Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,7 @@ interface RunDiffEvalOptions {`
`13`	`13`	`verbose: boolean`
`14`	`14`	`testPath: string`
`15`	`15`	`outputPath: string`
	`16`	`+ replay: boolean`
`16`	`17`	`}`
`17`	`18`
`18`	`19`	`export async function runDiffEvalHandler(options: RunDiffEvalOptions) {`
`@@ -50,6 +51,10 @@ export async function runDiffEvalHandler(options: RunDiffEvalOptions) {`
`50`	`51`	`args.push("--parallel")`
`51`	`52`	`}`
`52`	`53`
	`54`	`+ if (options.replay) {`
	`55`	`+ args.push("--replay")`
	`56`	`+ }`
	`57`	`+`
`53`	`58`	`if (options.verbose) {`
`54`	`59`	`args.push("--verbose")`
`55`	`60`	`}`