Skip to content

Commit 336eb46

Browse files
authored
replay evals (RooCodeInc#4140)
1 parent cdfffb8 commit 336eb46

File tree

6 files changed

+55
-19
lines changed

6 files changed

+55
-19
lines changed

evals/cli/src/commands/runDiffEval.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ interface RunDiffEvalOptions {
1313
verbose: boolean
1414
testPath: string
1515
outputPath: string
16+
replay: boolean
1617
}
1718

1819
export async function runDiffEvalHandler(options: RunDiffEvalOptions) {
@@ -50,6 +51,10 @@ export async function runDiffEvalHandler(options: RunDiffEvalOptions) {
5051
args.push("--parallel")
5152
}
5253

54+
if (options.replay) {
55+
args.push("--replay")
56+
}
57+
5358
if (options.verbose) {
5459
args.push("--verbose")
5560
}

evals/cli/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ program
9191
.option("--diff-edit-function <name>", "The diff editing function to use", "constructNewFileContentV2")
9292
.option("--thinking-budget <tokens>", "Set the thinking tokens budget", "0")
9393
.option("--parallel", "Run tests in parallel", false)
94+
.option("--replay", "Run evaluation from a pre-recorded LLM output, skipping the API call", false)
9495
.option("-v, --verbose", "Enable verbose logging", false)
9596
.action(async (options) => {
9697
try {

evals/diff_editing/ClineWrapper.ts

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import {
88
parseAssistantMessageV3,
99
AssistantMessageContent,
1010
} from "./parsing/parse-assistant-message-06-06-25" // "../../src/core/assistant-message"
11-
import { constructNewFileContentV2 } from "./diff-apply/diff-06-06-25"
11+
import { constructNewFileContent as constructNewFileContentV1, constructNewFileContentV2 } from "./diff-apply/diff-06-06-25"
1212
import { constructNewFileContent as constructNewFileContentV3 } from "../../src/core/assistant-message/diff" // this defaults to the new v1 when called
1313

1414
type ParseAssistantMessageFn = (message: string) => AssistantMessageContent[]
@@ -21,6 +21,7 @@ const parsingFunctions: Record<string, ParseAssistantMessageFn> = {
2121
}
2222

2323
const diffEditingFunctions: Record<string, ConstructNewFileContentFn> = {
24+
constructNewFileContentV1: constructNewFileContentV1,
2425
constructNewFileContentV2: constructNewFileContentV2,
2526
constructNewFileContentV3: constructNewFileContentV3, // position invariant diff
2627
}
@@ -114,10 +115,10 @@ export async function runSingleEvaluation(input: TestInput): Promise<TestResult>
114115
parsingFunction,
115116
diffEditFunction,
116117
thinkingBudgetTokens,
118+
originalDiffEditToolCallMessage,
117119
} = input
118120

119121
const requiredParams = {
120-
apiKey,
121122
systemPrompt,
122123
messages,
123124
modelId,
@@ -163,17 +164,26 @@ export async function runSingleEvaluation(input: TestInput): Promise<TestResult>
163164
},
164165
}
165166

166-
const openRouterHandler = new OpenRouterHandler(options)
167-
168167
// Get the streamed output of this LLM call
169168
let streamResult: StreamResult
170-
try {
171-
streamResult = await processStream(openRouterHandler, systemPrompt, messages)
172-
} catch (error: any) {
173-
return {
174-
success: false,
175-
error: "llm_stream_error",
176-
errorString: error.message || error.toString(),
169+
if (originalDiffEditToolCallMessage !== undefined) {
170+
// Replay mode: mock the stream result
171+
streamResult = {
172+
assistantMessage: originalDiffEditToolCallMessage,
173+
reasoningMessage: "",
174+
usage: { inputTokens: 0, outputTokens: 0, cacheWriteTokens: 0, cacheReadTokens: 0, totalCost: 0 },
175+
}
176+
} else {
177+
// Live mode: existing API call logic
178+
try {
179+
const openRouterHandler = new OpenRouterHandler(options)
180+
streamResult = await processStream(openRouterHandler, systemPrompt, messages)
181+
} catch (error: any) {
182+
return {
183+
success: false,
184+
error: "llm_stream_error",
185+
errorString: error.message || error.toString(),
186+
}
177187
}
178188
}
179189

evals/diff_editing/TestRunner.ts

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,14 @@ const systemPromptGeneratorLookup: Record<string, ConstructSystemPromptFn> = {
2222
type TestResultSet = { [test_id: string]: (TestResult & { test_id?: string })[] }
2323

2424
class NodeTestRunner {
25-
private apiKey: string
25+
private apiKey: string | undefined
2626

27-
constructor() {
28-
this.apiKey = process.env.OPENROUTER_API_KEY!
29-
if (!this.apiKey) {
30-
throw new Error("OPENROUTER_API_KEY environment variable not set")
27+
constructor(isReplay: boolean) {
28+
if (!isReplay) {
29+
this.apiKey = process.env.OPENROUTER_API_KEY
30+
if (!this.apiKey) {
31+
throw new Error("OPENROUTER_API_KEY environment variable not set for a non-replay run.")
32+
}
3133
}
3234
}
3335

@@ -125,6 +127,14 @@ class NodeTestRunner {
125127
* Run a single test example
126128
*/
127129
async runSingleTest(testCase: ProcessedTestCase, testConfig: TestConfig): Promise<TestResult> {
130+
if (testConfig.replay && !testCase.original_diff_edit_tool_call_message) {
131+
return {
132+
success: false,
133+
error: "missing_original_diff_edit_tool_call_message",
134+
errorString: `Test case ${testCase.test_id} is missing 'original_diff_edit_tool_call_message' for replay.`,
135+
}
136+
}
137+
128138
const customSystemPrompt = this.constructSystemPrompt(testCase.system_prompt_details, testConfig.system_prompt_name)
129139

130140
// messages don't include system prompt and are everything up to the first replace_in_file tool call which results in a diff edit error
@@ -138,6 +148,7 @@ class NodeTestRunner {
138148
parsingFunction: testConfig.parsing_function,
139149
diffEditFunction: testConfig.diff_edit_function,
140150
thinkingBudgetTokens: testConfig.thinking_tokens_budget,
151+
originalDiffEditToolCallMessage: testConfig.replay ? testCase.original_diff_edit_tool_call_message : undefined,
141152
}
142153

143154
return await runSingleEvaluation(input)
@@ -320,6 +331,7 @@ async function main() {
320331
.option("--diff-edit-function <name>", "The diff editing function to use", "constructNewFileContentV2")
321332
.option("--thinking-budget <tokens>", "Set the thinking tokens budget", "0")
322333
.option("--parallel", "Run tests in parallel", false)
334+
.option("--replay", "Run evaluation from a pre-recorded LLM output, skipping the API call", false)
323335
.option("-v, --verbose", "Enable verbose logging", false)
324336

325337
program.parse(process.argv)
@@ -336,12 +348,13 @@ async function main() {
336348
parsing_function: options.parsingFunction,
337349
diff_edit_function: options.diffEditFunction,
338350
thinking_tokens_budget: parseInt(options.thinkingBudget, 10),
351+
replay: options.replay,
339352
}
340353

341354
try {
342355
const startTime = Date.now()
343356

344-
const runner = new NodeTestRunner()
357+
const runner = new NodeTestRunner(testConfig.replay)
345358
const testCases = runner.loadTestCases(testPath)
346359

347360
const processedTestCases: ProcessedTestCase[] = testCases.map((tc) => ({
@@ -351,6 +364,9 @@ async function main() {
351364

352365
log(isVerbose, `-Loaded ${testCases.length} test cases.`)
353366
log(isVerbose, `-Executing ${testConfig.number_of_runs} run(s) per test case.`)
367+
if (testConfig.replay) {
368+
log(isVerbose, `-Running in REPLAY mode. No API calls will be made.`)
369+
}
354370
log(isVerbose, "Starting tests...\n")
355371

356372
const results = options.parallel

evals/diff_editing/diff-apply/diff-06-06-25.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ export async function constructNewFileContent(
211211
diffContent: string,
212212
originalContent: string,
213213
isFinal: boolean,
214-
version: "v1" | "v2" = "v2",
214+
version: "v1" | "v2" = "v1",
215215
): Promise<string> {
216216
const constructor = constructNewFileContentVersionMapping[version]
217217
if (!constructor) {

evals/diff_editing/types.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ export interface ProcessedTestCase {
1313
file_contents: string
1414
file_path: string
1515
system_prompt_details: SystemPromptDetails
16+
original_diff_edit_tool_call_message: string
1617
}
1718

1819
export interface TestCase {
@@ -21,6 +22,7 @@ export interface TestCase {
2122
file_contents: string
2223
file_path: string
2324
system_prompt_details: SystemPromptDetails
25+
original_diff_edit_tool_call_message: string
2426
}
2527

2628
export interface TestConfig {
@@ -30,6 +32,7 @@ export interface TestConfig {
3032
parsing_function: string
3133
diff_edit_function: string
3234
thinking_tokens_budget: number
35+
replay: boolean
3336
}
3437

3538
export interface SystemPromptDetails {
@@ -72,7 +75,7 @@ export interface ExtractedToolCall {
7275
}
7376

7477
export interface TestInput {
75-
apiKey: string
78+
apiKey?: string
7679
systemPrompt: string
7780
messages: Anthropic.Messages.MessageParam[]
7881
modelId: string
@@ -81,4 +84,5 @@ export interface TestInput {
8184
parsingFunction: string
8285
diffEditFunction: string
8386
thinkingBudgetTokens: number
87+
originalDiffEditToolCallMessage?: string
8488
}

0 commit comments

Comments
 (0)