Skip to content

Commit 335086e

Browse files
authored
diff edit evals (RooCodeInc#4112)
* base * multi parallel * function registry * nit * basic prompt * support data 1 * types * types * input format * path name * use helpers * logs * claude4 prompt * handling * evals history * verbose * prints * var * cli base * cli inputs * sqlite * v3 diff apply * thinking tokens * more metrics * nit * new structure * print
1 parent 983d14b commit 335086e

File tree

13 files changed

+4017
-9
lines changed

13 files changed

+4017
-9
lines changed

evals/.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
11
repositories
22

3-
results/evals.db
3+
results/evals.db
4+
5+
diff_editing/test_cases/
6+
diff_editing/test_outputs/

evals/cli/package-lock.json

Lines changed: 8 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

evals/cli/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
"author": "",
1818
"license": "MIT",
1919
"dependencies": {
20-
"better-sqlite3": "^8.0.0",
20+
"better-sqlite3": "^11.10.0",
2121
"chalk": "^4.1.2",
2222
"commander": "^9.4.1",
2323
"execa": "^5.1.1",
evals/cli/src/commands/runDiffEval.ts

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import execa from "execa"
2+
import chalk from "chalk"
3+
import path from "path"
4+
5+
/**
 * Options accepted by `runDiffEvalHandler`.
 *
 * Each field is forwarded to the TestRunner script as a command-line flag.
 */
interface RunDiffEvalOptions {
	/** Model ID, forwarded as `--model-id`. */
	modelId: string
	/** System prompt name, forwarded as `--system-prompt-name`. */
	systemPromptName: string
	/** Number of runs, forwarded as `--number-of-runs`. */
	numberOfRuns: number
	/** Parsing function name, forwarded as `--parsing-function`. */
	parsingFunction: string
	/** Diff edit function name, forwarded as `--diff-edit-function`. */
	diffEditFunction: string
	/** Thinking token budget; forwarded as `--thinking-budget` only when greater than 0. */
	thinkingBudget: number
	/** When true, `--parallel` is forwarded. */
	parallel: boolean
	/** When true, `--verbose` is forwarded. */
	verbose: boolean
	/** Optional test case directory; forwarded as `--test-path` only when set. */
	testPath?: string
	/** Optional output directory; forwarded as `--output-path` only when set. */
	outputPath?: string
}
17+
18+
/**
 * Builds the argument list passed to the TestRunner script.
 *
 * Required flags are always emitted; `--test-path`, `--output-path`,
 * `--thinking-budget`, `--parallel` and `--verbose` are appended only when set.
 *
 * @throws Error when `modelId` is empty or `numberOfRuns` is not a positive integer.
 */
function buildDiffEvalArgs(options: RunDiffEvalOptions): string[] {
	if (!options.modelId) {
		throw new Error("A model ID is required (--model-id).")
	}
	if (!Number.isInteger(options.numberOfRuns) || options.numberOfRuns < 1) {
		throw new Error(`Number of runs must be a positive integer, got: ${options.numberOfRuns}`)
	}

	const args = [
		"--model-id",
		options.modelId,
		"--system-prompt-name",
		options.systemPromptName,
		"--number-of-runs",
		String(options.numberOfRuns),
		"--parsing-function",
		options.parsingFunction,
		"--diff-edit-function",
		options.diffEditFunction,
	]

	// Conditionally add the optional arguments
	if (options.testPath) {
		args.push("--test-path", options.testPath)
	}
	if (options.outputPath) {
		args.push("--output-path", options.outputPath)
	}
	// A NaN budget compares false here and is therefore omitted as well
	if (options.thinkingBudget > 0) {
		args.push("--thinking-budget", String(options.thinkingBudget))
	}
	if (options.parallel) {
		args.push("--parallel")
	}
	if (options.verbose) {
		args.push("--verbose")
	}

	return args
}

/**
 * Runs the diff editing evaluation by executing the TestRunner script via `npx tsx`.
 *
 * The child's stdout/stderr are streamed straight to the current terminal.
 * On any failure (invalid options, the child failing to start, or the child
 * failing) the error is logged and the process exits with code 1.
 *
 * @param options Evaluation settings forwarded to the script as CLI flags.
 */
export async function runDiffEvalHandler(options: RunDiffEvalOptions) {
	console.log(chalk.blue("Starting diff editing evaluation..."))

	// Resolve the path to the TestRunner.ts script relative to the current file
	const scriptPath = path.resolve(__dirname, "../../../diff_editing/TestRunner.ts")

	try {
		// Validation happens inside the try so bad options follow the same
		// "log and exit 1" path as a failed run
		const args = buildDiffEvalArgs(options)

		console.log(chalk.gray(`Executing: npx tsx ${scriptPath} ${args.join(" ")}`))

		// Execute the script as a child process
		// We use 'inherit' to stream the stdout/stderr directly to the user's terminal
		await execa("npx", ["tsx", scriptPath, ...args], {
			stdio: "inherit",
		})

		console.log(chalk.green("Diff editing evaluation completed successfully."))
	} catch (error: unknown) {
		console.error(chalk.red("An error occurred during the diff editing evaluation."))
		// 'inherit' stdio only shows what the script itself printed; failures that
		// happen before or outside the script (invalid options, the command failing
		// to start) would otherwise leave no details, so always print the message
		console.error(chalk.red(error instanceof Error ? error.message : String(error)))
		process.exit(1)
	}
}

evals/cli/src/index.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import { setupHandler } from "./commands/setup"
55
import { runHandler } from "./commands/run"
66
import { reportHandler } from "./commands/report"
77
import { evalsEnvHandler } from "./commands/evals-env"
8+
import { runDiffEvalHandler } from "./commands/runDiffEval"
89

910
// Create the CLI program
1011
const program = new Command()
@@ -77,6 +78,35 @@ program
7778
}
7879
})
7980

81+
// Run-diff-eval command
// Registers `run-diff-eval`, converts the numeric options from their string
// form, validates them, and delegates to runDiffEvalHandler.
program
	.command("run-diff-eval")
	.description("Run the diff editing evaluation suite")
	.option("--test-path <path>", "Path to the directory containing test case JSON files")
	.option("--output-path <path>", "Path to the directory to save the test output JSON files")
	// No default exists for the model ID, so make commander reject a missing value
	// instead of passing `undefined` on to the handler
	.requiredOption("--model-id <model_id>", "The model ID to use for the test")
	.option("--system-prompt-name <name>", "The name of the system prompt to use", "basicSystemPrompt")
	.option("-n, --number-of-runs <number>", "Number of times to run each test case", "1")
	.option("--parsing-function <name>", "The parsing function to use", "parseAssistantMessageV2")
	.option("--diff-edit-function <name>", "The diff editing function to use", "constructNewFileContentV2")
	.option("--thinking-budget <tokens>", "Set the thinking tokens budget", "0")
	.option("--parallel", "Run tests in parallel", false)
	.option("-v, --verbose", "Enable verbose logging", false)
	.action(async (options) => {
		try {
			// Commander delivers option values as strings; convert and validate them here
			const numberOfRuns = parseInt(options.numberOfRuns, 10)
			const thinkingBudget = parseInt(options.thinkingBudget, 10)

			if (!Number.isInteger(numberOfRuns) || numberOfRuns < 1) {
				throw new Error(`--number-of-runs must be a positive integer, got "${options.numberOfRuns}"`)
			}
			if (!Number.isInteger(thinkingBudget) || thinkingBudget < 0) {
				throw new Error(`--thinking-budget must be a non-negative integer, got "${options.thinkingBudget}"`)
			}

			const fullOptions = {
				...options,
				numberOfRuns,
				thinkingBudget,
			}
			await runDiffEvalHandler(fullOptions)
		} catch (error) {
			console.error(chalk.red(`Error during diff eval run: ${error instanceof Error ? error.message : String(error)}`))
			process.exit(1)
		}
	})
109+
80110
// Parse command line arguments
81111
program.parse(process.argv)
82112

0 commit comments

Comments
 (0)