SmartManoj
diff --git a/‎evals/.gitignore‎
Lines changed: 4 additions & 1 deletion b/‎evals/.gitignore‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎evals/cli/package-lock.json‎
Lines changed: 8 additions & 7 deletions b/‎evals/cli/package-lock.json‎
Lines changed: 8 additions & 7 deletions
diff --git a/‎evals/cli/package.json‎
Lines changed: 1 addition & 1 deletion b/‎evals/cli/package.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎evals/cli/src/commands/runDiffEval.ts‎
Lines changed: 74 additions & 0 deletions b/‎evals/cli/src/commands/runDiffEval.ts‎
Lines changed: 74 additions & 0 deletions
diff --git a/‎evals/cli/src/index.ts‎
Lines changed: 30 additions & 0 deletions b/‎evals/cli/src/index.ts‎
Lines changed: 30 additions & 0 deletions
@@ -1,3 +1,6 @@
 repositories
 
-results/evals.db
+results/evals.db
+
+diff_editing/test_cases/
+diff_editing/test_outputs/
@@ -17,7 +17,7 @@
 	"author": "",
 	"license": "MIT",
 	"dependencies": {
-		"better-sqlite3": "^8.0.0",
+		"better-sqlite3": "^11.10.0",
 		"chalk": "^4.1.2",
 		"commander": "^9.4.1",
 		"execa": "^5.1.1",
 
@@ -0,0 +1,74 @@
+import execa from "execa"
+import chalk from "chalk"
+import path from "path"
+
+/**
+ * Settings for a diff editing evaluation run. `runDiffEvalHandler` translates
+ * each field into the matching command-line flag of the TestRunner script.
+ */
+interface RunDiffEvalOptions {
+	/** Forwarded as `--model-id`. */
+	modelId: string
+	/** Forwarded as `--system-prompt-name`. */
+	systemPromptName: string
+	/** Forwarded (stringified) as `--number-of-runs`. */
+	numberOfRuns: number
+	/** Forwarded as `--parsing-function`. */
+	parsingFunction: string
+	/** Forwarded as `--diff-edit-function`. */
+	diffEditFunction: string
+	/** Forwarded as `--thinking-budget` only when greater than zero. */
+	thinkingBudget: number
+	/** Adds the `--parallel` flag when true. */
+	parallel: boolean
+	/** Adds the `--verbose` flag when true. */
+	verbose: boolean
+	/** Forwarded as `--test-path` when set; the flag is omitted otherwise. */
+	testPath?: string
+	/** Forwarded as `--output-path` when set; the flag is omitted otherwise. */
+	outputPath?: string
+}
+
+/**
+ * Runs the diff editing evaluation by launching `diff_editing/TestRunner.ts`
+ * through `npx tsx` as a child process and waiting for it to finish.
+ *
+ * The child inherits stdio, so its output streams straight to the terminal.
+ * On any failure (missing model id, launch error, or a failing child process)
+ * an error is printed and the current process exits with code 1.
+ *
+ * @param options - Evaluation settings; each one is translated into a TestRunner flag.
+ */
+export async function runDiffEvalHandler(options: RunDiffEvalOptions) {
+	console.log(chalk.blue("Starting diff editing evaluation..."))
+
+	// Callers may pass untyped CLI input, so the model id can be missing at runtime
+	// despite the declared type. Fail early instead of forwarding an invalid argument.
+	if (!options.modelId) {
+		console.error(chalk.red("Missing required option: --model-id"))
+		process.exit(1)
+	}
+
+	// Resolve the path to the TestRunner.ts script relative to the current file
+	const scriptPath = path.resolve(__dirname, "../../../diff_editing/TestRunner.ts")
+
+	// Arguments that are always forwarded to the script
+	const args = [
+		"--model-id",
+		options.modelId,
+		"--system-prompt-name",
+		options.systemPromptName,
+		"--number-of-runs",
+		String(options.numberOfRuns),
+		"--parsing-function",
+		options.parsingFunction,
+		"--diff-edit-function",
+		options.diffEditFunction,
+	]
+
+	// Optional arguments are only forwarded when they carry a value
+	if (options.testPath) {
+		args.push("--test-path", options.testPath)
+	}
+	if (options.outputPath) {
+		args.push("--output-path", options.outputPath)
+	}
+	if (options.thinkingBudget > 0) {
+		args.push("--thinking-budget", String(options.thinkingBudget))
+	}
+
+	// Boolean switches are passed as bare flags
+	if (options.parallel) {
+		args.push("--parallel")
+	}
+
+	if (options.verbose) {
+		args.push("--verbose")
+	}
+
+	try {
+		console.log(chalk.gray(`Executing: npx tsx ${scriptPath} ${args.join(" ")}`))
+
+		// Execute the script as a child process
+		// We use 'inherit' to stream the stdout/stderr directly to the user's terminal
+		const subprocess = execa("npx", ["tsx", scriptPath, ...args], {
+			stdio: "inherit",
+		})
+
+		await subprocess
+
+		console.log(chalk.green("Diff editing evaluation completed successfully."))
+	} catch (error) {
+		console.error(chalk.red("An error occurred during the diff editing evaluation."))
+		// With inherited stdio the script's own output is already on the terminal, but a
+		// failure to launch it (e.g. `npx` not found) produces no child output at all,
+		// so the error message is printed as well.
+		console.error(chalk.red(error instanceof Error ? error.message : String(error)))
+		process.exit(1)
+	}
+}
@@ -5,6 +5,7 @@ import { setupHandler } from "./commands/setup"
 import { runHandler } from "./commands/run"
 import { reportHandler } from "./commands/report"
 import { evalsEnvHandler } from "./commands/evals-env"
+import { runDiffEvalHandler } from "./commands/runDiffEval"
 
 // Create the CLI program
 const program = new Command()
@@ -77,6 +78,35 @@ program
 		}
 	})
 
+// Run-diff-eval command: validates the CLI input and delegates to runDiffEvalHandler
+program
+	.command("run-diff-eval")
+	.description("Run the diff editing evaluation suite")
+	.option("--test-path <path>", "Path to the directory containing test case JSON files")
+	.option("--output-path <path>", "Path to the directory to save the test output JSON files")
+	// No sensible default exists for the model, so commander enforces its presence
+	.requiredOption("--model-id <model_id>", "The model ID to use for the test")
+	.option("--system-prompt-name <name>", "The name of the system prompt to use", "basicSystemPrompt")
+	.option("-n, --number-of-runs <number>", "Number of times to run each test case", "1")
+	.option("--parsing-function <name>", "The parsing function to use", "parseAssistantMessageV2")
+	.option("--diff-edit-function <name>", "The diff editing function to use", "constructNewFileContentV2")
+	.option("--thinking-budget <tokens>", "Set the thinking tokens budget", "0")
+	.option("--parallel", "Run tests in parallel", false)
+	.option("-v, --verbose", "Enable verbose logging", false)
+	.action(async (options) => {
+		try {
+			// Option values arrive as strings. Convert the numeric ones and reject input
+			// that does not parse, rather than forwarding NaN to the handler.
+			const parseIntegerOption = (flag: string, raw: string, min: number): number => {
+				const value = parseInt(raw, 10)
+				if (Number.isNaN(value) || value < min) {
+					throw new Error(`${flag} must be an integer >= ${min}, received "${raw}"`)
+				}
+				return value
+			}
+			const fullOptions = {
+				...options,
+				numberOfRuns: parseIntegerOption("--number-of-runs", options.numberOfRuns, 1),
+				thinkingBudget: parseIntegerOption("--thinking-budget", options.thinkingBudget, 0),
+			}
+			await runDiffEvalHandler(fullOptions)
+		} catch (error) {
+			console.error(chalk.red(`Error during diff eval run: ${error instanceof Error ? error.message : String(error)}`))
+			process.exit(1)
+		}
+	})
+
 // Parse command line arguments
 program.parse(process.argv)