Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ You can customize the `web-codegen-scorer eval` script with the following flags:
- `--mcp`: Whether to start an MCP for the evaluation. Defaults to `false`.
- Example: `web-codegen-scorer eval --mcp --env=<config path>`

- `--max-build-repair-attempts`: Number of repair attempts when build errors are discovered. Defaults to `1` attempt.

- `--help`: Prints out usage information about the script.

### Additional configuration options
Expand Down
1 change: 1 addition & 0 deletions runner/configuration/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ export const LLM_OUTPUT_DIR = join(rootDir, 'llm-output');
* Number of times we'll try to ask LLM to repair a build failure,
* providing the build output and the code that causes the problem.
*/
// Note: When updating, also adjust the default description in `README.md`.
export const DEFAULT_MAX_REPAIR_ATTEMPTS = 1;

/** Name of the folder where we store all generated reports */
Expand Down
8 changes: 8 additions & 0 deletions runner/eval-cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import chalk from 'chalk';
import {
BUILT_IN_ENVIRONMENTS,
DEFAULT_AUTORATER_MODEL_NAME,
DEFAULT_MAX_REPAIR_ATTEMPTS,
DEFAULT_MODEL_NAME,
} from './configuration/constants.js';
import {generateCodeAndAssess} from './orchestration/generate.js';
Expand Down Expand Up @@ -39,6 +40,7 @@ interface Options {
a11yRepairAttempts?: number;
logging?: 'text-only' | 'dynamic';
skipLighthouse?: boolean;
maxBuildRepairAttempts?: number;
}

function builder(argv: Argv): Argv<Options> {
Expand Down Expand Up @@ -159,6 +161,11 @@ function builder(argv: Argv): Argv<Options> {
default: false,
description: 'Whether to skip collecting Lighthouse data',
})
.option('max-build-repair-attempts', {
type: 'number',
default: DEFAULT_MAX_REPAIR_ATTEMPTS,
description: 'Number of repair attempts when build errors are discovered',
})
.strict()
.version(false)
.help()
Expand Down Expand Up @@ -204,6 +211,7 @@ async function handler(cliArgs: Arguments<Options>): Promise<void> {
skipAiSummary: cliArgs.skipAiSummary,
a11yRepairAttempts: cliArgs.a11yRepairAttempts,
skipLighthouse: cliArgs.skipLighthouse,
maxBuildRepairAttempts: cliArgs.maxBuildRepairAttempts,
});

logReportToConsole(runInfo);
Expand Down
6 changes: 3 additions & 3 deletions runner/orchestration/build-serve-loop.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ import {
LlmContextFile,
RootPromptDefinition,
} from '../shared-interfaces.js';
import {DEFAULT_MAX_REPAIR_ATTEMPTS} from '../configuration/constants.js';
import {ProgressLogger} from '../progress/progress-logger.js';
import {runBuild} from './build-worker.js';
import {repairAndBuild} from './build-repair.js';
import {EvalID, Executor} from './executors/executor.js';
import {EvalID} from './executors/executor.js';
import {serveAndTestApp} from './serve-testing-worker.js';
import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';
import {DEFAULT_MAX_REPAIR_ATTEMPTS} from '../configuration/constants.js';

/**
* Attempts to build the code that an LLM generated. If the build fails, attempts
Expand Down Expand Up @@ -59,7 +59,7 @@ export async function attemptBuild(
);
let repairAttempts = 0;
const maxRepairAttempts = (await env.executor.shouldRepairFailedBuilds(evalID))
? DEFAULT_MAX_REPAIR_ATTEMPTS
? (config.maxBuildRepairAttempts ?? DEFAULT_MAX_REPAIR_ATTEMPTS)
: 0;

const initialAttempt = {
Expand Down
1 change: 1 addition & 0 deletions runner/shared-interfaces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ export interface AssessmentConfig {
autoraterModel?: string;
a11yRepairAttempts?: number;
skipLighthouse?: boolean;
maxBuildRepairAttempts?: number;
}

/**
Expand Down
Loading