Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 12 additions & 18 deletions runner/orchestration/build-serve-loop.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@ import PQueue from 'p-queue';
import {LlmGenerateFilesResponse} from '../codegen/llm-runner.js';
import {BuildResultStatus} from '../workers/builder/builder-types.js';
import {Environment} from '../configuration/environment.js';
import {AttemptDetails, LlmContextFile, RootPromptDefinition} from '../shared-interfaces.js';
import {
AssessmentConfig,
AttemptDetails,
LlmContextFile,
RootPromptDefinition,
} from '../shared-interfaces.js';
import {DEFAULT_MAX_REPAIR_ATTEMPTS} from '../configuration/constants.js';
import {ProgressLogger} from '../progress/progress-logger.js';
import {runBuild} from './build-worker.js';
Expand Down Expand Up @@ -31,9 +36,9 @@ import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';
* @param workerConcurrencyQueue Concurrency queue for controlling parallelism of worker invocations (as they are more expensive than LLM calls).
*/
export async function attemptBuild(
config: AssessmentConfig,
evalID: EvalID,
gateway: Gateway<Environment>,
model: string,
env: Environment,
rootPromptDef: RootPromptDefinition,
directory: string,
Expand All @@ -43,12 +48,7 @@ export async function attemptBuild(
abortSignal: AbortSignal,
workerConcurrencyQueue: PQueue,
progress: ProgressLogger,
skipScreenshots: boolean,
skipAxeTesting: boolean,
enableAutoCsp: boolean,
skipLighthouse: boolean,
userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined,
maxAxeRepairAttempts: number,
) {
const initialBuildResult = await runBuild(
evalID,
Expand Down Expand Up @@ -93,7 +93,7 @@ export async function attemptBuild(
const attempt = await repairAndBuild(
evalID,
gateway,
model,
config.model,
env,
rootPromptDef,
directory,
Expand All @@ -115,6 +115,7 @@ export async function attemptBuild(
// Now that we got a working app, try to serve it and collect
// findings from the running app.
lastAttempt.serveTestingResult = await serveAndTestApp(
config,
evalID,
gateway,
directory,
Expand All @@ -123,10 +124,6 @@ export async function attemptBuild(
workerConcurrencyQueue,
abortSignal,
progress,
skipScreenshots,
skipAxeTesting,
enableAutoCsp,
skipLighthouse,
userJourneyAgentTaskInput,
);
}
Expand All @@ -138,7 +135,7 @@ export async function attemptBuild(
while (
lastAttempt.serveTestingResult &&
(lastAttempt.serveTestingResult.axeViolations?.length ?? 0) > 0 &&
axeRepairAttempts < maxAxeRepairAttempts
axeRepairAttempts < (config.a11yRepairAttempts ?? 0)
) {
axeRepairAttempts++;
progress.log(
Expand All @@ -158,7 +155,7 @@ export async function attemptBuild(
const attempt = await repairAndBuild(
evalID,
gateway,
model,
config.model,
env,
rootPromptDef,
directory,
Expand All @@ -185,6 +182,7 @@ export async function attemptBuild(
// Re-run serving & tests after Axe repair.
// This allows us to check if we fixed the violations.
attempt.serveTestingResult = await serveAndTestApp(
config,
evalID,
gateway,
directory,
Expand All @@ -193,10 +191,6 @@ export async function attemptBuild(
workerConcurrencyQueue,
abortSignal,
progress,
skipScreenshots,
skipAxeTesting,
enableAutoCsp,
skipLighthouse,
userJourneyAgentTaskInput,
);

Expand Down
91 changes: 21 additions & 70 deletions runner/orchestration/generate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import {Environment} from '../configuration/environment.js';
import {rateGeneratedCode} from '../ratings/rate-code.js';
import {redX} from '../reporting/format.js';
import {
AssessmentConfig,
AssessmentResult,
AttemptDetails,
CompletionStats,
Expand Down Expand Up @@ -49,7 +50,7 @@ import {getRunGroupId} from './grouping.js';
import {executeCommand} from '../utils/exec.js';
import {EvalID, Gateway} from './gateway.js';
import {LocalEnvironment} from '../configuration/environment-local.js';
import {getRunnerByName, RunnerName} from '../codegen/runner-creation.js';
import {getRunnerByName} from '../codegen/runner-creation.js';
import {summarizeReportWithAI} from '../reporting/report-ai-summary.js';

/**
Expand All @@ -64,29 +65,7 @@ import {summarizeReportWithAI} from '../reporting/report-ai-summary.js';
* @returns A Promise that resolves to a RunInfo object describing the run. The individual
* AssessmentResult objects each contain the prompt, generated code, and final validation status.
*/
export async function generateCodeAndAssess(options: {
model: string;
runner: RunnerName;
environmentConfigPath: string;
localMode: boolean;
limit: number;
concurrency: number | 'auto';
reportName: string;
skipScreenshots: boolean;
startMcp?: boolean;
ragEndpoint?: string;
outputDirectory?: string;
promptFilter?: string;
labels: string[];
skipAiSummary?: boolean;
skipAxeTesting: boolean;
enableUserJourneyTesting?: boolean;
enableAutoCsp?: boolean;
logging?: 'text-only' | 'dynamic';
autoraterModel?: string;
a11yRepairAttempts?: number;
skipLighthouse?: boolean;
}): Promise<RunInfo> {
export async function generateCodeAndAssess(options: AssessmentConfig): Promise<RunInfo> {
const env = await getEnvironmentByPath(options.environmentConfigPath, options.runner);
const ratingLlm = await getRunnerByName('genkit');

Expand Down Expand Up @@ -162,25 +141,15 @@ export async function generateCodeAndAssess(options: {
`Evaluation of ${rootPromptDef.name}`,
async abortSignal =>
startEvaluationTask(
options,
evalID,
env,
env.gateway,
ratingLlm,
options.model,
rootPromptDef,
options.localMode,
options.skipScreenshots,
options.outputDirectory,
options.ragEndpoint,
abortSignal,
options.skipAxeTesting,
!!options.enableUserJourneyTesting,
!!options.enableAutoCsp,
workerConcurrencyQueue,
progress,
options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME,
options.a11yRepairAttempts ?? 0,
!!options.skipLighthouse,
),
// 10min max per app evaluation. We just want to make sure it never gets stuck.
10,
Expand Down Expand Up @@ -291,40 +260,30 @@ export async function generateCodeAndAssess(options: {
* @returns A Promise that resolves to an array of AssessmentResult objects containing all details of the task's execution.
*/
async function startEvaluationTask(
config: AssessmentConfig,
evalID: EvalID,
env: Environment,
gateway: Gateway<Environment>,
ratingLlm: GenkitRunner,
model: string,
rootPromptDef: PromptDefinition | MultiStepPromptDefinition,
localMode: boolean,
skipScreenshots: boolean,
outputDirectory: string | undefined,
ragEndpoint: string | undefined,
abortSignal: AbortSignal,
skipAxeTesting: boolean,
enableUserJourneyTesting: boolean,
enableAutoCsp: boolean,
workerConcurrencyQueue: PQueue,
progress: ProgressLogger,
autoraterModel: string,
a11yRepairAttempts: number,
skipLighthouse: boolean,
): Promise<AssessmentResult[]> {
// Set up the project structure once for the root project.
const {directory, cleanup} = await setupProjectStructure(
env,
rootPromptDef,
progress,
outputDirectory,
config.outputDirectory,
);

const results: AssessmentResult[] = [];
const defsToExecute = rootPromptDef.kind === 'single' ? [rootPromptDef] : rootPromptDef.steps;

for (const promptDef of defsToExecute) {
const [fullPromptText, systemInstructions] = await Promise.all([
env.getPrompt(promptDef.systemPromptType, promptDef.prompt, ragEndpoint),
env.getPrompt(promptDef.systemPromptType, promptDef.prompt, config.ragEndpoint),
env.getPrompt(promptDef.systemPromptType, ''),
]);

Expand All @@ -334,9 +293,8 @@ async function startEvaluationTask(

// Generate the initial set of files through the LLM.
const initialResponse = await generateInitialFiles(
config,
evalID,
gateway,
model,
env,
promptDef,
{
Expand All @@ -349,7 +307,6 @@ async function startEvaluationTask(
possiblePackageManagers: getPossiblePackageManagers().slice(),
},
contextFiles,
localMode,
abortSignal,
progress,
);
Expand Down Expand Up @@ -406,21 +363,22 @@ async function startEvaluationTask(

// TODO: Only execute the serve command on the "final working attempt".
// TODO: Incorporate usage.
const userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined = enableUserJourneyTesting
? {
userJourneys: userJourneys.result,
appPrompt: defsToExecute[0].prompt,
}
: undefined;
const userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined =
config.enableUserJourneyTesting
? {
userJourneys: userJourneys.result,
appPrompt: defsToExecute[0].prompt,
}
: undefined;

const attemptDetails: AttemptDetails[] = []; // Store details for assessment.json

// Try to build the files in the root prompt directory.
// This will also attempt to fix issues with the generated code.
const attempt = await attemptBuild(
config,
evalID,
gateway,
model,
env,
rootPromptDef,
directory,
Expand All @@ -430,12 +388,7 @@ async function startEvaluationTask(
abortSignal,
workerConcurrencyQueue,
progress,
skipScreenshots,
skipAxeTesting,
enableAutoCsp,
skipLighthouse,
userJourneyAgentTaskInput,
a11yRepairAttempts,
);

if (!attempt) {
Expand All @@ -455,7 +408,7 @@ async function startEvaluationTask(
attempt.axeRepairAttempts,
abortSignal,
progress,
autoraterModel,
config.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME,
);

results.push({
Expand Down Expand Up @@ -493,18 +446,16 @@ async function startEvaluationTask(
* @param abortSignal Signal to fire when this process should be aborted.
*/
async function generateInitialFiles(
options: AssessmentConfig,
evalID: EvalID,
gateway: Gateway<Environment>,
model: string,
env: Environment,
promptDef: RootPromptDefinition,
codegenContext: LlmGenerateFilesContext,
contextFiles: LlmContextFile[],
localMode: boolean,
abortSignal: AbortSignal,
progress: ProgressLogger,
): Promise<LlmGenerateFilesResponse> {
if (localMode) {
if (options.localMode) {
const localFilesDirectory = join(LLM_OUTPUT_DIR, env.id, promptDef.name);
const filePaths = globSync('**/*', {cwd: localFilesDirectory});

Expand All @@ -531,10 +482,10 @@ async function generateInitialFiles(

progress.log(promptDef, 'codegen', 'Generating code with AI');

const response = await gateway.generateInitialFiles(
const response = await env.gateway.generateInitialFiles(
evalID,
codegenContext,
model,
options.model,
contextFiles,
abortSignal,
);
Expand Down
15 changes: 6 additions & 9 deletions runner/orchestration/serve-testing-worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import {ChildProcess, fork} from 'node:child_process';
import path from 'node:path';
import {Environment} from '../configuration/environment.js';
import {ProgressLogger} from '../progress/progress-logger.js';
import {RootPromptDefinition} from '../shared-interfaces.js';
import {AssessmentConfig, RootPromptDefinition} from '../shared-interfaces.js';
import {killChildProcessGracefully} from '../utils/kill-gracefully.js';
import {
ServeTestingResult,
Expand All @@ -15,6 +15,7 @@ import PQueue from 'p-queue';

/** Attempts to run & test an eval app. */
export async function serveAndTestApp(
config: AssessmentConfig,
evalID: EvalID,
gateway: Gateway<Environment>,
appDirectoryPath: string,
Expand All @@ -23,10 +24,6 @@ export async function serveAndTestApp(
workerConcurrencyQueue: PQueue,
abortSignal: AbortSignal,
progress: ProgressLogger,
skipScreenshots: boolean,
skipAxeTesting: boolean,
enableAutoCsp: boolean,
skipLighthouse: boolean,
userJourneyAgentTaskInput?: BrowserAgentTaskInput,
): Promise<ServeTestingResult> {
progress.log(rootPromptDef, 'serve-testing', `Testing the app`);
Expand All @@ -41,10 +38,10 @@ export async function serveAndTestApp(
const serveParams: ServeTestingWorkerMessage = {
serveUrl,
appName: rootPromptDef.name,
enableAutoCsp,
includeAxeTesting: skipAxeTesting === false,
takeScreenshots: skipScreenshots === false,
includeLighthouseData: skipLighthouse === false,
enableAutoCsp: !!config.enableAutoCsp,
includeAxeTesting: config.skipAxeTesting === false,
takeScreenshots: config.skipScreenshots === false,
includeLighthouseData: config.skipLighthouse === false,
userJourneyAgentTaskInput,
};

Expand Down
Loading
Loading