diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts index 153dd5f..9e7d7b4 100644 --- a/runner/orchestration/generate.ts +++ b/runner/orchestration/generate.ts @@ -155,9 +155,10 @@ export async function generateCodeAndAssess(options: { appConcurrencyQueue.add( async () => { const evalID = await env.gateway.initializeEval(); + let results: AssessmentResult[] | undefined; try { - return await callWithTimeout( + results = await callWithTimeout( `Evaluation of ${rootPromptDef.name}`, async abortSignal => startEvaluationTask( @@ -183,6 +184,7 @@ export async function generateCodeAndAssess(options: { // 10min max per app evaluation. We just want to make sure it never gets stuck. 10, ); + return results; } catch (e: unknown) { failedPrompts.push({ promptName: rootPromptDef.name, @@ -198,8 +200,7 @@ export async function generateCodeAndAssess(options: { progress.log(rootPromptDef, 'error', 'Failed to evaluate code', details); return [] satisfies AssessmentResult[]; } finally { - progress.log(rootPromptDef, 'done', 'Done'); - + progress.evalFinished(rootPromptDef, results || []); await env.gateway.finalizeEval(evalID); } }, diff --git a/runner/progress/dynamic-progress-logger.ts b/runner/progress/dynamic-progress-logger.ts index 6e7e9a6..949cf96 100644 --- a/runner/progress/dynamic-progress-logger.ts +++ b/runner/progress/dynamic-progress-logger.ts @@ -1,6 +1,6 @@ import {MultiBar, SingleBar, Presets} from 'cli-progress'; import chalk from 'chalk'; -import {RootPromptDefinition} from '../shared-interfaces.js'; +import {AssessmentResult, RootPromptDefinition} from '../shared-interfaces.js'; import {ProgressLogger, ProgressType, progressTypeToIcon} from './progress-logger.js'; import {redX} from '../reporting/format.js'; @@ -13,6 +13,8 @@ export class DynamicProgressLogger implements ProgressLogger { private pendingBars = new Map(); private spinnerFrames = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏']; private currentSpinnerFrame = 0; + 
private completedEvals = 0; + private totalScore = 0; private spinnerInterval: ReturnType | undefined; private errors: { prompt: RootPromptDefinition; @@ -46,10 +48,17 @@ export class DynamicProgressLogger implements ProgressLogger { ); // Bar that tracks how many prompts are completed in total. - this.totalBar = this.wrapper.create(total, 0, undefined, { - format: '{bar} {spinner} {value}/{total} prompts completed', - barsize: PREFIX_WIDTH, - }); + this.totalBar = this.wrapper.create( + total, + 0, + { + additionalInfo: '', + }, + { + format: '{bar} {spinner} {value}/{total} prompts completed{additionalInfo}', + barsize: PREFIX_WIDTH, + }, + ); // Interval to update the spinner. this.spinnerInterval = setInterval(() => { @@ -74,6 +83,7 @@ export class DynamicProgressLogger implements ProgressLogger { this.wrapper?.stop(); this.pendingBars.clear(); this.wrapper = this.totalBar = this.spinnerInterval = undefined; + this.completedEvals = this.totalScore = 0; for (const error of this.errors) { let message = `${redX()} [${error.prompt.name}] ${error.message}`; @@ -91,17 +101,6 @@ export class DynamicProgressLogger implements ProgressLogger { let bar = this.pendingBars.get(prompt); - // Drop the bar from the screen if it's complete. - if (type === 'done') { - this.pendingBars.delete(prompt); - - if (bar) { - this.totalBar.increment(); - this.wrapper.remove(bar); - } - return; - } - // Capture errors for static printing once the dynamic progress is hidden. 
if (type === 'error') { this.errors.push({prompt, message, details}); @@ -117,14 +116,48 @@ export class DynamicProgressLogger implements ProgressLogger { if (bar) { bar.update(0, payload); } else { - const bar = this.wrapper.create(1, 0, payload); + bar = this.wrapper.create(1, 0, payload); this.pendingBars.set(prompt, bar); } } + /** + * Marks the eval of a prompt as finished: folds the scores of `results` into the running + * average, advances the total bar and removes the prompt's own bar from the screen. + * @param prompt Prompt whose eval has finished. + * @param results Assessment results for the prompt. + */ + evalFinished(prompt: RootPromptDefinition, results: AssessmentResult[]): void { + const bar = this.pendingBars.get(prompt); + this.pendingBars.delete(prompt); + + for (const result of results) { + const percentage = (result.score.totalPoints / result.score.maxOverallPoints) * 100; + + // Skip non-finite scores (e.g. when `maxOverallPoints` is 0) so that a single NaN or + // Infinity doesn't corrupt the running average for the rest of the run. + if (Number.isFinite(percentage)) { + this.completedEvals++; + this.totalScore += percentage; + } + } + + if (this.completedEvals > 0) { + this.totalBar?.increment(1, { + additionalInfo: `, ${Math.round(this.totalScore / this.completedEvals)}% score on average`, + }); + } else { + this.totalBar?.increment(); + } + + // Drop the bar from the screen if it's complete. + if (bar) { + this.wrapper?.remove(bar); + } + } + private getColorFunction(type: ProgressType): (value: string) => string { switch (type) { - case 'done': case 'success': case 'serve-testing': case 'build': diff --git a/runner/progress/noop-progress-logger.ts b/runner/progress/noop-progress-logger.ts index 1b0f815..5f11485 100644 --- a/runner/progress/noop-progress-logger.ts +++ b/runner/progress/noop-progress-logger.ts @@ -5,4 +5,5 @@ export class NoopProgressLogger implements ProgressLogger { initialize(): void {} finalize(): void {} log(): void {} + evalFinished(): void {} } diff --git a/runner/progress/progress-logger.ts b/runner/progress/progress-logger.ts index 820c7a6..c888aba 100644 --- a/runner/progress/progress-logger.ts +++ b/runner/progress/progress-logger.ts @@ -1,15 +1,8 @@ import {greenCheckmark, redX} from '../reporting/format.js'; -import {RootPromptDefinition} from '../shared-interfaces.js'; +import {AssessmentResult, RootPromptDefinition} from '../shared-interfaces.js'; /** Possible progress event types. 
*/ -export type ProgressType = - | 'codegen' - | 'build' - | 'serve-testing' - | 'success' - | 'error' - | 'eval' - | 'done'; +export type ProgressType = 'codegen' | 'build' | 'serve-testing' | 'success' | 'error' | 'eval'; /** Maps a ProgressType to an icon that can represent it. */ export function progressTypeToIcon(type: ProgressType): string { @@ -27,8 +20,6 @@ export function progressTypeToIcon(type: ProgressType): string { return redX(); case 'eval': return '🔎'; - case 'done': - return '🏁'; } } @@ -43,6 +34,13 @@ export interface ProgressLogger { /** Stops the logging process. */ finalize(): void; + /** + * Logs when an individual eval has finished. + * @param prompt Prompt associated with the event. + * @param results Assessment results for the prompt. + */ + evalFinished(prompt: RootPromptDefinition, results: AssessmentResult[]): void; + /** * Logs a progress event to the logger. * @param prompt Prompt associated with the event. diff --git a/runner/progress/text-progress-logger.ts b/runner/progress/text-progress-logger.ts index 3ecd959..26af196 100644 --- a/runner/progress/text-progress-logger.ts +++ b/runner/progress/text-progress-logger.ts @@ -17,13 +17,11 @@ export class TextProgressLogger implements ProgressLogger { log(prompt: RootPromptDefinition, type: ProgressType, message: string, details?: string): void { const icon = progressTypeToIcon(type); - - if (type === 'done') { - // It's handy to know how many apps are done when one completes. - const suffix = `(${++this.done}/${this.total})`; - details = details ? `${details} ${suffix}` : suffix; - } - console.log(`[${prompt.name}] ${icon} ${message} ${details || ''}`.trim()); } + + evalFinished(prompt: RootPromptDefinition): void { + // It's handy to know how many apps are done when one completes. 
+ console.log(`[${prompt.name}] 🏁 Done (${++this.done}/${this.total})`.trim()); + } } diff --git a/runner/run-cli.ts b/runner/run-cli.ts index 73a44b6..2183fd5 100644 --- a/runner/run-cli.ts +++ b/runner/run-cli.ts @@ -178,6 +178,7 @@ async function getPossiblePrompts(environmentDir: string): Promise { class ErrorOnlyProgressLogger implements ProgressLogger { initialize(): void {} finalize(): void {} + evalFinished(): void {} log(_: unknown, type: ProgressType, message: string, details?: string) { if (type === 'error') {