Commit b433915

feat(runner): add support for running and repairing tests
This commit introduces the ability to run tests against the generated code as part of the evaluation process. A new optional `testCommand` can be specified in the environment configuration. If provided, this command is executed after a successful build. If the tests fail, the tool attempts to repair the code using the LLM, similar to how build failures are handled. The number of repair attempts is configurable. The report has been updated to display the test results for each run, including whether the tests passed, failed, or passed after repair. The summary view also includes aggregated statistics about the test results.
1 parent e0aa394 commit b433915

23 files changed: +611 −32 lines
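
The flow described above, reduced to a sketch (the names follow the build-serve-loop.ts diff below, but the argument lists are abbreviated and this is not the literal implementation):

    // Simplified sketch of the new test phase in attemptBuildAndTest (build-serve-loop.ts).
    // Assumes the build already succeeded and the environment defines `testCommand`.
    let testResult = await runTest(/* evalID, gateway, directory, env, ... */);
    let testRepairAttempts = 0;
    const maxTestRepairAttempts = gateway.shouldRetryFailedTests(evalID)
      ? DEFAULT_MAX_TEST_REPAIR_ATTEMPTS // currently 1
      : 0;

    while (!testResult.passed && testRepairAttempts < maxTestRepairAttempts) {
      testRepairAttempts++;
      // Feed the failing test output back to the LLM, apply the repaired files, and re-run the tests.
      const attempt = await repairAndTest(/* ..., lastAttempt.outputFiles, testResult.output, ... */);
      testResult = attempt.testResult!;
    }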

docs/environment-reference.md

Lines changed: 5 additions & 0 deletions

@@ -179,3 +179,8 @@ Defaults to `<package manager> run build`.
 
 Command used to start a local dev server as a part of the evaluation.
 Defaults to `<package manager> run start --port 0`.
+
+### `testCommand`
+
+Command used to run tests against the generated code. If this property is not provided, tests will not be run. The command should exit with code 0 on success and a non-zero exit code on failure. The output from the command (both `stdout` and `stderr`) is captured and used for repair attempts if the tests fail. The test command will time out after 2 minutes.
+
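
For illustration, a local environment configuration opting into this behavior might look like the following sketch. The surrounding fields, the export shape, and the specific commands are assumptions; only the `testCommand` property itself is defined by this commit:

    // Hypothetical environment config validated by `localEnvironmentConfigSchema` (see below).
    export default {
      buildCommand: 'npm run build',          // optional; defaults to `<package manager> run build`
      serveCommand: 'npm run start --port 0', // optional; default shown above
      // New: runs after a successful build; must exit with code 0 on success.
      // stdout/stderr are captured and fed back to the LLM if the tests fail.
      testCommand: 'npm run test',
    };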

report-app/src/app/pages/report-viewer/report-viewer.html

Lines changed: 63 additions & 0 deletions

@@ -76,6 +76,20 @@ <h3 class="chart-title">
          />
        </div>
      </div>
+      @if (overview.stats.tests) {
+        <div class="chart-container test-results-details">
+          <h3 class="chart-title">
+            <span class="material-symbols-outlined"> quiz </span>
+            <span>Tests</span>
+          </h3>
+          <div class="summary-card-item">
+            <stacked-bar-chart
+              [data]="testsAsGraphData(overview.stats.tests)"
+              [compact]="true"
+            />
+          </div>
+        </div>
+      }
      @if (overview.stats.runtime) {
        <div class="chart-container">
          <h3 class="chart-title">
@@ -273,6 +287,17 @@ <h2>Generated applications</h2>
            @if (initialAttempt?.buildResult?.status === 'error') {
              <span class="status-badge error">Initial build failed</span>
            }
+
+            <!-- Test status badges -->
+            @if (finalAttempt.testResult) {
+              @if (finalAttempt.testResult.passed) {
+                @if ((result.testRepairAttempts || 0) > 0) {
+                  <span class="status-badge warning">Tests passed after repair</span>
+                }
+              } @else {
+                <span class="status-badge error">Tests failed</span>
+              }
+            }
          </div>
        </div>
      </expansion-panel-header>
@@ -348,6 +373,29 @@ <h5>
          </div>
        </div>

+        @if (result.testResult) {
+          <div class="app-details-section">
+            <h4>Test Results</h4>
+            <div class="test-summary">
+              @if (result.testResult.passed) {
+                <span class="status-text success">✔ Tests passed</span>
+                @if ((result.testRepairAttempts || 0) > 0) {
+                  <span class="status-text">&nbsp;after {{ result.testRepairAttempts }} repair attempt(s)</span>
+                }
+              } @else {
+                <span class="status-text error">✘ Tests failed</span>
+              }
+            </div>
+
+            @if (result.testResult.output && !result.testResult.passed) {
+              <details class="test-output-button">
+                <summary class="neutral-button">See Test Output</summary>
+                <pre class="callout neutral code">{{ result.testResult.output }}</pre>
+              </details>
+            }
+          </div>
+        }
+
        <div class="app-details-section">
          <h4>Additional info</h4>
          @for (attempt of result.attemptDetails; track attempt) {
@@ -356,6 +404,7 @@ <h4>Additional info</h4>
              attempt.serveTestingResult?.axeViolations;
            @let hasAxeViolations =
              axeViolations && axeViolations.length > 0;
+            @let testsFailed = attempt.testResult?.passed === false;

            <expansion-panel #expansionPanel>
              <expansion-panel-header>
@@ -380,6 +429,15 @@ <h4>Additional info</h4>
                  >A11y</span
                >
              }
+
+              @if (attempt.testResult) {
+                <span
+                  class="status-badge"
+                  [class.error]="!attempt.testResult.passed"
+                  [class.success]="attempt.testResult.passed"
+                  >Tests</span
+                >
+              }
            </expansion-panel-header>

            @if (expansionPanel.opened()) {
@@ -418,6 +476,11 @@ <h4>A11y Violations</h4>
                </pre>
              }

+              @if (testsFailed) {
+                <h4>Failed Tests</h4>
+                <pre class="callout neutral code">{{ attempt.testResult?.output }}</pre>
+              }
+
              <h4>Generated Code</h4>

              @for (file of attempt.outputFiles; track file) {

report-app/src/app/pages/report-viewer/report-viewer.ts

Lines changed: 26 additions & 0 deletions

@@ -23,6 +23,7 @@ import {
   LlmResponseFile,
   RunInfo,
   RunSummaryBuilds,
+  RunSummaryTests,
   RuntimeStats,
   ScoreBucket,
   SkippedIndividualAssessment,
@@ -264,6 +265,31 @@ export class ReportViewer {
     ];
   }

+  protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData {
+    return [
+      {
+        label: 'Passed',
+        color: ScoreCssVariable.excellent,
+        value: tests.successfulInitialTests,
+      },
+      {
+        label: 'Passed after repair',
+        color: ScoreCssVariable.great,
+        value: tests.successfulTestsAfterRepair,
+      },
+      {
+        label: 'Failed',
+        color: ScoreCssVariable.poor,
+        value: tests.failedTests,
+      },
+      {
+        label: 'No tests run',
+        color: ScoreCssVariable.neutral,
+        value: tests.noTestsRun,
+      },
+    ];
+  }
+
   protected checksAsGraphData(buckets: ScoreBucket[]): StackedBarChartData {
     return buckets.map(b => ({
       label: b.nameWithLabels,

runner/configuration/constants.ts

Lines changed: 6 additions & 0 deletions

@@ -26,6 +26,12 @@ export const LLM_OUTPUT_DIR = join(rootDir, 'llm-output');
  */
 export const DEFAULT_MAX_REPAIR_ATTEMPTS = 1;

+/**
+ * Number of times we'll try to ask LLM to repair a test failure,
+ * providing the test output and the code that causes the problem.
+ */
+export const DEFAULT_MAX_TEST_REPAIR_ATTEMPTS = 1;
+
 /** Name of the folder where we store all generated reports */
 export const REPORTS_ROOT_DIR = join(rootDir, 'reports');

runner/configuration/environment-local.ts

Lines changed: 7 additions & 0 deletions

@@ -28,6 +28,10 @@ export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({
    * Defaults to `<package manager> run start --port 0`.
    */
   serveCommand: z.string().optional(),
+  /**
+   * Command to run when testing the code.
+   */
+  testCommand: z.string().optional(),
   /**
    * Whether to skip installing dependencies when running evals in the environment.
    * Useful if you're managing dependencies yourself.
@@ -47,6 +51,8 @@ export class LocalEnvironment extends BaseEnvironment {
   readonly buildCommand: string;
   /** Command to run when starting a development server inside the app. */
   readonly serveCommand: string;
+  /** Command to run when starting tests inside the app. */
+  readonly testCommand: string | null;
   /**
    * Absolute path at which files specific to this environment are located. Will be merged in
    * with the files from the `projectTemplatePath` to get the final project structure.
@@ -82,6 +88,7 @@ export class LocalEnvironment extends BaseEnvironment {
     this.installCommand = `${packageManager} install --silent`;
     this.buildCommand = config.buildCommand || `${packageManager} run build`;
     this.serveCommand = config.serveCommand || this.getDefaultServeCommand(packageManager);
+    this.testCommand = config.testCommand ?? null;
     this.projectTemplatePath = projectTemplatePath;
     this.sourceDirectory = sourceDirectory;
     this.mcpServerOptions = config.mcpServers || [];

runner/orchestration/build-repair.ts

Lines changed: 2 additions & 23 deletions

@@ -12,6 +12,7 @@ import {writeResponseFiles} from './file-system.js';
 import {runBuild} from './build-worker.js';
 import {ProgressLogger} from '../progress/progress-logger.js';
 import {EvalID, Gateway} from './gateway.js';
+import {mergeRepairFiles} from './repair.js';

 /**
  * Calls the LLM to repair code, handles the response, and attempts to build the project again.
@@ -28,7 +29,6 @@ import {EvalID, Gateway} from './gateway.js';
  * @param abortSignal An AbortSignal to cancel the operation.
  * @param workerConcurrencyQueue The queue for managing worker concurrency.
  * @param attempts The current attempt number.
- * @param repairType The type of repair being performed.
  * @returns A promise that resolves to the new BuildResult.
  */
 export async function repairAndBuild(
@@ -49,7 +49,7 @@
 ): Promise<AttemptDetails> {
   const repairResponse = await repairCodeWithAI(
     evalID,
-    gateway,
+    gateway.repairBuild.bind(gateway),
     model,
     env,
     rootPromptDef,
@@ -132,24 +132,3 @@ async function handleRepairResponse(
     attempt: attempts,
   };
 }
-
-/**
- * Merges a set of new or updated files from a repair attempt into the
- * current set of files.
- * @param repairOutputFiles The array of new or updated files to merge.
- * @param finalFiles The array of files to be updated.
- */
-function mergeRepairFiles(repairOutputFiles: LlmResponseFile[], finalFiles: LlmResponseFile[]) {
-  // Merge the repair response into the original files. Otherwise we may end up dropping
-  // files that were valid in the initial response and the LLM decided not to touch, because
-  // they're still valid.
-  for (const file of repairOutputFiles) {
-    const existingFile = finalFiles.find(f => f.filePath === file.filePath);
-
-    if (existingFile) {
-      existingFile.code = file.code;
-    } else {
-      finalFiles.push(file);
-    }
-  }
-}

runner/orchestration/build-serve-loop.ts

Lines changed: 67 additions & 3 deletions

@@ -2,13 +2,23 @@ import PQueue from 'p-queue';
 import {LlmGenerateFilesResponse} from '../codegen/llm-runner.js';
 import {BuildResultStatus} from '../workers/builder/builder-types.js';
 import {Environment} from '../configuration/environment.js';
-import {AttemptDetails, LlmContextFile, RootPromptDefinition} from '../shared-interfaces.js';
-import {DEFAULT_MAX_REPAIR_ATTEMPTS} from '../configuration/constants.js';
+import {
+  AttemptDetails,
+  LlmContextFile,
+  RootPromptDefinition,
+  TestResult,
+} from '../shared-interfaces.js';
+import {
+  DEFAULT_MAX_REPAIR_ATTEMPTS,
+  DEFAULT_MAX_TEST_REPAIR_ATTEMPTS,
+} from '../configuration/constants.js';
 import {ProgressLogger} from '../progress/progress-logger.js';
 import {runBuild} from './build-worker.js';
 import {repairAndBuild} from './build-repair.js';
 import {EvalID, Gateway} from './gateway.js';
 import {serveAndTestApp} from './serve-testing-worker.js';
+import {runTest} from './test-worker.js';
+import {repairAndTest} from './test-repair.js';
 import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';

 /**
@@ -30,7 +40,7 @@ import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js';
  * @param abortSignal Signal to fire when the build should be aborted.
  * @param workerConcurrencyQueue Concurrency queue for controlling parallelism of worker invocations (as they are more expensive than LLM calls).
  */
-export async function attemptBuild(
+export async function attemptBuildAndTest(
   evalID: EvalID,
   gateway: Gateway<Environment>,
   model: string,
@@ -202,11 +212,65 @@
     }
   }

+  // Run tests if test command is configured and build was successful
+  let testResult: TestResult | null = null;
+  let testRepairAttempts = 0;
+
+  if (lastAttempt.buildResult.status === BuildResultStatus.SUCCESS && 'testCommand' in env) {
+    testResult = await runTest(
+      evalID,
+      gateway,
+      directory,
+      env,
+      rootPromptDef,
+      abortSignal,
+      workerConcurrencyQueue,
+      progress,
+    );
+
+    const maxTestRepairAttempts = gateway.shouldRetryFailedTests(evalID)
+      ? DEFAULT_MAX_TEST_REPAIR_ATTEMPTS
+      : 0;
+
+    lastAttempt.testResult = testResult;
+
+    while (!testResult.passed && testRepairAttempts < maxTestRepairAttempts) {
+      testRepairAttempts++;
+      progress.log(
+        rootPromptDef,
+        'test',
+        `Trying to repair app tests (attempt #${testRepairAttempts + 1})`,
+      );
+
+      const attempt = await repairAndTest(
+        evalID,
+        gateway,
+        model,
+        env,
+        rootPromptDef,
+        directory,
+        lastAttempt.outputFiles,
+        testResult.output,
+        'The tests failed. Attempt to fix them. There are the following test errors:',
+        contextFiles,
+        abortSignal,
+        workerConcurrencyQueue,
+        testRepairAttempts,
+        progress,
+      );
+      attemptDetails.push(attempt);
+      lastAttempt = attempt;
+      testResult = lastAttempt.testResult!;
+    }
+  }
+
   return {
     buildResult: lastAttempt.buildResult,
     serveTestingResult: lastAttempt.serveTestingResult,
     outputFiles: lastAttempt.outputFiles,
     repairAttempts,
     axeRepairAttempts,
+    testResult,
+    testRepairAttempts,
   };
 }

runner/orchestration/codegen.ts

Lines changed: 2 additions & 2 deletions

@@ -88,7 +88,7 @@ export async function generateCodeWithAI(
  */
 export async function repairCodeWithAI(
   evalID: EvalID,
-  gateway: Gateway<Environment>,
+  repairer: Gateway<Environment>['repairBuild'] | Gateway<Environment>['repairTest'],
   model: string,
   env: Environment,
   promptDef: RootPromptDefinition,
@@ -123,7 +123,7 @@

   progress.log(promptDef, 'codegen', 'Repairing code with AI');

-  const response = await gateway.repairBuild(
+  const response = await repairer(
     evalID,
     context,
     model,
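
With this change, callers choose which gateway repair method `repairCodeWithAI` uses. A minimal sketch of the two call sites (argument lists abbreviated; the test-repair call is assumed to live in the new test-repair.ts, which is not shown in this excerpt):

    // Build repair, as updated in build-repair.ts above:
    await repairCodeWithAI(evalID, gateway.repairBuild.bind(gateway), model, env, /* ... */);
    // Test repair, presumably invoked from the new repairAndTest path:
    await repairCodeWithAI(evalID, gateway.repairTest.bind(gateway), model, env, /* ... */);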
