diff --git a/docs/environment-reference.md b/docs/environment-reference.md index 3b3f7ba..c37fbea 100644 --- a/docs/environment-reference.md +++ b/docs/environment-reference.md @@ -179,3 +179,8 @@ Defaults to `<packageManager> run build`. Command used to start a local dev server as part of the evaluation. Defaults to `<packageManager> run start --port 0`. + +### `testCommand` + +Command used to run tests against the generated code. If this property is not provided, tests will not be run. The command should exit with code 0 on success and a non-zero exit code on failure. The output from the command (both `stdout` and `stderr`) is captured and used for repair attempts if the tests fail. The test command will time out after 4 minutes. +
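For illustration, a config using this property might look like the sketch below (the command value is a placeholder; any command honoring the exit-code contract works):

```ts
// Hypothetical excerpt of a local environment config.
{
  // ...other environment options...
  buildCommand: 'npm run build',
  testCommand: 'npm run test -- --no-watch', // exit code 0 = tests passed
}
```

diff --git a/report-app/src/app/pages/report-viewer/report-viewer.html b/report-app/src/app/pages/report-viewer/report-viewer.html index 274842c..0a74fd7 100644 --- a/report-app/src/app/pages/report-viewer/report-viewer.html +++ b/report-app/src/app/pages/report-viewer/report-viewer.html @@ -73,6 +73,20 @@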

+ @if (overview.stats.tests) { +
+

+ quiz + Tests +

+
+ +
+
+ } @if (overview.stats.runtime) {

@@ -281,9 +295,19 @@

Generated applications

Initial build failed } - @if (hasBuildFailureDuringA11yRepair(result)) { + @if (hasBuildFailureDuringTestRepair(result)) { Build failed during test repair } + + @if (finalAttempt.testResult) { + @if (finalAttempt.testResult.passed) { + @if ((result.testRepairAttempts || 0) > 0) { + Tests passed after repair + } + } @else { + Tests failed + } + }
@@ -355,12 +379,36 @@
+ @if (result.testResult) { +
+

Test Results

+
+ @if (result.testResult.passed) { + ✔ Tests passed + @if ((result.testRepairAttempts || 0) > 0) { + &nbsp;after {{ result.testRepairAttempts }} repair attempt(s) + } + } @else { + ✘ Tests failed + } +
+ + @if (result.testResult.output && !result.testResult.passed) { +
+ See Test Output +
{{ result.testResult.output }}
+
+ } +
+ } +

Additional info

@for (attempt of result.attemptDetails; track attempt) { @let isBuilt = attempt.buildResult.status === 'success'; @let axeViolations = attempt.serveTestingResult?.axeViolations; @let hasAxeViolations = axeViolations && axeViolations.length > 0; + @let testsFailed = attempt.testResult?.passed === false; @@ -385,6 +433,15 @@

Additional info

>A11y } + + @if (attempt.testResult) { + Tests + }
@if (expansionPanel.opened()) { @@ -421,6 +478,11 @@

A11y Violations

} + @if (testsFailed) { +

Failed Tests

+
{{ attempt.testResult?.output }}
+ } +

Generated Code

@for (file of attempt.outputFiles; track file) { diff --git a/report-app/src/app/pages/report-viewer/report-viewer.ts b/report-app/src/app/pages/report-viewer/report-viewer.ts index 8e02168..78a9085 100644 --- a/report-app/src/app/pages/report-viewer/report-viewer.ts +++ b/report-app/src/app/pages/report-viewer/report-viewer.ts @@ -25,6 +25,7 @@ import { LlmResponseFile, RunInfo, RunSummaryBuilds, + RunSummaryTests, RuntimeStats, ScoreBucket, SkippedIndividualAssessment, @@ -271,6 +272,31 @@ export class ReportViewer { ]; } + protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData { + return [ + { + label: 'Passed', + color: ScoreCssVariable.excellent, + value: tests.successfulInitialTests, + }, + { + label: 'Passed after repair', + color: ScoreCssVariable.great, + value: tests.successfulTestsAfterRepair, + }, + { + label: 'Failed', + color: ScoreCssVariable.poor, + value: tests.failedTests, + }, + { + label: 'No tests run', + color: ScoreCssVariable.neutral, + value: tests.noTestsRun, + }, + ]; + } + protected checksAsGraphData(buckets: ScoreBucket[]): StackedBarChartData { return buckets.map(b => ({ label: b.nameWithLabels, @@ -427,7 +453,7 @@ export class ReportViewer { return `wcs run --prompt=${result.promptDef.name} --env=`; } - protected hasBuildFailureDuringA11yRepair(result: AssessmentResult): boolean { - return result.attemptDetails.some(attempt => attempt.buildFailedDuringA11yRepair); + protected hasBuildFailureDuringTestRepair(result: AssessmentResult): boolean { + return result.attemptDetails.some(attempt => attempt.buildFailedDuringTestRepair); } } diff --git a/runner/configuration/base-environment-config.ts b/runner/configuration/base-environment-config.ts index fe311ae..4aa3e40 100644 --- a/runner/configuration/base-environment-config.ts +++ b/runner/configuration/base-environment-config.ts @@ -1,8 +1,6 @@ import z from 'zod'; import {ratingSchema} from '../ratings/rating-types.js'; import {MultiStepPrompt} from './multi-step-prompt.js'; -import {mcpServerOptionsSchema} from '../codegen/llm-runner.js'; -import {getPossiblePackageManagers} from './environment-config.js'; export const baseEnvironmentConfigSchema = z.strictObject({ /** Display name for the environment. */ diff --git a/runner/configuration/environment-config.ts b/runner/configuration/environment-config.ts index e15361e..77bdf28 100644 --- a/runner/configuration/environment-config.ts +++ b/runner/configuration/environment-config.ts @@ -15,11 +15,6 @@ const environmentConfigSchema = z.union([ */ export type EnvironmentConfig = z.infer<typeof environmentConfigSchema>; -/** Package managers that are currently supported. */ -export function getPossiblePackageManagers() { - return ['npm', 'pnpm', 'yarn'] as const; -} - /** Asserts that the specified data is a valid environment config.
*/ export function assertIsEnvironmentConfig(value: unknown): asserts value is EnvironmentConfig { const validationResult = environmentConfigSchema.safeParse(value); diff --git a/runner/configuration/environment-local.ts b/runner/configuration/environment-local.ts index 9eefed2..041f9a8 100644 --- a/runner/configuration/environment-local.ts +++ b/runner/configuration/environment-local.ts @@ -3,7 +3,7 @@ import z from 'zod'; import {LlmRunner, McpServerOptions, mcpServerOptionsSchema} from '../codegen/llm-runner.js'; import {LocalGateway} from '../orchestration/gateways/local_gateway.js'; import {BaseEnvironment} from './base-environment.js'; -import {EnvironmentConfig, getPossiblePackageManagers} from './environment-config.js'; +import {getPossiblePackageManagers} from './package-managers.js'; import {baseEnvironmentConfigSchema} from './base-environment-config.js'; export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({ @@ -28,6 +28,10 @@ export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({ * Defaults to `<packageManager> run start --port 0`. */ serveCommand: z.string().optional(), + /** + * Command used to run tests against the generated code. Tests are skipped if omitted. + */ + testCommand: z.string().optional(), /** * Whether to skip installing dependencies when running evals in the environment. * Useful if you're managing dependencies yourself. @@ -47,6 +51,8 @@ export class LocalEnvironment extends BaseEnvironment { readonly buildCommand: string; /** Command to run when starting a development server inside the app. */ readonly serveCommand: string; + /** Command to run tests inside the app, or `null` when tests aren't configured. */ + readonly testCommand: string | null; /** * Absolute path at which files specific to this environment are located. Will be merged in * with the files from the `projectTemplatePath` to get the final project structure. @@ -82,6 +88,7 @@ export class LocalEnvironment extends BaseEnvironment { this.installCommand = `${packageManager} install --silent`; this.buildCommand = config.buildCommand || `${packageManager} run build`; this.serveCommand = config.serveCommand || this.getDefaultServeCommand(packageManager); + this.testCommand = config.testCommand ?? null; this.projectTemplatePath = projectTemplatePath; this.sourceDirectory = sourceDirectory; this.mcpServerOptions = config.mcpServers || [];
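To make the defaulting above concrete, a short sketch (assuming `packageManager` is `npm` and no explicit commands in the config):

```ts
// Resolved LocalEnvironment commands under those assumptions:
//   installCommand -> 'npm install --silent'
//   buildCommand   -> 'npm run build'
//   testCommand    -> null (tests and the test-repair loop are skipped)
// A config that sets testCommand (e.g. the hypothetical 'npm run test:ci')
// opts the environment into test execution and repair.
```

diff --git a/runner/configuration/package-managers.ts b/runner/configuration/package-managers.ts new file mode 100644 index 0000000..6929cd2 --- /dev/null +++ b/runner/configuration/package-managers.ts @@ -0,0 +1,4 @@ +/** Package managers that are currently supported.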
*/ +export function getPossiblePackageManagers() { + return ['npm', 'pnpm', 'yarn'] as const; +} diff --git a/runner/eval-cli.ts b/runner/eval-cli.ts index 8734583..d6eb23c 100644 --- a/runner/eval-cli.ts +++ b/runner/eval-cli.ts @@ -36,7 +36,7 @@ interface Options { enableUserJourneyTesting?: boolean; enableAutoCsp?: boolean; autoraterModel?: string; - a11yRepairAttempts?: number; + testRepairAttempts?: number; logging?: 'text-only' | 'dynamic'; } @@ -148,10 +148,11 @@ function builder(argv: Argv): Argv { default: DEFAULT_AUTORATER_MODEL_NAME, description: 'Model to use when automatically rating generated code', }) - .option('a11y-repair-attempts', { + .option('test-repair-attempts', { type: 'number', default: 0, - description: 'Number of repair attempts for discovered a11y violations', + description: + 'Number of repair attempts for discovered test failures (both a11y violations and failures from the configured testCommand)', }) .strict() .version(false) @@ -196,7 +197,7 @@ async function handler(cliArgs: Arguments): Promise { logging: cliArgs.logging, autoraterModel: cliArgs.autoraterModel, skipAiSummary: cliArgs.skipAiSummary, - a11yRepairAttempts: cliArgs.a11yRepairAttempts, + testRepairAttempts: cliArgs.testRepairAttempts, }); logReportToConsole(runInfo);
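For reference, a hypothetical invocation that opts into two repair attempts (prompt and environment values are placeholders):

```
wcs run --prompt=example-prompt --env=./environment.config.mjs --test-repair-attempts=2
```

diff --git a/runner/orchestration/build-serve-loop.ts b/runner/orchestration/build-serve-loop.ts index 9bbd849..7f7d1b5 100644 --- a/runner/orchestration/build-serve-loop.ts +++ b/runner/orchestration/build-serve-loop.ts @@ -6,9 +6,10 @@ import {AttemptDetails, LlmContextFile, RootPromptDefinition} from '../shared-interfaces.js'; import {DEFAULT_MAX_REPAIR_ATTEMPTS} from '../configuration/constants.js'; import {ProgressLogger} from '../progress/progress-logger.js'; import {runBuild} from './build-worker.js'; -import {repairAndBuild} from './build-repair.js'; +import {repairAndBuild} from './repair.js'; import {EvalID, Gateway} from './gateway.js'; import {serveAndTestApp} from './serve-testing-worker.js'; +import {runTest} from './test-worker.js'; import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js'; /** * @param abortSignal Signal to fire when the build should be aborted. * @param workerConcurrencyQueue Concurrency queue for controlling parallelism of worker invocations (as they are more expensive than LLM calls).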
*/ -export async function attemptBuild( +export async function attemptBuildAndTest( evalID: EvalID, gateway: Gateway, model: string, @@ -47,7 +48,7 @@ export async function attemptBuild( skipAxeTesting: boolean, enableAutoCsp: boolean, userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined, - maxAxeRepairAttempts: number, + maxTestRepairAttempts: number, ) { const initialBuildResult = await runBuild( evalID, @@ -97,13 +98,18 @@ export async function attemptBuild( rootPromptDef, directory, lastAttempt.outputFiles, - lastAttempt.buildResult.message, - 'There are the following build errors:', + [ + { + errorContext: 'There are the following build errors:', + errorMessage: lastAttempt.buildResult.message, + }, + ], contextFiles, abortSignal, workerConcurrencyQueue, repairAttempts, progress, + 'build', ); attemptDetails.push(attempt); @@ -127,31 +133,70 @@ export async function attemptBuild( enableAutoCsp, userJourneyAgentTaskInput, ); + const testResult = await runTest( + evalID, + gateway, + directory, + env, + rootPromptDef, + abortSignal, + workerConcurrencyQueue, + progress, + ); + + if (testResult !== null) { + lastAttempt.testResult = testResult; + } } - // Attempt to repair axe testing. This only runs when the last build - // passed and serving did run. Note: By default, we don't run axe repair + // Attempt to repair testing. This only runs when the last build + // passed and serving did run. Note: By default, we don't run repair // attempts as it's not commonly done by LLMs in the ecosystem. let axeRepairAttempts = 0; - while ( - lastAttempt.serveTestingResult && - (lastAttempt.serveTestingResult.axeViolations?.length ?? 0) > 0 && - axeRepairAttempts < maxAxeRepairAttempts - ) { - axeRepairAttempts++; - progress.log( - rootPromptDef, - 'build', - `Trying to repair axe accessibility violations (attempt #${axeRepairAttempts + 1})...`, - ); + let testRepairAttempts = 0; + for (let testRepairAttempt = 0; testRepairAttempt < maxTestRepairAttempts; testRepairAttempt++) { + const hasAxeFailure = + lastAttempt.serveTestingResult && lastAttempt.serveTestingResult.axeViolations?.length; + const hasTestFailure = lastAttempt.testResult && !lastAttempt.testResult.passed; + if (!hasAxeFailure && !hasTestFailure) { + break; + } - const axeViolationsError = JSON.stringify( - lastAttempt.serveTestingResult.axeViolations, - null, - 2, - ); + const attemptId = testRepairAttempt + repairAttempts + 1; - progress.log(rootPromptDef, 'error', 'Found Axe accessibility violations'); + const errors: Array<{errorContext: string; errorMessage: string}> = []; + if (hasAxeFailure) { + axeRepairAttempts++; + progress.log( + rootPromptDef, + 'build', + `Trying to repair axe accessibility violations (attempt #${attemptId})...`, + ); + const axeViolationsError = JSON.stringify( + lastAttempt.serveTestingResult!.axeViolations, + null, + 2, + ); + progress.log(rootPromptDef, 'error', 'Found Axe accessibility violations'); + errors.push({ + errorContext: + 'There are the following accessibility errors from axe accessibility violations:', + errorMessage: axeViolationsError, + }); + } + if (hasTestFailure) { + testRepairAttempts++; + progress.log( + rootPromptDef, + 'test', + `Trying to repair test failures (attempt #${attemptId})...`, + ); + + errors.push({ + errorContext: 'Application tests failed. Attempt to fix them. 
Test output was:', + errorMessage: lastAttempt.testResult!.output, + }); + } const attempt = await repairAndBuild( evalID, @@ -161,28 +206,28 @@ export async function attemptBuild( rootPromptDef, directory, lastAttempt.outputFiles, - axeViolationsError, - 'There are the following accessibility errors from axe accessibility violations:', + errors, contextFiles, abortSignal, workerConcurrencyQueue, - axeRepairAttempts + repairAttempts, + attemptId, progress, + 'test', ); let hasBuildFailure = attempt.buildResult.status !== BuildResultStatus.SUCCESS; - attempt.buildFailedDuringA11yRepair = hasBuildFailure; + attempt.buildFailedDuringTestRepair = hasBuildFailure; attemptDetails.push(attempt); lastAttempt = attempt; + // If we somehow introduced build errors via the repair loop, we abort + // further repairs and capture the failed build. This is useful insight + // as LLMs seem to regress when asked to repair violations. + if (hasBuildFailure) { + break; + } - // If we somehow introduced build errors via the Axe repair loop, we abort - // further a11y repairs and capture the failed build. This is useful insight - // as LLMs seem to regress when asked to repair a11y violations. - if (hasBuildFailure) break; - - // Re-run serving & tests after Axe repair. - // This allows us to check if we fixed the violations. - attempt.serveTestingResult = await serveAndTestApp( + // Re-run serving & tests after repair. + lastAttempt.serveTestingResult = await serveAndTestApp( evalID, gateway, directory, @@ -196,10 +241,27 @@ export async function attemptBuild( enableAutoCsp, userJourneyAgentTaskInput, ); + const testResult = await runTest( + evalID, + gateway, + directory, + env, + rootPromptDef, + abortSignal, + workerConcurrencyQueue, + progress, + ); + + if (testResult !== null) { + lastAttempt.testResult = testResult; + } - if (attempt.serveTestingResult.axeViolations?.length === 0) { + if (hasAxeFailure && lastAttempt.serveTestingResult.axeViolations?.length === 0) { progress.log(rootPromptDef, 'success', `Successfully fixed all Axe accessibility violations`); } + if (hasTestFailure && lastAttempt.testResult?.passed) { + progress.log(rootPromptDef, 'success', `Successfully fixed all test failures`); + } } return { @@ -207,6 +269,8 @@ export async function attemptBuild( serveTestingResult: lastAttempt.serveTestingResult, outputFiles: lastAttempt.outputFiles, repairAttempts, - axeRepairAttempts, + axeRepairAttempts: axeRepairAttempts, + testResult: lastAttempt.testResult, + testRepairAttempts: testRepairAttempts, }; } diff --git a/runner/orchestration/codegen.ts b/runner/orchestration/codegen.ts index 0ff1097..7cfed2b 100644 --- a/runner/orchestration/codegen.ts +++ b/runner/orchestration/codegen.ts @@ -8,10 +8,10 @@ import { } from '../shared-interfaces.js'; import {LlmGenerateFilesContext, LlmRunner, PromptDataMessage} from '../codegen/llm-runner.js'; import {Environment} from '../configuration/environment.js'; -import {getPossiblePackageManagers} from '../configuration/environment-config.js'; import {ProgressLogger} from '../progress/progress-logger.js'; import {EvalID, Gateway} from './gateway.js'; import {LocalEnvironment} from '../configuration/environment-local.js'; +import {getPossiblePackageManagers} from '../configuration/package-managers.js'; /** * Generates code using the configured AI model based on the provided prompt. 
@@ -94,18 +94,17 @@ export async function repairCodeWithAI( promptDef: RootPromptDefinition, directory: string, appFiles: LlmResponseFile[], - errorMessage: string, - errorContext: string, + errors: Array<{errorContext: string; errorMessage: string}>, contextFiles: LlmContextFile[], abortSignal: AbortSignal, progress: ProgressLogger, + repairType: 'build' | 'test', ): Promise<LlmResponse> { const repairSystemInstructions = env.systemPromptRepair(); const repairPrompt = [ - errorContext, - '```', - errorMessage, - '```', + ...errors.map(({errorContext, errorMessage}) => + [errorContext, '```', errorMessage, '```'].join('\n'), + ), '', 'In the following source code:', ...appFiles.map(file => `${file.filePath}:\n\`\`\`\n${file.code}\`\`\`\n\n`), @@ -121,13 +120,13 @@ possiblePackageManagers: getPossiblePackageManagers().slice(), }; - progress.log(promptDef, 'codegen', 'Repairing code with AI'); + progress.log(promptDef, 'codegen', `Repairing ${repairType} failures with AI`); - const response = await gateway.repairBuild( + const response = await gateway.repairCode( evalID, context, model, - errorMessage, + errors.map(ec => ec.errorMessage).join('\n'), appFiles, contextFiles, abortSignal,
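To make the new errors-array shape concrete, here is a sketch of how one entry renders into the repair prompt, mirroring the map/join above (the error text is illustrative):

```ts
// Sketch: rendering of a single repair error into prompt text.
const errors = [
  {
    errorContext: 'There are the following build errors:',
    errorMessage: "error TS2304: Cannot find name 'foo'.",
  },
];
const rendered = errors
  .map(({errorContext, errorMessage}) => [errorContext, '```', errorMessage, '```'].join('\n'))
  .join('\n');
// rendered ===
// "There are the following build errors:\n```\nerror TS2304: Cannot find name 'foo'.\n```"
```

diff --git a/runner/orchestration/gateway.ts b/runner/orchestration/gateway.ts index 7e2bf01..f761366 100644 --- a/runner/orchestration/gateway.ts +++ b/runner/orchestration/gateway.ts @@ -7,6 +7,7 @@ import { LlmResponse, LlmResponseFile, RootPromptDefinition, + TestExecutionResult, } from '../shared-interfaces.js'; import {BuildResult} from '../workers/builder/builder-types.js'; @@ -25,7 +26,7 @@ export interface Gateway { abortSignal: AbortSignal, ): Promise; - repairBuild( + repairCode( id: EvalID, requestCtx: LlmGenerateFilesContext, model: string, @@ -47,6 +48,18 @@ progress: ProgressLogger, ): Promise; + tryTest( + id: EvalID, + env: Env, + appDirectoryPath: string, + rootPromptDef: RootPromptDefinition, + workerConcurrencyQueue: PQueue, + abortSignal: AbortSignal, + progress: ProgressLogger, + ): Promise<TestExecutionResult | null>; + + shouldRetryFailedTestExecution(evalID: EvalID): boolean; + serveBuild( id: EvalID, env: Env, diff --git a/runner/orchestration/gateways/local_gateway.ts b/runner/orchestration/gateways/local_gateway.ts index 75cd970..69cbcc8 100644 --- a/runner/orchestration/gateways/local_gateway.ts +++ b/runner/orchestration/gateways/local_gateway.ts @@ -10,6 +10,7 @@ import { LlmContextFile, LlmResponse, LlmResponseFile, + TestExecutionResult, } from '../../shared-interfaces.js'; import {generateCodeWithAI} from '../codegen.js'; import {EvalID, Gateway} from '../gateway.js'; @@ -19,6 +20,9 @@ import {ProgressLogger} from '../../progress/progress-logger.js'; import {serveApp} from '../../workers/serve-testing/serve-app.js'; import {LocalEnvironment} from '../../configuration/environment-local.js'; import PQueue from 'p-queue'; +import {executeCommand} from '../../utils/exec.js'; +import {callWithTimeout} from '../../utils/timeout.js'; +import {cleanupBuildMessage} from '../../workers/builder/worker.js'; let uniqueIDs = 0; @@ -39,7 +43,7 @@ export class LocalGateway implements Gateway { return await generateCodeWithAI(this.llm, model, requestCtx, contextFiles, abortSignal); } - async repairBuild( + async repairCode( _id: EvalID, requestCtx: LlmGenerateFilesContext, model: string, @@ -88,6 +92,45 @@ + async tryTest( + _id: EvalID, + env: LocalEnvironment, + appDirectoryPath: string, + rootPromptDef: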
RootPromptDefinition, + workerConcurrencyQueue: PQueue, + abortSignal: AbortSignal, + progress: ProgressLogger, + ): Promise<TestExecutionResult | null> { + if (!env.testCommand) { + return Promise.resolve(null); + } + + let output: string; + let passed: boolean; + + try { + // Run the test command inside the temporary project directory + const stdout = await callWithTimeout( + `Testing ${rootPromptDef.name}`, + timeoutAbort => + executeCommand(env.testCommand!, appDirectoryPath, undefined, { + abortSignal: AbortSignal.any([abortSignal, timeoutAbort]), + }), + 4, // 4min. This is a safety boundary; lots of parallelism can slow tests down. + ); + output = stdout; + passed = true; + } catch (error: any) { + output = error.message; + passed = false; + } + + return { + passed, + output: cleanupBuildMessage(output), + } satisfies TestExecutionResult; + } + async serveBuild( _id: EvalID, env: LocalEnvironment, @@ -109,5 +152,9 @@ return this.llm.hasBuiltInRepairLoop === false; } + shouldRetryFailedTestExecution(): boolean { + return this.llm.hasBuiltInRepairLoop === false; + } + async finalizeEval(_id: EvalID): Promise<void> {} }
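A brief sketch of how callers can interpret the three possible outcomes of `tryTest` (surrounding variables assumed in scope; for illustration only):

```ts
// Sketch: the three outcomes of Gateway.tryTest().
async function reportTestOutcome(): Promise<void> {
  const result = await gateway.tryTest(id, env, dir, promptDef, queue, signal, progress);
  if (result === null) {
    // No `testCommand` configured: tests were skipped entirely.
  } else if (result.passed) {
    // The command exited with code 0; `result.output` is the captured stdout.
  } else {
    // Non-zero exit code or the 4-minute timeout; `result.output` is the
    // cleaned-up error output that feeds the test-repair prompt.
  }
}
```

diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts index 95b12c4..45d7ce8 100644 --- a/runner/orchestration/generate.ts +++ b/runner/orchestration/generate.ts @@ -29,17 +29,17 @@ import { RunDetails, RunInfo, RunSummary, + TestExecutionResult, Usage, } from '../shared-interfaces.js'; import {BrowserAgentTaskInput} from '../testing/browser-agent/models.js'; import {callWithTimeout} from '../utils/timeout.js'; -import {attemptBuild} from './build-serve-loop.js'; +import {attemptBuildAndTest} from './build-serve-loop.js'; import {createLlmResponseTokenUsageMessage} from './codegen.js'; import {generateUserJourneysForApp} from './user-journeys.js'; import {resolveContextFiles, setupProjectStructure, writeResponseFiles} from './file-system.js'; import {GenkitRunner} from '../codegen/genkit/genkit-runner.js'; import {getEnvironmentByPath} from '../configuration/environment-resolution.js'; -import {getPossiblePackageManagers} from '../configuration/environment-config.js'; import {ProgressLogger} from '../progress/progress-logger.js'; import {TextProgressLogger} from '../progress/text-progress-logger.js'; import {logReportHeader} from '../reporting/report-logging.js'; @@ -51,6 +51,7 @@ import {EvalID, Gateway} from './gateway.js'; import {LocalEnvironment} from '../configuration/environment-local.js'; import {getRunnerByName, RunnerName} from '../codegen/runner-creation.js'; import {summarizeReportWithAI} from '../reporting/report-ai-summary.js'; +import {getPossiblePackageManagers} from '../configuration/package-managers.js'; /** * Orchestrates the entire assessment process for each prompt defined in the `prompts` array. @@ -59,7 +60,8 @@ import {summarizeReportWithAI} from '../reporting/report-ai-summary.js'; * 1. Makes a request to Gemini to generate code. * 2. Attempts to build it in a template Angular project. * 3. If the build fails, it makes a number of "fix it" Gemini requests. - * 4. Runs other validations and computes a score for generated output. + * 4. If configured, runs unit tests and attempts to repair test failures. + * 5. Runs other validations and computes a score for generated output. * * @returns A Promise that resolves to an array of AssessmentResult objects, * each containing the prompt, generated code, and final validation status.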
@@ -84,7 +86,7 @@ export async function generateCodeAndAssess(options: { enableAutoCsp?: boolean; logging?: 'text-only' | 'dynamic'; autoraterModel?: string; - a11yRepairAttempts?: number; + testRepairAttempts?: number; }): Promise { const env = await getEnvironmentByPath(options.environmentConfigPath, options.runner); const ratingLlm = await getRunnerByName('genkit'); @@ -179,7 +181,7 @@ export async function generateCodeAndAssess(options: { workerConcurrencyQueue, progress, options.autoraterModel || DEFAULT_AUTORATER_MODEL_NAME, - options.a11yRepairAttempts ?? 0, + options.testRepairAttempts ?? 0, ), // 10min max per app evaluation. We just want to make sure it never gets stuck. 10, @@ -309,7 +311,7 @@ async function startEvaluationTask( workerConcurrencyQueue: PQueue, progress: ProgressLogger, autoraterModel: string, - a11yRepairAttempts: number, + testRepairAttempts: number, ): Promise { // Set up the project structure once for the root project. const {directory, cleanup} = await setupProjectStructure( @@ -417,7 +419,7 @@ async function startEvaluationTask( // Try to build the files in the root prompt directory. // This will also attempt to fix issues with the generated code. - const attempt = await attemptBuild( + const attempt = await attemptBuildAndTest( evalID, gateway, model, @@ -434,7 +436,7 @@ async function startEvaluationTask( skipAxeTesting, enableAutoCsp, userJourneyAgentTaskInput, - a11yRepairAttempts, + testRepairAttempts, ); if (!attempt) { @@ -455,6 +457,8 @@ async function startEvaluationTask( abortSignal, progress, autoraterModel, + attempt.testResult ?? null, + attempt.testRepairAttempts, ); results.push({ @@ -472,6 +476,8 @@ async function startEvaluationTask( userJourneys: userJourneys, axeRepairAttempts: attempt.axeRepairAttempts, toolLogs, + testResult: attempt.testResult ?? null, + testRepairAttempts: attempt.testRepairAttempts, } satisfies AssessmentResult); } diff --git a/runner/orchestration/build-repair.ts b/runner/orchestration/repair.ts similarity index 95% rename from runner/orchestration/build-repair.ts rename to runner/orchestration/repair.ts index b275fa1..8f6e736 100644 --- a/runner/orchestration/build-repair.ts +++ b/runner/orchestration/repair.ts @@ -1,4 +1,6 @@ +import {Environment} from '../configuration/environment.js'; import PQueue from 'p-queue'; +import {ProgressLogger} from '../progress/progress-logger.js'; import { AttemptDetails, LlmContextFile, @@ -6,12 +8,10 @@ import { LlmResponseFile, RootPromptDefinition, } from '../shared-interfaces.js'; -import {Environment} from '../configuration/environment.js'; -import {repairCodeWithAI} from './codegen.js'; -import {writeResponseFiles} from './file-system.js'; import {runBuild} from './build-worker.js'; -import {ProgressLogger} from '../progress/progress-logger.js'; +import {writeResponseFiles} from './file-system.js'; import {EvalID, Gateway} from './gateway.js'; +import {repairCodeWithAI} from './codegen.js'; /** * Calls the LLM to repair code, handles the response, and attempts to build the project again. @@ -23,12 +23,11 @@ import {EvalID, Gateway} from './gateway.js'; * @param directory The working directory. * @param finalOutputFiles The list of output files to be modified. * @param errorMessage The error message from the failed build. - * @param errorContext Additional context for the error. + * @param errors Additional context for the error. * @param contextFiles A list of context files for the LLM. * @param abortSignal An AbortSignal to cancel the operation. 
* @param workerConcurrencyQueue The queue for managing worker concurrency. * @param attempts The current attempt number. - * @param repairType The type of repair being performed. * @returns A promise that resolves to the new BuildResult. */ export async function repairAndBuild( @@ -39,13 +38,13 @@ export async function repairAndBuild( rootPromptDef: RootPromptDefinition, directory: string, previousAttemptFiles: LlmResponseFile[], - errorMessage: string, - errorContext: string, + errors: Array<{errorContext: string; errorMessage: string}>, contextFiles: LlmContextFile[], abortSignal: AbortSignal, workerConcurrencyQueue: PQueue, attempts: number, progress: ProgressLogger, + repairType: 'build' | 'test', ): Promise { const repairResponse = await repairCodeWithAI( evalID, @@ -55,11 +54,11 @@ export async function repairAndBuild( rootPromptDef, directory, previousAttemptFiles, - errorMessage, - errorContext, + errors, contextFiles, abortSignal, progress, + repairType, ); return await handleRepairResponse( @@ -77,6 +76,27 @@ export async function repairAndBuild( ); } +/** + * Merges a set of new or updated files from a repair attempt into the + * current set of files. + * @param repairOutputFiles The array of new or updated files to merge. + * @param finalFiles The array of files to be updated. + */ +function mergeRepairFiles(repairOutputFiles: LlmResponseFile[], finalFiles: LlmResponseFile[]) { + // Merge the repair response into the original files. Otherwise we may end up dropping + // files that were valid in the initial response and the LLM decided not to touch, because + // they're still valid. + for (const file of repairOutputFiles) { + const existingFile = finalFiles.find(f => f.filePath === file.filePath); + + if (existingFile) { + existingFile.code = file.code; + } else { + finalFiles.push(file); + } + } +} + /** * Processes an LLM repair response by merging the suggested file changes, * writing them to disk, rebuilding the application, and logging the outcome. @@ -93,7 +113,7 @@ async function handleRepairResponse( abortSignal: AbortSignal, attempts: number, progress: ProgressLogger, -) { +): Promise { if (!repairResponse.success) { progress.log( rootPromptDef, @@ -104,7 +124,6 @@ async function handleRepairResponse( // Stop trying to repair if AI can't suggest a fix (API request fails) throw new Error(`Repair request failed: ${repairResponse.errors.join('\n')}`); } - // Clone the previous files because `mergeRepairFiles` mutates the attempt files. // We don't want to change files of a previous attempt. const newAttemptFiles = previousAttemptFiles.map(f => ({...f})); @@ -132,24 +151,3 @@ async function handleRepairResponse( attempt: attempts, }; } - -/** - * Merges a set of new or updated files from a repair attempt into the - * current set of files. - * @param repairOutputFiles The array of new or updated files to merge. - * @param finalFiles The array of files to be updated. - */ -function mergeRepairFiles(repairOutputFiles: LlmResponseFile[], finalFiles: LlmResponseFile[]) { - // Merge the repair response into the original files. Otherwise we may end up dropping - // files that were valid in the initial response and the LLM decided not to touch, because - // they're still valid. 
- for (const file of repairOutputFiles) { - const existingFile = finalFiles.find(f => f.filePath === file.filePath); - - if (existingFile) { - existingFile.code = file.code; - } else { - finalFiles.push(file); - } - } -} diff --git a/runner/orchestration/test-worker.ts b/runner/orchestration/test-worker.ts new file mode 100644 index 0000000..53b4fe6 --- /dev/null +++ b/runner/orchestration/test-worker.ts @@ -0,0 +1,44 @@ +import PQueue from 'p-queue'; +import {RootPromptDefinition, TestExecutionResult} from '../shared-interfaces.js'; +import {ProgressLogger} from '../progress/progress-logger.js'; +import {EvalID, Gateway} from './gateway.js'; +import {Environment} from '../configuration/environment.js'; + +export async function runTest( + evalID: EvalID, + gateway: Gateway, + appDirectoryPath: string, + env: Environment, + rootPromptDef: RootPromptDefinition, + abortSignal: AbortSignal, + workerConcurrencyQueue: PQueue, + progress: ProgressLogger, +): Promise { + progress.log(rootPromptDef, 'test', `Running tests`); + + try { + const result = await gateway.tryTest( + evalID, + env, + appDirectoryPath, + rootPromptDef, + workerConcurrencyQueue, + abortSignal, + progress, + ); + if (result === null) { + return result; + } + + if (result.passed) { + progress.log(rootPromptDef, 'success', 'Tests have passed'); + } else { + progress.log(rootPromptDef, 'error', 'Tests have failed'); + } + + return result; + } catch (err) { + progress.log(rootPromptDef, 'error', `Error when executing tests`, err + ''); + throw err; + } +} diff --git a/runner/progress/dynamic-progress-logger.ts b/runner/progress/dynamic-progress-logger.ts index 949cf96..0e68632 100644 --- a/runner/progress/dynamic-progress-logger.ts +++ b/runner/progress/dynamic-progress-logger.ts @@ -148,6 +148,7 @@ export class DynamicProgressLogger implements ProgressLogger { switch (type) { case 'success': case 'serve-testing': + case 'test': case 'build': return chalk.green; case 'error': diff --git a/runner/progress/progress-logger.ts b/runner/progress/progress-logger.ts index c888aba..b029aa6 100644 --- a/runner/progress/progress-logger.ts +++ b/runner/progress/progress-logger.ts @@ -2,7 +2,14 @@ import {greenCheckmark, redX} from '../reporting/format.js'; import {AssessmentResult, RootPromptDefinition} from '../shared-interfaces.js'; /** Possible progress event types. */ -export type ProgressType = 'codegen' | 'build' | 'serve-testing' | 'success' | 'error' | 'eval'; +export type ProgressType = + | 'codegen' + | 'build' + | 'test' + | 'serve-testing' + | 'success' + | 'error' + | 'eval'; /** Maps a ProgressType to an icon that can represent it. */ export function progressTypeToIcon(type: ProgressType): string { @@ -12,6 +19,8 @@ export function progressTypeToIcon(type: ProgressType): string { return '๐Ÿค–'; case 'build': return '๐Ÿ”จ'; + case 'test': + return '๐Ÿงช'; case 'serve-testing': return '๐ŸŒŠ'; case 'success': diff --git a/runner/ratings/built-in-ratings/successful-tests-rating.ts b/runner/ratings/built-in-ratings/successful-tests-rating.ts new file mode 100644 index 0000000..2941fd3 --- /dev/null +++ b/runner/ratings/built-in-ratings/successful-tests-rating.ts @@ -0,0 +1,28 @@ +import {PerBuildRating, RatingKind, RatingCategory, RatingState} from '../rating-types.js'; + +/** Rating which verifies that unit tests pass successfully. 
*/ +export const successfulTestsRating: PerBuildRating = { + name: 'Tests pass successfully', + description: 'Ensures tests run and pass without errors.', + id: 'common-successful-tests', + kind: RatingKind.PER_BUILD, + category: RatingCategory.MEDIUM_IMPACT, + scoreReduction: '30%', + // Reduce the amount of points in case we've had test repair attempts. + rate: ({testResult, testRepairAttempts}) => { + // If no test results are available, skip this rating + if (!testResult) { + return { + state: RatingState.SKIPPED, + message: 'Unit testing not configured.', + }; + } + + return { + state: RatingState.EXECUTED, + coefficient: testResult.passed + ? 1 / ((testRepairAttempts || 0) + 1) // Reduce score based on repair attempts + : 0, // No points if tests failed + }; + }, +}; diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts index 99d0874..c0500ec 100644 --- a/runner/ratings/rate-code.ts +++ b/runner/ratings/rate-code.ts @@ -8,6 +8,7 @@ import { IndividualAssessmentState, PromptDefinition, AssessmentCategory, + TestExecutionResult, } from '../shared-interfaces.js'; import { RatingState, @@ -56,6 +57,8 @@ export async function rateGeneratedCode( abortSignal: AbortSignal, progress: ProgressLogger, autoraterModel: string, + testResult: TestExecutionResult | null, + testRepairAttempts: number, ): Promise { let categorizedFiles: CategorizedFiles | null = null; let totalPoints = 0; @@ -93,6 +96,8 @@ export async function rateGeneratedCode( buildResult, serveTestingResult, repairAttempts, + testResult, + testRepairAttempts, outputFiles.length, axeRepairAttempts, ratingsResult, @@ -173,6 +178,8 @@ function runPerBuildRating( buildResult: BuildResult, serveResult: ServeTestingResult | null, repairAttempts: number, + testResult: TestExecutionResult | null, + testRepairAttempts: number, generatedFileCount: number, axeRepairAttempts: number, ratingsResult: RatingsResult, @@ -184,6 +191,8 @@ function runPerBuildRating( generatedFileCount, axeRepairAttempts, ratingsResult, + testResult, + testRepairAttempts, }); // If the rating was skipped (e.g., Axe test wasn't run), create a skipped assessment. 
diff --git a/runner/ratings/rating-types.ts b/runner/ratings/rating-types.ts index fceb104..6dcbf1c 100644 --- a/runner/ratings/rating-types.ts +++ b/runner/ratings/rating-types.ts @@ -5,6 +5,7 @@ import type { LlmResponseFile, PromptDefinition, SkippedIndividualAssessment, + TestExecutionResult, Usage, } from '../shared-interfaces.js'; import {Environment} from '../configuration/environment.js'; @@ -64,6 +65,8 @@ const perBuildRatingSchema = z buildResult: z.custom(), serveResult: z.custom(), repairAttempts: z.number(), + testResult: z.custom<TestExecutionResult | null>(), + testRepairAttempts: z.number(), axeRepairAttempts: z.number(), generatedFileCount: z.number(), ratingsResult: z.record(z.custom()), diff --git a/runner/ratings/stats.ts b/runner/ratings/stats.ts index 7d94753..a97e927 100644 --- a/runner/ratings/stats.ts +++ b/runner/ratings/stats.ts @@ -25,6 +25,10 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag let successfulInitialBuilds = 0; let successfulBuildsAfterRepair = 0; let failedBuilds = 0; + let successfulInitialTests = 0; + let successfulTestsAfterRepair = 0; + let failedTests = 0; + let noTestsRun = 0; let runtimeStats: RuntimeStats | undefined; let accessibilityStats: | { @@ -59,6 +63,20 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag } } + // Calculate test statistics + if (result.testResult) { + if (result.testResult.passed) { + if ((result.testRepairAttempts || 0) === 0) { + successfulInitialTests++; + } else { + successfulTestsAfterRepair++; + } + } else { + failedTests++; + } + } else { + noTestsRun++; + } if (result.finalAttempt.serveTestingResult?.runtimeErrors != undefined) { runtimeStats ??= {appsWithErrors: 0, appsWithoutErrors: 0}; if (result.finalAttempt.serveTestingResult.runtimeErrors.trim() != '') { @@ -124,6 +142,12 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag failedBuilds, errorDistribution: Object.keys(errorDistribution).length > 0 ? errorDistribution : undefined, }, + tests: { + successfulInitialTests, + successfulTestsAfterRepair, + failedTests, + noTestsRun, + }, buckets, runtime: runtimeStats ? { diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts index a43cb9a..6ca6251 100644 --- a/runner/shared-interfaces.ts +++ b/runner/shared-interfaces.ts @@ -221,8 +221,12 @@ export interface AttemptDetails { // Note: May not be set in older reports. reasoning?: string; - /** Whether the build failed during an accessibility repair attempt. */ - buildFailedDuringA11yRepair?: boolean; + /** Whether the build failed during a test repair attempt (a11y or unit). */ + buildFailedDuringTestRepair?: boolean; + /** Result of running tests for this attempt. */ + testResult?: TestExecutionResult; + /** The number of repair attempts made for tests in this attempt. */ + testRepairAttempts?: number; } /** Statistics related to the build process of the generated applications. */ @@ -237,6 +241,18 @@ export interface RunSummaryBuilds { errorDistribution?: Partial>; } +/** Statistics related to the test process of the generated applications. */ +export interface RunSummaryTests { + /** The number of applications that had tests run and all tests passed on the first attempt. */ + successfulInitialTests: number; + /** The number of applications that had tests run and all tests passed after repair attempts. */ + successfulTestsAfterRepair: number; + /** The number of applications that had tests run but tests failed even after repair attempts.
*/ + failedTests: number; + /** The number of applications that did not have tests run (no test command configured). */ + noTestsRun: number; +} + /** Buckets into which scores can be categorized. */ export interface ScoreBucket { /** Plain name of the bucket, e.g. "Good" */ @@ -271,6 +287,8 @@ export interface AggregatedRunStats { buckets: ScoreBucket[]; /** Runtime stats. Not present for reports that didn't request runtime error collection. */ runtime?: RuntimeStats; + /** Test stats. Not present for reports that didn't run tests or older reports. */ + tests?: RunSummaryTests; accessibility?: { appsWithErrors: number; @@ -449,6 +467,10 @@ export interface AssessmentResult { axeRepairAttempts: number; /** Tool requests logs (e.g. MCP requests and responses). */ toolLogs?: ToolLogEntry[]; + /** Result of running unit tests. */ + testResult: TestExecutionResult | null; + /** Number of repair attempts for tests. */ + testRepairAttempts?: number; } /** @@ -521,3 +543,9 @@ export interface RunGroup { /** Runner used to generate code for the runs in the group. */ runner?: CodegenRunnerInfo; } + +/** Result of running tests. */ +export interface TestExecutionResult { + passed: boolean; + output: string; +}
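To tie the new pieces together, a standalone worked sketch of how the `common-successful-tests` rating's coefficient (the `1 / (testRepairAttempts + 1)` formula in successful-tests-rating.ts above) scales scores:

```ts
// Standalone sketch of the scoring behavior of `common-successful-tests`.
interface TestExecutionResult {
  passed: boolean;
  output: string;
}

function testRatingCoefficient(result: TestExecutionResult, repairAttempts: number): number {
  return result.passed ? 1 / (repairAttempts + 1) : 0;
}

testRatingCoefficient({passed: true, output: ''}, 0); // 1.0 — passed initially
testRatingCoefficient({passed: true, output: ''}, 1); // 0.5 — passed after one repair
testRatingCoefficient({passed: false, output: 'FAIL'}, 2); // 0 — failed even after repairs
```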