angular · atscott · Sep 29, 2025 · devversion · Oct 6, 2025
@@ -179,3 +179,8 @@ Defaults to `<package manager> run build`.
 
 Command used to start a local dev server as a part of the evaluation.
 Defaults to `<package manager> run start --port 0`.
+
+### `testCommand`
+
+Command used to run tests against the generated code. If this property is not provided, tests will not be run. The command should exit with code 0 on success and a non-zero exit code on failure. The output from the command (both `stdout` and `stderr`) is captured and used for repair attempts if the tests fail. The test command will time out after 4 minutes.
+
@@ -73,6 +73,20 @@ <h3 class="chart-title">
             <stacked-bar-chart [data]="buildsAsGraphData(overview.stats.builds)" [compact]="true" />
           </div>
         </div>
+        @if (overview.stats.tests) {
+          <div class="chart-container test-results-details">
+            <h3 class="chart-title">
+              <span class="material-symbols-outlined"> quiz </span>
+              <span>Tests</span>
+            </h3>
+            <div class="summary-card-item">
+              <stacked-bar-chart
+                [data]="testsAsGraphData(overview.stats.tests)"
+                [compact]="true"
+              />
+            </div>
+          </div>
+        }
         @if (overview.stats.runtime) {
           <div class="chart-container">
             <h3 class="chart-title">
@@ -281,9 +295,19 @@ <h2>Generated applications</h2>
                   <span class="status-badge error">Initial build failed</span>
                 }
 
-                @if (hasBuildFailureDuringA11yRepair(result)) {
-                  <span class="status-badge error">Build failed after a11y repair</span>
+                @if (hasBuildFailureDuringTestRepair(result)) {
+                  <span class="status-badge error">Build failed after a11y/test repair</span>
                 }
+                <!-- Test status badges -->
+                @if (finalAttempt.testResult) {
+                  @if (finalAttempt.testResult.passed) {
+                    @if ((result.testRepairAttempts || 0) > 0) {
+                      <span class="status-badge warning">Tests passed after repair</span>
+                    }
+                  } @else {
+                    <span class="status-badge error">Tests failed</span>
+                  }
+                }
               </div>
             </div>
           </expansion-panel-header>
@@ -355,12 +379,36 @@ <h5>
                 </div>
               </div>
 
+              @if (result.testResult) {
+                <div class="app-details-section">
+                  <h4>Test Results</h4>
+                  <div class="test-summary">
+                    @if (result.testResult.passed) {
+                      <span class="status-text success">✔ Tests passed</span>
+                      @if ((result.testRepairAttempts || 0) > 0) {
+                        <span class="status-text">&nbsp;after {{ result.testRepairAttempts }} repair attempt(s)</span>
+                      }
+                    } @else {
+                      <span class="status-text error">✘ Tests failed</span>
+                    }
+                  </div>
+
+                  @if (result.testResult.output && !result.testResult.passed) {
+                    <details class="test-output-button">
+                      <summary class="neutral-button">See Test Output</summary>
+                      <pre class="callout neutral code">{{ result.testResult.output }}</pre>
+                    </details>
+                  }
+                </div>
+              }
+
               <div class="app-details-section">
                 <h4>Additional info</h4>
                 @for (attempt of result.attemptDetails; track attempt) {
                   @let isBuilt = attempt.buildResult.status === 'success';
                   @let axeViolations = attempt.serveTestingResult?.axeViolations;
                   @let hasAxeViolations = axeViolations && axeViolations.length > 0;
+                  @let testsFailed = attempt.testResult?.passed === false;
 
                   <expansion-panel #expansionPanel>
                     <expansion-panel-header>
@@ -385,6 +433,15 @@ <h4>Additional info</h4>
                           >A11y</span
                         >
                       }
+
+                      @if (attempt.testResult) {
+                        <span
+                          class="status-badge"
+                          [class.error]="!attempt.testResult.passed"
+                          [class.success]="attempt.testResult.passed"
+                          >Tests</span
+                        >
+                      }
                     </expansion-panel-header>
 
                     @if (expansionPanel.opened()) {
@@ -421,6 +478,11 @@ <h4>A11y Violations</h4>
                         </pre>
                       }
 
+                      @if (testsFailed) {
+                        <h4>Failed Tests</h4>
+                        <pre class="callout neutral code">{{ attempt.testResult?.output }}</pre>
+                      }
+
                       <h4>Generated Code</h4>
 
                       @for (file of attempt.outputFiles; track file) {

@@ -25,6 +25,7 @@ import {
   LlmResponseFile,
   RunInfo,
   RunSummaryBuilds,
+  RunSummaryTests,
   RuntimeStats,
   ScoreBucket,
   SkippedIndividualAssessment,
@@ -271,6 +272,31 @@ export class ReportViewer {
     ];
   }
 
+  protected testsAsGraphData(tests: RunSummaryTests): StackedBarChartData {
+    return [ // One labelled, colour-coded chart segment per counter of the test summary.
+      {
+        label: 'Passed',
+        color: ScoreCssVariable.excellent,
+        value: tests.successfulInitialTests,
+      },
+      {
+        label: 'Passed after repair',
+        color: ScoreCssVariable.great,
+        value: tests.successfulTestsAfterRepair,
+      },
+      {
+        label: 'Failed',
+        color: ScoreCssVariable.poor,
+        value: tests.failedTests,
+      },
+      {
+        label: 'No tests run',
+        color: ScoreCssVariable.neutral,
+        value: tests.noTestsRun, // NOTE(review): presumably apps with no test result — confirm.
+      },
+    ];
+  }
+
   protected checksAsGraphData(buckets: ScoreBucket[]): StackedBarChartData {
     return buckets.map(b => ({
       label: b.nameWithLabels,
@@ -427,7 +453,7 @@ export class ReportViewer {
     return `wcs run --prompt=${result.promptDef.name} --env=<path to ${report.details.summary.environmentId} config>`;
   }
 
-  protected hasBuildFailureDuringA11yRepair(result: AssessmentResult): boolean {
-    return result.attemptDetails.some(attempt => attempt.buildFailedDuringA11yRepair);
+  protected hasBuildFailureDuringTestRepair(result: AssessmentResult): boolean {
+    return result.attemptDetails.some(attempt => attempt.buildFailedDuringTestRepair); // True when at least one attempt has the flag set.
   }
 }
@@ -1,8 +1,6 @@
 import z from 'zod';
 import {ratingSchema} from '../ratings/rating-types.js';
 import {MultiStepPrompt} from './multi-step-prompt.js';
-import {mcpServerOptionsSchema} from '../codegen/llm-runner.js';
-import {getPossiblePackageManagers} from './environment-config.js';
 
 export const baseEnvironmentConfigSchema = z.strictObject({
   /** Display name for the environment. */

@@ -15,11 +15,6 @@ const environmentConfigSchema = z.union([
  */
 export type EnvironmentConfig = z.infer<typeof environmentConfigSchema>;
 
-/** Package managers that are currently supported. */
-export function getPossiblePackageManagers() {
-  return ['npm', 'pnpm', 'yarn'] as const;
-}
-
 /** Asserts that the specified data is a valid environment config. */
 export function assertIsEnvironmentConfig(value: unknown): asserts value is EnvironmentConfig {
   const validationResult = environmentConfigSchema.safeParse(value);

@@ -3,7 +3,7 @@ import z from 'zod';
 import {LlmRunner, McpServerOptions, mcpServerOptionsSchema} from '../codegen/llm-runner.js';
 import {LocalGateway} from '../orchestration/gateways/local_gateway.js';
 import {BaseEnvironment} from './base-environment.js';
-import {EnvironmentConfig, getPossiblePackageManagers} from './environment-config.js';
+import {getPossiblePackageManagers} from './package-managers.js';
 import {baseEnvironmentConfigSchema} from './base-environment-config.js';
 
 export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({
@@ -28,6 +28,10 @@ export const localEnvironmentConfigSchema = baseEnvironmentConfigSchema.extend({
    * Defaults to `<package manager> run start --port 0`.
    */
   serveCommand: z.string().optional(),
+  /**
+   * Command to run when testing the generated code. When omitted, tests are not run.
+   */
+  testCommand: z.string().optional(),
   /**
    * Whether to skip installing dependencies when running evals in the environment.
    * Useful if you're managing dependencies yourself.
@@ -47,6 +51,8 @@ export class LocalEnvironment extends BaseEnvironment {
   readonly buildCommand: string;
   /** Command to run when starting a development server inside the app. */
   readonly serveCommand: string;
+  /** Command to run the tests inside the app, or `null` when no test command is configured. */
+  readonly testCommand: string | null;
   /**
    * Absolute path at which files specific to this environment are located. Will be merged in
    * with the files from the `projectTemplatePath` to get the final project structure.
@@ -82,6 +88,7 @@ export class LocalEnvironment extends BaseEnvironment {
     this.installCommand = `${packageManager} install --silent`;
     this.buildCommand = config.buildCommand || `${packageManager} run build`;
     this.serveCommand = config.serveCommand || this.getDefaultServeCommand(packageManager);
+    this.testCommand = config.testCommand ?? null;
     this.projectTemplatePath = projectTemplatePath;
     this.sourceDirectory = sourceDirectory;
     this.mcpServerOptions = config.mcpServers || [];

@@ -0,0 +1,4 @@
+/** Returns the names of the currently supported package managers as a readonly tuple of literals. */
+export function getPossiblePackageManagers() {
+  return ['npm', 'pnpm', 'yarn'] as const;
+}
@@ -36,7 +36,7 @@ interface Options {
   enableUserJourneyTesting?: boolean;
   enableAutoCsp?: boolean;
   autoraterModel?: string;
-  a11yRepairAttempts?: number;
+  testRepairAttempts?: number;
   logging?: 'text-only' | 'dynamic';
 }
 
@@ -148,10 +148,11 @@ function builder(argv: Argv): Argv<Options> {
         default: DEFAULT_AUTORATER_MODEL_NAME,
         description: 'Model to use when automatically rating generated code',
       })
-      .option('a11y-repair-attempts', {
+      .option('test-repair-attempts', {
         type: 'number',
         default: 0,
-        description: 'Number of repair attempts for discovered a11y violations',
+        description:
+          'Number of repair attempts for discovered test failures (including a11y violations and ones from testCommand)',
       })
       .strict()
       .version(false)
@@ -196,7 +197,7 @@ async function handler(cliArgs: Arguments<Options>): Promise<void> {
       logging: cliArgs.logging,
       autoraterModel: cliArgs.autoraterModel,
       skipAiSummary: cliArgs.skipAiSummary,
-      a11yRepairAttempts: cliArgs.a11yRepairAttempts,
+      testRepairAttempts: cliArgs.testRepairAttempts,
     });
 
     logReportToConsole(runInfo);