fix: handle errors thrown during rating

crisbeto · crisbeto · commit 074eb08c6b3a · 2025-09-18T14:18:55.000+02:00
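Currently, if a rating throws an error, it interrupts the entire eval, because only LLM-based ratings were wrapped in a try/catch. These changes wrap the execution of every rating kind in a single try/catch, so that a rating that throws is marked as skipped instead.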
diff --git a/runner/ratings/built-in-ratings/valid-css-rating.ts b/runner/ratings/built-in-ratings/valid-css-rating.ts
@@ -15,7 +15,7 @@ export const validCssRating: PerFileRating = {
   kind: RatingKind.PER_FILE,
   id: 'common-valid-css',
   filter: PerFileRatingContentType.CSS,
-  rate: async (code, _filePath) => {
+  rate: async (code) => {
     const linterResult = await stylelint.lint({
       code: code,
       cwd: import.meta.dirname,
diff --git a/runner/ratings/rate-code.ts b/runner/ratings/rate-code.ts
@@ -84,19 +84,19 @@ export async function rateGeneratedCode(
   for (const current of currentPromptDef.ratings) {
     let result: IndividualAssessment | SkippedIndividualAssessment;
 
-    if (current.kind === RatingKind.PER_BUILD) {
-      result = runPerBuildRating(
-        current,
-        buildResult,
-        repairAttempts,
-        outputFiles.length,
-        axeRepairAttempts
-      );
-    } else if (current.kind === RatingKind.PER_FILE) {
-      categorizedFiles ??= splitFilesIntoCategories(outputFiles);
-      result = await runPerFileRating(current, categorizedFiles);
-    } else if (current.kind === RatingKind.LLM_BASED) {
-      try {
+    try {
+      if (current.kind === RatingKind.PER_BUILD) {
+        result = runPerBuildRating(
+          current,
+          buildResult,
+          repairAttempts,
+          outputFiles.length,
+          axeRepairAttempts
+        );
+      } else if (current.kind === RatingKind.PER_FILE) {
+        categorizedFiles ??= splitFilesIntoCategories(outputFiles);
+        result = await runPerFileRating(current, categorizedFiles);
+      } else if (current.kind === RatingKind.LLM_BASED) {
         result = await runLlmBasedRating(
           environment,
           current,
@@ -109,14 +109,14 @@ export async function rateGeneratedCode(
           axeRepairAttempts,
           abortSignal
         );
-      } catch (error) {
-        result = getSkippedAssessment(
-          current,
-          `Error during execution:\n${error}`
-        );
+      } else {
+        throw new UserFacingError(`Unsupported rating type ${current}`);
       }
-    } else {
-      throw new UserFacingError(`Unsupported rating type ${current}`);
+    } catch (error) {
+      result = getSkippedAssessment(
+        current,
+        `Error during execution:\n${error}`
+      );
     }
 
     if (result.state === IndividualAssessmentState.EXECUTED && result.usage) {