Skip to content

Commit a686fc1

Browse files
committed
refactor: adjust grouping labels for more insightful rubric analysis
* Functionality is pretty broad; we need better insight into more focused rubrics. The individual checks could obviously be pulled directly, but adding the labels is trivial and useful in general.
* Adds more focused labels where useful.
* Removes the sufficient-code checks from the functionality rubric, as they are highly "impactful for scoring" but do not necessarily represent actual functionality. E.g., if there is always at least one file generated, it would significantly drag up the score because it's a 100% high-impact category. Maybe in the future these ratings need to be adjusted to actually represent "functionality" better: if the build is failing, they could return 0% success or just be "skipped".
1 parent b0ce651 commit a686fc1

9 files changed

+7
-9
lines changed

runner/ratings/built-in-ratings/code-quality-rating.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ export const codeQualityRating: LLMBasedRating = {
77
name: 'Code Quality (LLM-rated)',
88
description: `Rates the app's source code via LLM`,
99
category: RatingCategory.MEDIUM_IMPACT,
10-
groupingLabels: ['llm-judge'],
10+
groupingLabels: ['llm-judge', 'llm-rated-code-quality'],
1111
id: 'common-autorater-code-quality',
1212
scoreReduction: '30%',
1313
rate: async ctx => {

runner/ratings/built-in-ratings/no-runtime-errors-rating.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ export const noRuntimeExceptionsRating: PerBuildRating = {
77
description: "Ensures the app doesn't have runtime exceptions.",
88
kind: RatingKind.PER_BUILD,
99
category: RatingCategory.HIGH_IMPACT,
10-
groupingLabels: ['functionality', 'running-app-checks'],
10+
groupingLabels: ['functionality', 'runtime-errors', 'running-app-checks'],
1111
scoreReduction: '50%',
1212
id: 'common-no-runtime-errors',
1313
rate: ({buildResult, serveResult}) => ({

runner/ratings/built-in-ratings/successful-build-rating.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ export const successfulBuildRating: PerBuildRating = {
88
id: 'common-successful-build',
99
kind: RatingKind.PER_BUILD,
1010
category: RatingCategory.HIGH_IMPACT,
11-
groupingLabels: ['functionality'],
11+
groupingLabels: ['functionality', 'build-success'],
1212
scoreReduction: '50%',
1313
// Reduce the amount of points in case we've built the code with a few repair attempts.
1414
rate: ({buildResult, repairAttempts}) => ({

runner/ratings/built-in-ratings/successful-tests-rating.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ export const successfulTestsRating: PerBuildRating = {
77
id: 'common-successful-tests',
88
kind: RatingKind.PER_BUILD,
99
category: RatingCategory.MEDIUM_IMPACT,
10-
groupingLabels: ['functionality'],
10+
groupingLabels: ['functionality', 'project-tests'],
1111
scoreReduction: '30%',
1212
// Reduce the amount of points in case we've had test repair attempts.
1313
rate: ({testResult, testRepairAttempts}) => {

runner/ratings/built-in-ratings/sufficient-code-size-rating.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ export const sufficientCodeSizeRating: PerFileRating = {
1010
name: 'Sufficient Code Size (over 50b)',
1111
description: 'Ensures the generated code is not trivially small (e.g. < 50b).',
1212
category: RatingCategory.HIGH_IMPACT,
13-
groupingLabels: ['functionality'],
1413
id: 'common-generated-code-size',
1514
scoreReduction: '30%',
1615
kind: RatingKind.PER_FILE,

runner/ratings/built-in-ratings/sufficient-generated-files-rating.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ export const sufficientGeneratedFilesRating: PerBuildRating = {
55
name: 'Sufficient number of generated files',
66
description: 'Ensures that the LLM produced at least one file.',
77
category: RatingCategory.HIGH_IMPACT,
8-
groupingLabels: ['functionality'],
98
id: 'common-generated-file-count',
109
scoreReduction: '100%',
1110
kind: RatingKind.PER_BUILD,

runner/ratings/built-in-ratings/user-journeys-rating.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ export const userJourneysRating: PerBuildRating = {
1313
if (serveResult === null || serveResult.userJourneyAgentOutput === null) {
1414
return {
1515
state: RatingState.SKIPPED,
16-
message: 'Was not enabled for this run',
16+
message: 'Not enabled for this run.',
1717
};
1818
}
1919

runner/ratings/built-in-ratings/valid-css-rating.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ export const validCssRating: PerFileRating = {
1111
name: 'Valid CSS',
1212
description: 'Ensures that the generated CSS code is valid',
1313
category: RatingCategory.MEDIUM_IMPACT,
14-
groupingLabels: ['functionality', 'styling'],
14+
groupingLabels: ['functionality', 'styling', 'css-validity'],
1515
scoreReduction: '20%',
1616
kind: RatingKind.PER_FILE,
1717
id: 'common-valid-css',

runner/ratings/built-in-ratings/visual-appearance-rating.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ export const visualAppearanceRating: LLMBasedRating = {
99
name: 'UI & Visual appearance (LLM-Rated)',
1010
description: 'Rates the app based on its visuals (UI visuals and feature completeness).',
1111
category: RatingCategory.MEDIUM_IMPACT,
12-
groupingLabels: ['llm-judge', 'visual-appearance', 'running-app-checks'],
12+
groupingLabels: ['llm-judge', 'llm-rated-visual-appearance', 'running-app-checks'],
1313
scoreReduction: '30%',
1414
id: 'common-autorater-visuals',
1515
rate: async ctx => {

0 commit comments

Comments
 (0)