Skip to content

Commit 8293c67

Browse files
chore: generate a markdown brief for PR comments
1 parent 02cdd0c commit 8293c67

File tree

4 files changed

+91
-29
lines changed

4 files changed

+91
-29
lines changed

.github/workflows/accuracy-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,4 +49,4 @@ jobs:
4949
if: github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests'
5050
uses: marocchino/sticky-pull-request-comment@d2ad0de260ae8b0235ce059e63f2949ba9e05943 # v2
5151
with:
52-
path: .accuracy/test-summary.html
52+
path: .accuracy/test-brief.md

resources/test-summary-template.html

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -331,15 +331,15 @@ <h2>📈 Test Results Summary</h2>
331331
<div class="info-grid">
332332
<div class="info-item">
333333
<div class="info-label">Total Prompts Evaluated</div>
334-
<div class="info-value">{{totalTests}}</div>
334+
<div class="info-value">{{totalPrompts}}</div>
335335
</div>
336336
<div class="info-item">
337337
<div class="info-label">Models Tested</div>
338-
<div class="info-value">{{modelsCount}}</div>
338+
<div class="info-value">{{totalModels}}</div>
339339
</div>
340340
<div class="info-item">
341-
<div class="info-label">Evals with 0% Accuracy</div>
342-
<div class="info-value">{{testsWithZeroAccuracy}}</div>
341+
<div class="info-label">Responses with 0% Accuracy</div>
342+
<div class="info-value">{{responsesWithZeroAccuracy}}</div>
343343
</div>
344344
<div class="info-item">
345345
<div class="info-label">Average Accuracy</div>
@@ -368,12 +368,12 @@ <h2>🔄 Baseline Comparison</h2>
368368
<div class="info-value">{{baselineCreatedOn}}</div>
369369
</div>
370370
<div class="info-item">
371-
<div class="info-label">Evals Improved vs Baseline</div>
372-
<div class="info-value">{{evalsImproved}}</div>
371+
<div class="info-label">Responses Improved vs Baseline</div>
372+
<div class="info-value">{{responsesImproved}}</div>
373373
</div>
374374
<div class="info-item">
375-
<div class="info-label">Evals Regressed vs Baseline</div>
376-
<div class="info-value">{{evalsRegressed}}</div>
375+
<div class="info-label">Responses Regressed vs Baseline</div>
376+
<div class="info-value">{{responsesRegressed}}</div>
377377
</div>
378378
</div>
379379
</div>

scripts/accuracy/generate-test-summary.ts

Lines changed: 79 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@ import {
99
ModelResponse,
1010
} from "../../tests/accuracy/sdk/accuracy-result-storage/result-storage.js";
1111
import { getCommitSHA } from "../../tests/accuracy/sdk/git-info.js";
12-
import { HTML_TESTS_SUMMARY_FILE, HTML_TESTS_SUMMARY_TEMPLATE } from "../../tests/accuracy/sdk/constants.js";
12+
import {
13+
HTML_TEST_SUMMARY_FILE,
14+
HTML_TESTS_SUMMARY_TEMPLATE,
15+
MARKDOWN_TEST_BRIEF_FILE,
16+
} from "../../tests/accuracy/sdk/constants.js";
1317

1418
type ComparableAccuracyResult = Omit<AccuracyResult, "promptResults"> & {
1519
promptAndModelResponses: PromptAndModelResponse[];
@@ -109,15 +113,15 @@ function getTestSummary(comparableResult: ComparableAccuracyResult) {
109113
return {
110114
totalPrompts: new Set(responses.map((r) => r.prompt)).size,
111115
totalModels: new Set(responses.map((r) => `${r.provider} ${r.requestedModel}`)).size,
112-
testsWithZeroAccuracy: responses.filter((r) => r.toolCallingAccuracy === 0),
113-
testsWith75Accuracy: responses.filter((r) => r.toolCallingAccuracy === 0.75),
114-
testsWith100Accuracy: responses.filter((r) => r.toolCallingAccuracy === 100),
116+
responsesWithZeroAccuracy: responses.filter((r) => r.toolCallingAccuracy === 0),
117+
responsesWith75Accuracy: responses.filter((r) => r.toolCallingAccuracy === 0.75),
118+
responsesWith100Accuracy: responses.filter((r) => r.toolCallingAccuracy === 1),
115119
averageAccuracy:
116120
responses.length > 0 ? responses.reduce((sum, r) => sum + r.toolCallingAccuracy, 0) / responses.length : 0,
117-
evalsImproved: responses.filter(
121+
responsesImproved: responses.filter(
118122
(r) => typeof r.baselineToolAccuracy === "number" && r.toolCallingAccuracy > r.baselineToolAccuracy
119123
).length,
120-
evalsRegressed: responses.filter(
124+
responsesRegressed: responses.filter(
121125
(r) => typeof r.baselineToolAccuracy === "number" && r.toolCallingAccuracy < r.baselineToolAccuracy
122126
).length,
123127
reportGeneratedOn: new Date().toLocaleString(),
@@ -172,22 +176,74 @@ async function generateHtmlReport(
172176
accuracyRunStatus: formatRunStatus(comparableResult.runStatus),
173177
reportGeneratedOn: testSummary.reportGeneratedOn,
174178
createdOn: testSummary.resultCreatedOn,
175-
totalTests: String(testSummary.totalPrompts),
176-
modelsCount: String(testSummary.totalModels),
177-
testsWithZeroAccuracy: String(testSummary.testsWithZeroAccuracy.length),
179+
totalPrompts: String(testSummary.totalPrompts),
180+
totalModels: String(testSummary.totalModels),
181+
responsesWithZeroAccuracy: String(testSummary.responsesWithZeroAccuracy.length),
178182
averageAccuracy: formatAccuracy(testSummary.averageAccuracy),
179183
baselineCommitSHA: baselineInfo?.commitSHA || "-",
180184
baselineAccuracyRunId: baselineInfo?.accuracyRunId || "-",
181185
baselineAccuracyRunStatus: baselineInfo?.accuracyRunStatus
182186
? formatRunStatus(baselineInfo?.accuracyRunStatus)
183187
: "-",
184188
baselineCreatedOn: baselineInfo?.createdOn || "-",
185-
evalsImproved: baselineInfo ? String(testSummary.evalsImproved) : "-",
186-
evalsRegressed: baselineInfo ? String(testSummary.evalsRegressed) : "-",
189+
responsesImproved: baselineInfo ? String(testSummary.responsesImproved) : "-",
190+
responsesRegressed: baselineInfo ? String(testSummary.responsesRegressed) : "-",
187191
tableRows,
188192
});
189193
}
190194

195+
/**
 * Builds a short markdown brief of an accuracy run (the file this brief is
 * written to is the one the workflow hands to the sticky PR comment action).
 *
 * The brief contains a "Summary" table, an optional "Baseline Comparison"
 * table, a pointer to the full HTML report and the report generation time.
 *
 * @param comparableResult - The accuracy run being reported; its commit SHA,
 * run id and run status are printed in the summary table.
 * @param testSummary - Aggregated numbers produced by `getTestSummary`.
 * @param baselineInfo - Baseline run details, or `null` when there is no
 * baseline; the comparison section is emitted only when this is non-null.
 * @returns The markdown document as a single newline-joined string.
 */
function generateMarkdownBrief(
    comparableResult: ComparableAccuracyResult,
    testSummary: ReturnType<typeof getTestSummary>,
    baselineInfo: BaselineRunInfo | null
): string {
    const markdownTexts = [
        "# 📊 Accuracy Test Results",
        "## 📈 Summary",
        "| Metric | Value |",
        "|--------|-------|",
        `| **Commit SHA** | \`${comparableResult.commitSHA}\` |`,
        `| **Run ID** | \`${comparableResult.runId}\` |`,
        `| **Status** | ${comparableResult.runStatus} |`,
        `| **Total Prompts Evaluated** | ${testSummary.totalPrompts} |`,
        `| **Models Tested** | ${testSummary.totalModels} |`,
        `| **Average Accuracy** | ${formatAccuracy(testSummary.averageAccuracy)} |`,
        `| **Responses with 0% Accuracy** | ${testSummary.responsesWithZeroAccuracy.length} |`,
        `| **Responses with 75% Accuracy** | ${testSummary.responsesWith75Accuracy.length} |`,
        `| **Responses with 100% Accuracy** | ${testSummary.responsesWith100Accuracy.length} |`,
        "",
    ];

    if (baselineInfo) {
        markdownTexts.push(
            "## 📊 Baseline Comparison",
            // A markdown table is only recognised when a header row precedes the
            // delimiter row; without it the rows below render as plain text.
            "| Metric | Value |",
            "|--------|-------|",
            `| **Baseline Commit** | \`${baselineInfo.commitSHA}\` |`,
            `| **Baseline Run ID** | \`${baselineInfo.accuracyRunId}\` |`,
            `| **Baseline Run Status** | \`${baselineInfo.accuracyRunStatus}\` |`,
            `| **Responses Improved** | ${testSummary.responsesImproved} |`,
            `| **Responses Regressed** | ${testSummary.responsesRegressed} |`,
            ""
        );
    }

    // Link to the workflow run only when all three GitHub variables are set;
    // otherwise fall back to naming the local HTML report file.
    const { GITHUB_SERVER_URL, GITHUB_REPOSITORY, GITHUB_RUN_ID } = process.env;
    const githubRunUrl =
        GITHUB_SERVER_URL && GITHUB_REPOSITORY && GITHUB_RUN_ID
            ? `${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}`
            : null;

    const reportLinkText = githubRunUrl
        ? `📎 **[Download Full HTML Report](${githubRunUrl})** - Look for the \`accuracy-test-summary\` artifact for detailed results.`
        : `📎 **Full HTML Report**: \`${HTML_TEST_SUMMARY_FILE}\``;

    markdownTexts.push("---", reportLinkText, "", `*Report generated on: ${testSummary.reportGeneratedOn}*`);

    return markdownTexts.join("\n");
}
246+
191247
async function generateTestSummary() {
192248
const storage = getAccuracyResultStorage();
193249
try {
@@ -244,25 +300,29 @@ async function generateTestSummary() {
244300
),
245301
};
246302

303+
// Ensure that our writable path actually exists.
304+
await mkdir(path.dirname(HTML_TEST_SUMMARY_FILE), { recursive: true });
305+
247306
console.log(`\n📊 Generating test summary for accuracy run: ${accuracyRunId}\n`);
248307
const testSummary = getTestSummary(comparableAccuracyResult);
249-
const htmlReport = await generateHtmlReport(comparableAccuracyResult, testSummary, baselineInfo);
250308

251-
// Ensure that our writable path actually exist.
252-
await mkdir(path.dirname(HTML_TESTS_SUMMARY_FILE), { recursive: true });
253-
await writeFile(HTML_TESTS_SUMMARY_FILE, htmlReport, "utf8");
309+
const htmlReport = await generateHtmlReport(comparableAccuracyResult, testSummary, baselineInfo);
310+
await writeFile(HTML_TEST_SUMMARY_FILE, htmlReport, "utf8");
311+
console.log(`✅ HTML report generated: ${HTML_TEST_SUMMARY_FILE}`);
254312

255-
console.log(`✅ HTML report generated: ${HTML_TESTS_SUMMARY_FILE}`);
313+
const markdownBrief = generateMarkdownBrief(comparableAccuracyResult, testSummary, baselineInfo);
314+
await writeFile(MARKDOWN_TEST_BRIEF_FILE, markdownBrief, "utf8");
315+
console.log(`✅ Markdown brief generated: ${MARKDOWN_TEST_BRIEF_FILE}`);
256316

257317
console.log(`\n📈 Summary:`);
258318
console.log(` Total prompts evaluated: ${testSummary.totalPrompts}`);
259319
console.log(` Models tested: ${testSummary.totalModels}`);
260-
console.log(` Evals with 0% accuracy: ${testSummary.testsWithZeroAccuracy.length}`);
320+
console.log(` Responses with 0% accuracy: ${testSummary.responsesWithZeroAccuracy.length}`);
261321

262322
if (baselineCommit) {
263323
console.log(` Baseline commit: ${baselineCommit}`);
264-
console.log(` Evals improved vs baseline: ${testSummary.evalsImproved}`);
265-
console.log(` Evals regressed vs baseline: ${testSummary.evalsRegressed}`);
324+
console.log(` Responses improved vs baseline: ${testSummary.responsesImproved}`);
325+
console.log(` Responses regressed vs baseline: ${testSummary.responsesRegressed}`);
266326
}
267327
} catch (error) {
268328
console.error("Error generating test summary:", error);

tests/accuracy/sdk/constants.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ export const ACCURACY_RESULTS_DIR = path.join(GENERATED_ASSETS_DIR, "results");
1919

2020
export const LATEST_ACCURACY_RUN_NAME = "latest-run";
2121

22-
export const HTML_TESTS_SUMMARY_FILE = path.join(GENERATED_ASSETS_DIR, "test-summary.html");
22+
export const HTML_TEST_SUMMARY_FILE = path.join(GENERATED_ASSETS_DIR, "test-summary.html");
23+
24+
export const MARKDOWN_TEST_BRIEF_FILE = path.join(GENERATED_ASSETS_DIR, "test-brief.md");
2325

2426
export const HTML_TESTS_SUMMARY_TEMPLATE = path.join(RESOURCES_DIR, "test-summary-template.html");

0 commit comments

Comments
 (0)