@@ -9,7 +9,11 @@ import {
99 ModelResponse ,
1010} from "../../tests/accuracy/sdk/accuracy-result-storage/result-storage.js" ;
1111import { getCommitSHA } from "../../tests/accuracy/sdk/git-info.js" ;
12- import { HTML_TESTS_SUMMARY_FILE , HTML_TESTS_SUMMARY_TEMPLATE } from "../../tests/accuracy/sdk/constants.js" ;
12+ import {
13+ HTML_TEST_SUMMARY_FILE ,
14+ HTML_TESTS_SUMMARY_TEMPLATE ,
15+ MARKDOWN_TEST_BRIEF_FILE ,
16+ } from "../../tests/accuracy/sdk/constants.js" ;
1317
1418type ComparableAccuracyResult = Omit < AccuracyResult , "promptResults" > & {
1519 promptAndModelResponses : PromptAndModelResponse [ ] ;
@@ -109,15 +113,15 @@ function getTestSummary(comparableResult: ComparableAccuracyResult) {
109113 return {
110114 totalPrompts : new Set ( responses . map ( ( r ) => r . prompt ) ) . size ,
111115 totalModels : new Set ( responses . map ( ( r ) => `${ r . provider } ${ r . requestedModel } ` ) ) . size ,
112- testsWithZeroAccuracy : responses . filter ( ( r ) => r . toolCallingAccuracy === 0 ) ,
113- testsWith75Accuracy : responses . filter ( ( r ) => r . toolCallingAccuracy === 0.75 ) ,
114- testsWith100Accuracy : responses . filter ( ( r ) => r . toolCallingAccuracy === 100 ) ,
116+ responsesWithZeroAccuracy : responses . filter ( ( r ) => r . toolCallingAccuracy === 0 ) ,
117+ responsesWith75Accuracy : responses . filter ( ( r ) => r . toolCallingAccuracy === 0.75 ) ,
118+ responsesWith100Accuracy : responses . filter ( ( r ) => r . toolCallingAccuracy === 1 ) ,
115119 averageAccuracy :
116120 responses . length > 0 ? responses . reduce ( ( sum , r ) => sum + r . toolCallingAccuracy , 0 ) / responses . length : 0 ,
117- evalsImproved : responses . filter (
121+ responsesImproved : responses . filter (
118122 ( r ) => typeof r . baselineToolAccuracy === "number" && r . toolCallingAccuracy > r . baselineToolAccuracy
119123 ) . length ,
120- evalsRegressed : responses . filter (
124+ responsesRegressed : responses . filter (
121125 ( r ) => typeof r . baselineToolAccuracy === "number" && r . toolCallingAccuracy < r . baselineToolAccuracy
122126 ) . length ,
123127 reportGeneratedOn : new Date ( ) . toLocaleString ( ) ,
@@ -172,22 +176,74 @@ async function generateHtmlReport(
172176 accuracyRunStatus : formatRunStatus ( comparableResult . runStatus ) ,
173177 reportGeneratedOn : testSummary . reportGeneratedOn ,
174178 createdOn : testSummary . resultCreatedOn ,
175- totalTests : String ( testSummary . totalPrompts ) ,
176- modelsCount : String ( testSummary . totalModels ) ,
177- testsWithZeroAccuracy : String ( testSummary . testsWithZeroAccuracy . length ) ,
179+ totalPrompts : String ( testSummary . totalPrompts ) ,
180+ totalModels : String ( testSummary . totalModels ) ,
181+ responsesWithZeroAccuracy : String ( testSummary . responsesWithZeroAccuracy . length ) ,
178182 averageAccuracy : formatAccuracy ( testSummary . averageAccuracy ) ,
179183 baselineCommitSHA : baselineInfo ?. commitSHA || "-" ,
180184 baselineAccuracyRunId : baselineInfo ?. accuracyRunId || "-" ,
181185 baselineAccuracyRunStatus : baselineInfo ?. accuracyRunStatus
182186 ? formatRunStatus ( baselineInfo ?. accuracyRunStatus )
183187 : "-" ,
184188 baselineCreatedOn : baselineInfo ?. createdOn || "-" ,
185- evalsImproved : baselineInfo ? String ( testSummary . evalsImproved ) : "-" ,
186- evalsRegressed : baselineInfo ? String ( testSummary . evalsRegressed ) : "-" ,
189+ responsesImproved : baselineInfo ? String ( testSummary . responsesImproved ) : "-" ,
190+ responsesRegressed : baselineInfo ? String ( testSummary . responsesRegressed ) : "-" ,
187191 tableRows,
188192 } ) ;
189193}
190194
/**
 * Builds a short Markdown summary of an accuracy run.
 *
 * The brief contains a summary table for the current run, an optional
 * baseline comparison table, and a pointer to the full HTML report.
 *
 * @param comparableResult - The accuracy run being reported on.
 * @param testSummary - Aggregated figures produced by `getTestSummary`.
 * @param baselineInfo - Details of the baseline run, or `null` when there is
 *   no baseline; the comparison section is omitted in that case.
 * @returns The Markdown document as a single newline-joined string.
 */
function generateMarkdownBrief(
    comparableResult: ComparableAccuracyResult,
    testSummary: ReturnType<typeof getTestSummary>,
    baselineInfo: BaselineRunInfo | null
): string {
    const markdownTexts = [
        "# 📊 Accuracy Test Results",
        "## 📈 Summary",
        "| Metric | Value |",
        "|--------|-------|",
        `| **Commit SHA** | \`${comparableResult.commitSHA}\` |`,
        `| **Run ID** | \`${comparableResult.runId}\` |`,
        `| **Status** | ${comparableResult.runStatus} |`,
        `| **Total Prompts Evaluated** | ${testSummary.totalPrompts} |`,
        `| **Models Tested** | ${testSummary.totalModels} |`,
        `| **Average Accuracy** | ${formatAccuracy(testSummary.averageAccuracy)} |`,
        `| **Responses with 0% Accuracy** | ${testSummary.responsesWithZeroAccuracy.length} |`,
        `| **Responses with 75% Accuracy** | ${testSummary.responsesWith75Accuracy.length} |`,
        `| **Responses with 100% Accuracy** | ${testSummary.responsesWith100Accuracy.length} |`,
        "",
    ];

    if (baselineInfo) {
        markdownTexts.push(
            "## 📊 Baseline Comparison",
            // A GFM table needs a header row before the delimiter row;
            // without it the rows below render as plain text.
            "| Metric | Value |",
            "|--------|-------|",
            // Same "-" fallbacks that generateHtmlReport applies to these
            // fields, so a missing value never prints as "undefined".
            `| **Baseline Commit** | \`${baselineInfo.commitSHA || "-"}\` |`,
            `| **Baseline Run ID** | \`${baselineInfo.accuracyRunId || "-"}\` |`,
            `| **Baseline Run Status** | \`${baselineInfo.accuracyRunStatus || "-"}\` |`,
            `| **Responses Improved** | ${testSummary.responsesImproved} |`,
            `| **Responses Regressed** | ${testSummary.responsesRegressed} |`,
            ""
        );
    }

    // Link to the workflow run only when all three GitHub variables are set;
    // otherwise fall back to the path of the generated HTML file.
    const { GITHUB_SERVER_URL, GITHUB_REPOSITORY, GITHUB_RUN_ID } = process.env;
    const githubRunUrl =
        GITHUB_SERVER_URL && GITHUB_REPOSITORY && GITHUB_RUN_ID
            ? `${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}`
            : null;

    const reportLinkText = githubRunUrl
        ? `📎 **[Download Full HTML Report](${githubRunUrl})** - Look for the \`accuracy-test-summary\` artifact for detailed results.`
        : `📎 **Full HTML Report**: \`${HTML_TEST_SUMMARY_FILE}\``;

    markdownTexts.push("---", reportLinkText, "", `*Report generated on: ${testSummary.reportGeneratedOn}*`);

    return markdownTexts.join("\n");
}
246+
191247async function generateTestSummary ( ) {
192248 const storage = getAccuracyResultStorage ( ) ;
193249 try {
@@ -244,25 +300,29 @@ async function generateTestSummary() {
244300 ) ,
245301 } ;
246302
303+ // Ensure that our writable path actually exist.
304+ await mkdir ( path . dirname ( HTML_TEST_SUMMARY_FILE ) , { recursive : true } ) ;
305+
247306 console . log ( `\n📊 Generating test summary for accuracy run: ${ accuracyRunId } \n` ) ;
248307 const testSummary = getTestSummary ( comparableAccuracyResult ) ;
249- const htmlReport = await generateHtmlReport ( comparableAccuracyResult , testSummary , baselineInfo ) ;
250308
251- // Ensure that our writable path actually exist.
252- await mkdir ( path . dirname ( HTML_TESTS_SUMMARY_FILE ) , { recursive : true } ) ;
253- await writeFile ( HTML_TESTS_SUMMARY_FILE , htmlReport , "utf8" ) ;
309+ const htmlReport = await generateHtmlReport ( comparableAccuracyResult , testSummary , baselineInfo ) ;
310+ await writeFile ( HTML_TEST_SUMMARY_FILE , htmlReport , "utf8" ) ;
311+ console . log ( `✅ HTML report generated: ${ HTML_TEST_SUMMARY_FILE } ` ) ;
254312
255- console . log ( `✅ HTML report generated: ${ HTML_TESTS_SUMMARY_FILE } ` ) ;
313+ const markdownBrief = generateMarkdownBrief ( comparableAccuracyResult , testSummary , baselineInfo ) ;
314+ await writeFile ( MARKDOWN_TEST_BRIEF_FILE , markdownBrief , "utf8" ) ;
315+ console . log ( `✅ Markdown brief generated: ${ MARKDOWN_TEST_BRIEF_FILE } ` ) ;
256316
257317 console . log ( `\n📈 Summary:` ) ;
258318 console . log ( ` Total prompts evaluated: ${ testSummary . totalPrompts } ` ) ;
259319 console . log ( ` Models tested: ${ testSummary . totalModels } ` ) ;
260- console . log ( ` Evals with 0% accuracy: ${ testSummary . testsWithZeroAccuracy . length } ` ) ;
320+ console . log ( ` Responses with 0% accuracy: ${ testSummary . responsesWithZeroAccuracy . length } ` ) ;
261321
262322 if ( baselineCommit ) {
263323 console . log ( ` Baseline commit: ${ baselineCommit } ` ) ;
264- console . log ( ` Evals improved vs baseline: ${ testSummary . evalsImproved } ` ) ;
265- console . log ( ` Evals regressed vs baseline: ${ testSummary . evalsRegressed } ` ) ;
324+ console . log ( ` Responses improved vs baseline: ${ testSummary . responsesImproved } ` ) ;
325+ console . log ( ` Responses regressed vs baseline: ${ testSummary . responsesRegressed } ` ) ;
266326 }
267327 } catch ( error ) {
268328 console . error ( "Error generating test summary:" , error ) ;
0 commit comments