@@ -9,7 +9,11 @@ import {
9
9
ModelResponse ,
10
10
} from "../../tests/accuracy/sdk/accuracy-result-storage/result-storage.js" ;
11
11
import { getCommitSHA } from "../../tests/accuracy/sdk/git-info.js" ;
12
- import { HTML_TESTS_SUMMARY_FILE , HTML_TESTS_SUMMARY_TEMPLATE } from "../../tests/accuracy/sdk/constants.js" ;
12
+ import {
13
+ HTML_TEST_SUMMARY_FILE ,
14
+ HTML_TESTS_SUMMARY_TEMPLATE ,
15
+ MARKDOWN_TEST_BRIEF_FILE ,
16
+ } from "../../tests/accuracy/sdk/constants.js" ;
13
17
14
18
type ComparableAccuracyResult = Omit < AccuracyResult , "promptResults" > & {
15
19
promptAndModelResponses : PromptAndModelResponse [ ] ;
@@ -109,15 +113,15 @@ function getTestSummary(comparableResult: ComparableAccuracyResult) {
109
113
return {
110
114
totalPrompts : new Set ( responses . map ( ( r ) => r . prompt ) ) . size ,
111
115
totalModels : new Set ( responses . map ( ( r ) => `${ r . provider } ${ r . requestedModel } ` ) ) . size ,
112
- testsWithZeroAccuracy : responses . filter ( ( r ) => r . toolCallingAccuracy === 0 ) ,
113
- testsWith75Accuracy : responses . filter ( ( r ) => r . toolCallingAccuracy === 0.75 ) ,
114
- testsWith100Accuracy : responses . filter ( ( r ) => r . toolCallingAccuracy === 100 ) ,
116
+ responsesWithZeroAccuracy : responses . filter ( ( r ) => r . toolCallingAccuracy === 0 ) ,
117
+ responsesWith75Accuracy : responses . filter ( ( r ) => r . toolCallingAccuracy === 0.75 ) ,
118
+ responsesWith100Accuracy : responses . filter ( ( r ) => r . toolCallingAccuracy === 1 ) ,
115
119
averageAccuracy :
116
120
responses . length > 0 ? responses . reduce ( ( sum , r ) => sum + r . toolCallingAccuracy , 0 ) / responses . length : 0 ,
117
- evalsImproved : responses . filter (
121
+ responsesImproved : responses . filter (
118
122
( r ) => typeof r . baselineToolAccuracy === "number" && r . toolCallingAccuracy > r . baselineToolAccuracy
119
123
) . length ,
120
- evalsRegressed : responses . filter (
124
+ responsesRegressed : responses . filter (
121
125
( r ) => typeof r . baselineToolAccuracy === "number" && r . toolCallingAccuracy < r . baselineToolAccuracy
122
126
) . length ,
123
127
reportGeneratedOn : new Date ( ) . toLocaleString ( ) ,
@@ -172,22 +176,74 @@ async function generateHtmlReport(
172
176
accuracyRunStatus : formatRunStatus ( comparableResult . runStatus ) ,
173
177
reportGeneratedOn : testSummary . reportGeneratedOn ,
174
178
createdOn : testSummary . resultCreatedOn ,
175
- totalTests : String ( testSummary . totalPrompts ) ,
176
- modelsCount : String ( testSummary . totalModels ) ,
177
- testsWithZeroAccuracy : String ( testSummary . testsWithZeroAccuracy . length ) ,
179
+ totalPrompts : String ( testSummary . totalPrompts ) ,
180
+ totalModels : String ( testSummary . totalModels ) ,
181
+ responsesWithZeroAccuracy : String ( testSummary . responsesWithZeroAccuracy . length ) ,
178
182
averageAccuracy : formatAccuracy ( testSummary . averageAccuracy ) ,
179
183
baselineCommitSHA : baselineInfo ?. commitSHA || "-" ,
180
184
baselineAccuracyRunId : baselineInfo ?. accuracyRunId || "-" ,
181
185
baselineAccuracyRunStatus : baselineInfo ?. accuracyRunStatus
182
186
? formatRunStatus ( baselineInfo ?. accuracyRunStatus )
183
187
: "-" ,
184
188
baselineCreatedOn : baselineInfo ?. createdOn || "-" ,
185
- evalsImproved : baselineInfo ? String ( testSummary . evalsImproved ) : "-" ,
186
- evalsRegressed : baselineInfo ? String ( testSummary . evalsRegressed ) : "-" ,
189
+ responsesImproved : baselineInfo ? String ( testSummary . responsesImproved ) : "-" ,
190
+ responsesRegressed : baselineInfo ? String ( testSummary . responsesRegressed ) : "-" ,
187
191
tableRows,
188
192
} ) ;
189
193
}
190
194
195
+ function generateMarkdownBrief (
196
+ comparableResult : ComparableAccuracyResult ,
197
+ testSummary : ReturnType < typeof getTestSummary > ,
198
+ baselineInfo : BaselineRunInfo | null
199
+ ) : string {
200
+ const markdownTexts = [
201
+ "# 📊 Accuracy Test Results" ,
202
+ "## 📈 Summary" ,
203
+ "| Metric | Value |" ,
204
+ "|--------|-------|" ,
205
+ `| **Commit SHA** | \`${ comparableResult . commitSHA } \` |` ,
206
+ `| **Run ID** | \`${ comparableResult . runId } \` |` ,
207
+ `| **Status** | ${ comparableResult . runStatus } |` ,
208
+ `| **Total Prompts Evaluated** | ${ testSummary . totalPrompts } |` ,
209
+ `| **Models Tested** | ${ testSummary . totalModels } |` ,
210
+ `| **Average Accuracy** | ${ formatAccuracy ( testSummary . averageAccuracy ) } |` ,
211
+ `| **Responses with 0% Accuracy** | ${ testSummary . responsesWithZeroAccuracy . length } |` ,
212
+ `| **Responses with 75% Accuracy** | ${ testSummary . responsesWith75Accuracy . length } |` ,
213
+ `| **Responses with 100% Accuracy** | ${ testSummary . responsesWith100Accuracy . length } |` ,
214
+ "" ,
215
+ ] ;
216
+
217
+ if ( baselineInfo ) {
218
+ markdownTexts . push (
219
+ ...[
220
+ "## 📊 Baseline Comparison" ,
221
+ "|--------|-------|" ,
222
+ `| **Baseline Commit** | \`${ baselineInfo . commitSHA } \` |` ,
223
+ `| **Baseline Run ID** | \`${ baselineInfo . accuracyRunId } \` |` ,
224
+ `| **Baseline Run Status** | \`${ baselineInfo . accuracyRunStatus } \` |` ,
225
+ `| **Responses Improved** | ${ testSummary . responsesImproved } |` ,
226
+ `| **Responses Regressed** | ${ testSummary . responsesRegressed } |` ,
227
+ "" ,
228
+ ]
229
+ ) ;
230
+ }
231
+
232
+ const { GITHUB_SERVER_URL , GITHUB_REPOSITORY , GITHUB_RUN_ID } = process . env ;
233
+ const githubRunUrl =
234
+ GITHUB_SERVER_URL && GITHUB_REPOSITORY && GITHUB_RUN_ID
235
+ ? `${ GITHUB_SERVER_URL } /${ GITHUB_REPOSITORY } /actions/runs/${ GITHUB_RUN_ID } `
236
+ : null ;
237
+
238
+ const reportLinkText = githubRunUrl
239
+ ? `📎 **[Download Full HTML Report](${ githubRunUrl } )** - Look for the \`accuracy-test-summary\` artifact for detailed results.`
240
+ : `📎 **Full HTML Report**: \`${ HTML_TEST_SUMMARY_FILE } \`` ;
241
+
242
+ markdownTexts . push ( ...[ "---" , reportLinkText , "" , `*Report generated on: ${ testSummary . reportGeneratedOn } *` ] ) ;
243
+
244
+ return markdownTexts . join ( "\n" ) ;
245
+ }
246
+
191
247
async function generateTestSummary ( ) {
192
248
const storage = getAccuracyResultStorage ( ) ;
193
249
try {
@@ -244,25 +300,29 @@ async function generateTestSummary() {
244
300
) ,
245
301
} ;
246
302
303
+ // Ensure that our writable path actually exists.
304
+ await mkdir ( path . dirname ( HTML_TEST_SUMMARY_FILE ) , { recursive : true } ) ;
305
+
247
306
console . log ( `\n📊 Generating test summary for accuracy run: ${ accuracyRunId } \n` ) ;
248
307
const testSummary = getTestSummary ( comparableAccuracyResult ) ;
249
- const htmlReport = await generateHtmlReport ( comparableAccuracyResult , testSummary , baselineInfo ) ;
250
308
251
- // Ensure that our writable path actually exist.
252
- await mkdir ( path . dirname ( HTML_TESTS_SUMMARY_FILE ) , { recursive : true } ) ;
253
- await writeFile ( HTML_TESTS_SUMMARY_FILE , htmlReport , "utf8" ) ;
309
+ const htmlReport = await generateHtmlReport ( comparableAccuracyResult , testSummary , baselineInfo ) ;
310
+ await writeFile ( HTML_TEST_SUMMARY_FILE , htmlReport , "utf8" ) ;
311
+ console . log ( `✅ HTML report generated: ${ HTML_TEST_SUMMARY_FILE } ` ) ;
254
312
255
- console . log ( `✅ HTML report generated: ${ HTML_TESTS_SUMMARY_FILE } ` ) ;
313
+ const markdownBrief = generateMarkdownBrief ( comparableAccuracyResult , testSummary , baselineInfo ) ;
314
+ await writeFile ( MARKDOWN_TEST_BRIEF_FILE , markdownBrief , "utf8" ) ;
315
+ console . log ( `✅ Markdown brief generated: ${ MARKDOWN_TEST_BRIEF_FILE } ` ) ;
256
316
257
317
console . log ( `\n📈 Summary:` ) ;
258
318
console . log ( ` Total prompts evaluated: ${ testSummary . totalPrompts } ` ) ;
259
319
console . log ( ` Models tested: ${ testSummary . totalModels } ` ) ;
260
- console . log ( ` Evals with 0% accuracy: ${ testSummary . testsWithZeroAccuracy . length } ` ) ;
320
+ console . log ( ` Responses with 0% accuracy: ${ testSummary . responsesWithZeroAccuracy . length } ` ) ;
261
321
262
322
if ( baselineCommit ) {
263
323
console . log ( ` Baseline commit: ${ baselineCommit } ` ) ;
264
- console . log ( ` Evals improved vs baseline: ${ testSummary . evalsImproved } ` ) ;
265
- console . log ( ` Evals regressed vs baseline: ${ testSummary . evalsRegressed } ` ) ;
324
+ console . log ( ` Responses improved vs baseline: ${ testSummary . responsesImproved } ` ) ;
325
+ console . log ( ` Responses regressed vs baseline: ${ testSummary . responsesRegressed } ` ) ;
266
326
}
267
327
} catch ( error ) {
268
328
console . error ( "Error generating test summary:" , error ) ;
0 commit comments