3131 background : # f8f9fa ;
3232 padding : 20px ;
3333 border-radius : 6px ;
34- margin-bottom : 30 px ;
34+ margin-bottom : 20 px ;
3535 border-left : 4px solid # 00684a ;
3636 }
/* Blue left accent for a .header-info panel that is the third child of its
   parent. In the markup of this template that is the "Test Results Summary"
   panel (the <h1> and the first panel precede it inside .container). */
.header-info:nth-child(3) {
    border-left-color: #007bff;
}
/* Green left accent for a .header-info panel that is the fourth child of its
   parent. In the markup of this template that is the "Baseline Comparison"
   panel. */
.header-info:nth-child(4) {
    border-left-color: #28a745;
}
/* Panel heading: no gap above, fixed gap below, default (dark green) colour.
   Position-specific rules elsewhere in the stylesheet may override the colour. */
.header-info h2 {
    margin-top: 0;
    margin-bottom: 15px;
    color: #00684a;
    font-size: 1.2em;
}
/* Heading colour for the panel that is its parent's third child (blue). */
.header-info:nth-child(3) h2 {
    color: #007bff;
}
/* Heading colour for the panel that is its parent's fourth child (green). */
.header-info:nth-child(4) h2 {
    color: #28a745;
}
/* Responsive grid for the label/value items inside a panel: as many equal
   columns as fit, each at least 200px wide, sharing the remaining space. */
.info-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
    gap: 15px;
    margin-top: 15px;
}
/* Accuracy badge, green variant: bold dark-green text on a light-green pill. */
.accuracy-perfect {
    background-color: #d4edda;
    color: #155724;
    padding: 2px 6px;
    border-radius: 3px;
    font-weight: bold;
}
/* Accuracy badge, amber variant: bold brown text on a light-yellow pill. */
.accuracy-good {
    background-color: #fff3cd;
    color: #856404;
    padding: 2px 6px;
    border-radius: 3px;
    font-weight: bold;
}
/* Accuracy badge, red variant: bold dark-red text on a light-red pill. */
.accuracy-poor {
    background-color: #f8d7da;
    color: #721c24;
    padding: 2px 6px;
    border-radius: 3px;
    font-weight: bold;
}
179193 .tool-call {
215229 min-width : 80px ;
216230 text-align : center;
217231 }
/* Column sizing for baseline-accuracy cells: 8% of the table width, never
   narrower than 80px, with centred content. */
.baseline-accuracy-cell {
    width: 8%;
    min-width: 80px;
    text-align: center;
}
/* Neutral grey pill for a baseline-comparison value: supplies the padding,
   corner radius and bold weight; sets no text colour of its own. */
.accuracy-comparison {
    background: #e9ecef;
    padding: 2px 6px;
    border-radius: 3px;
    font-weight: bold;
}
/* Green colour pair (light-green background, dark-green text). Colours only:
   this rule sets no padding or radius.
   NOTE(review): presumably applied together with .accuracy-comparison; confirm
   against the code that renders the table rows. */
.accuracy-improved {
    background: #d4edda;
    color: #155724;
}
/* Red colour pair (light-red background, dark-red text). Colours only: this
   rule sets no padding or radius.
   NOTE(review): presumably applied together with .accuracy-comparison; confirm
   against the code that renders the table rows. */
.accuracy-regressed {
    background: #f8d7da;
    color: #721c24;
}
/* Grey colour pair (light-grey background, dark-grey text). Colours only: this
   rule sets no padding or radius.
   NOTE(review): presumably applied together with .accuracy-comparison; confirm
   against the code that renders the table rows. */
.accuracy-same {
    background: #e2e3e5;
    color: #495057;
}
218255 .response-time-cell {
219256 width : 10% ;
220257 min-width : 100px ;
264301 < div class ="container ">
265302 < h1 > 📊 MongoDB MCP Server - Accuracy Test Summary</ h1 >
<!-- Panel: identifiers and timestamps of the current accuracy run.
     The {{...}} tokens are template placeholders; presumably substituted by
     the report generator before the page is written (confirm against caller). -->
<div class="header-info">
    <h2>📊 Current Run Information</h2>
    <div class="info-grid">
        <div class="info-item">
            <div class="info-label">Accuracy Run ID</div>
            <div class="info-value">{{accuracyRunId}}</div>
        </div>
        <div class="info-item">
            <div class="info-label">Commit SHA</div>
            <div class="info-value">{{commitSHA}}</div>
        </div>
        <div class="info-item">
            <div class="info-label">Run Created On</div>
            <div class="info-value">{{createdOn}}</div>
        </div>
        <div class="info-item">
            <div class="info-label">Report Generated On</div>
            <div class="info-value">{{reportGeneratedOn}}</div>
        </div>
    </div>
</div>
324+
325+ < div class ="header-info ">
326+ < h2 > 📈 Test Results Summary</ h2 >
327+ < div class ="info-grid ">
289328 < div class ="info-item ">
290329 < div class ="info-label "> Total Prompts Evaluated</ div >
291330 < div class ="info-value "> {{totalTests}}</ div >
@@ -298,6 +337,36 @@ <h2>Run Information & Summary</h2>
298337 < div class ="info-label "> Evals with 0% Accuracy</ div >
299338 < div class ="info-value "> {{testsWithZeroAccuracy}}</ div >
300339 </ div >
340+ < div class ="info-item ">
341+ < div class ="info-label "> Average Accuracy</ div >
342+ < div class ="info-value "> {{averageAccuracy}}</ div >
343+ </ div >
344+ </ div >
345+ </ div >
346+
<!-- Panel: the baseline run being compared against, plus counts of evals
     that improved or regressed relative to it.
     The {{...}} tokens are template placeholders; presumably substituted by
     the report generator before the page is written (confirm against caller). -->
<div class="header-info">
    <h2>🔄 Baseline Comparison</h2>
    <div class="info-grid">
        <div class="info-item">
            <div class="info-label">Baseline Accuracy Run ID</div>
            <div class="info-value">{{baselineAccuracyRunId}}</div>
        </div>
        <div class="info-item">
            <div class="info-label">Baseline Commit SHA</div>
            <div class="info-value">{{baselineCommitSHA}}</div>
        </div>
        <div class="info-item">
            <div class="info-label">Baseline Run Created On</div>
            <div class="info-value">{{baselineCreatedOn}}</div>
        </div>
        <div class="info-item">
            <div class="info-label">Evals Improved vs Baseline</div>
            <div class="info-value">{{evalsImproved}}</div>
        </div>
        <div class="info-item">
            <div class="info-label">Evals Regressed vs Baseline</div>
            <div class="info-value">{{evalsRegressed}}</div>
        </div>
    </div>
</div>
303372 < table >
@@ -308,6 +377,7 @@ <h2>Run Information & Summary</h2>
308377 < th > Expected Tool Calls</ th >
309378 < th > LLM Tool Calls</ th >
310379 < th > Accuracy</ th >
380+ < th > Baseline Accuracy</ th >
311381 < th > LLM Response Time (ms)</ th >
312382 < th > Total Tokens Used</ th >
313383 </ tr >
0 commit comments