
Commit 86cd0bb

Implement token usage and latency comparison charts in eval Reporter (#15)
- Added a new `createBarChart` function for generating horizontal bar charts to visualize token usage and latency.
- Updated the `Reporter` class to display token usage and latency comparisons when multiple executors are present.
- Enhanced `TestRunner` and `PromptWithTests` to aggregate token usage data across tests and include it in the final results.
- Improved error handling in `VercelAIExecutor` for usage extraction, supporting both new and legacy formats.
1 parent: 75f0ba1 · commit: 86cd0bb

6 files changed: +203 additions, -22 deletions


packages/cli/src/output/Reporter.ts

Lines changed: 94 additions & 19 deletions
@@ -5,8 +5,8 @@
 import chalk from "chalk";
 import ora from "ora";
 import type { RunnerResults } from "../runner/TestRunner.js";
-import type { EvalResult } from "@marrakesh/core";
-import { formatDuration, formatDiff } from "./formatters.js";
+import type { EvalResult, ExecutionStep } from "@marrakesh/core";
+import { formatDuration, formatDiff, createBarChart } from "./formatters.js";
 
 /**
  * Extract tool names used from execution steps
@@ -155,25 +155,100 @@ export class Reporter {
 
     console.log();
 
-    // Print executor summary
-    console.log(chalk.bold("Executor Summary:"));
-    for (const executorName of executorNames) {
-      const executorResult = testResults.executorResults[executorName];
-      const total = executorResult.passed + executorResult.failed;
-      const passRate =
-        total > 0
-          ? ((executorResult.passed / total) * 100).toFixed(1)
-          : "0.0";
-      const color = executorResult.failed === 0 ? chalk.green : chalk.red;
+    if (executorNames.length > 1) {
+      // Display token usage bar chart for multi-executor comparison
+      console.log(chalk.bold("Token Usage Comparison:"));
+
+      // Prepare data for bar chart with input/output token breakdown
+      const tokenData = executorNames.map((name) => {
+        const executorResult = testResults.executorResults?.[name];
+        if (!executorResult) {
+          return {
+            label: name,
+            value: 0,
+            color: "gray",
+            formatValue: (value: number) =>
+              `${value.toLocaleString()} (0 in / 0 out)`,
+          };
+        }
 
-      console.log(
-        color(
-          ` ${executorName}: ${executorResult.passed}/${total} passed (${passRate}%)`,
-        ),
-      );
-    }
+        const executorUsage = executorResult.results.reduce(
+          (
+            acc: {
+              totalTokens: number;
+              promptTokens: number;
+              completionTokens: number;
+            },
+            result: EvalResult,
+          ) => {
+            if (result.usage) {
+              acc.totalTokens += result.usage.totalTokens;
+              acc.promptTokens += result.usage.promptTokens;
+              acc.completionTokens += result.usage.completionTokens;
+            }
+            return acc;
+          },
+          { totalTokens: 0, promptTokens: 0, completionTokens: 0 },
+        );
 
-    console.log();
+        // Use red color if there are failed tests, gray otherwise
+        const hasFailures = executorResult.failed > 0;
+        const color = hasFailures ? "red" : "gray";
+
+        return {
+          label: name,
+          value: executorUsage.totalTokens,
+          color,
+          formatValue: (value: number) =>
+            `${value.toLocaleString()} (${executorUsage.promptTokens} in / ${executorUsage.completionTokens} out)`,
+        };
+      });
+
+      // Display bar chart
+      const chart = createBarChart(tokenData, 40);
+      for (const line of chart) {
+        console.log(line);
+      }
+
+      // Add latency comparison chart
+      console.log();
+      console.log(chalk.bold("Latency Comparison:"));
+      const latencyData = executorNames.map((name) => {
+        const executorResult = testResults.executorResults?.[name];
+        if (!executorResult) {
+          return { label: name, value: 0, color: "gray" };
+        }
+
+        // Calculate average latency for this executor
+        const totalDuration = executorResult.results.reduce(
+          (acc: number, result: EvalResult) => {
+            return acc + result.duration;
+          },
+          0,
+        );
+        const averageLatency =
+          executorResult.results.length > 0
+            ? totalDuration / executorResult.results.length
+            : 0;
+
+        // Use red color if there are failed tests, white otherwise
+        const hasFailures = executorResult.failed > 0;
+        const color = hasFailures ? "red" : "white";
+
+        return {
+          label: name,
+          value: averageLatency,
+          color,
+          formatValue: (value: number) => formatDuration(value),
+        };
+      });
+
+      const latencyChart = createBarChart(latencyData, 40);
+      for (const line of latencyChart) {
+        console.log(line);
+      }
+      console.log();
+    }
   }
 
   // Print overall summary

packages/cli/src/output/formatters.ts

Lines changed: 34 additions & 0 deletions
@@ -2,6 +2,7 @@
  * Formatters - Utility functions for formatting output
  */
 
+import chalk from "chalk";
 import type { TestResults } from "@marrakesh/core";
 
 /**
@@ -58,3 +59,36 @@ export function formatError(error: Error): string {
 
   return output;
 }
+
+/**
+ * Generate a horizontal bar chart for comparison
+ * @param data - Array of { label, value, color, formatValue? } objects
+ * @param maxWidth - Maximum width of the bar in characters
+ * @returns Formatted bar chart string
+ */
+export function createBarChart(
+  data: Array<{
+    label: string;
+    value: number;
+    color?: string;
+    formatValue?: (value: number) => string;
+  }>,
+  maxWidth = 40,
+): string[] {
+  const maxValue = Math.max(...data.map((d) => d.value));
+  if (maxValue === 0) return [];
+
+  return data.map(({ label, value, color, formatValue }) => {
+    const barLength = Math.round((value / maxValue) * maxWidth);
+    // Use gradient character (▓) for 3D effect instead of solid block (█)
+    const bar = "▓".repeat(barLength);
+    const spaces = " ".repeat(maxWidth - barLength);
+    const percentage = ((value / maxValue) * 100).toFixed(0);
+
+    const coloredBar = color ? chalk[color](bar) : bar;
+    const displayValue = formatValue
+      ? formatValue(value)
+      : value.toLocaleString();
+    return ` ${label.padEnd(15)} ${coloredBar}${spaces} ${displayValue} (${percentage}%)`;
+  });
+}
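
For orientation, a minimal usage sketch of the new helper (the executor labels and token counts are invented, and the rendered output assumes an en-US locale for toLocaleString):

import { createBarChart } from "./formatters.js";

// Hypothetical values; a width of 20 keeps the example narrow.
const lines = createBarChart(
  [
    { label: "gpt-4o", value: 1200 },
    { label: "gpt-4o-mini", value: 800 },
  ],
  20,
);
lines.forEach((line) => console.log(line));
// gpt-4o fills all 20 cells (100%); gpt-4o-mini gets round(800/1200 * 20) = 13:
//  gpt-4o          ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ 1,200 (100%)
//  gpt-4o-mini     ▓▓▓▓▓▓▓▓▓▓▓▓▓        800 (67%)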

packages/cli/src/runner/TestRunner.ts

Lines changed: 21 additions & 0 deletions
@@ -19,6 +19,12 @@ export interface RunnerResults {
     promptName: string;
     results: TestResults;
   }>;
+  /** Aggregated token usage across all tests */
+  totalUsage?: {
+    promptTokens: number;
+    completionTokens: number;
+    totalTokens: number;
+  };
 }
 
 /**
@@ -183,12 +189,27 @@ export class TestRunner {
     const passed = promptResults.reduce((sum, r) => sum + r.results.passed, 0);
     const failed = promptResults.reduce((sum, r) => sum + r.results.failed, 0);
 
+    // Calculate total usage across all prompt results
+    const totalUsage = promptResults.reduce(
+      (acc, promptResult) => {
+        if (promptResult.results.totalUsage) {
+          acc.promptTokens += promptResult.results.totalUsage.promptTokens;
+          acc.completionTokens +=
+            promptResult.results.totalUsage.completionTokens;
+          acc.totalTokens += promptResult.results.totalUsage.totalTokens;
+        }
+        return acc;
+      },
+      { promptTokens: 0, completionTokens: 0, totalTokens: 0 },
+    );
+
     const finalResults = {
       total,
       passed,
       failed,
       duration: Date.now() - startTime,
       promptResults,
+      totalUsage: totalUsage.totalTokens > 0 ? totalUsage : undefined,
     };
 
     return finalResults;
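
One subtlety in the aggregation: `totalUsage` is left `undefined` rather than all-zero when nothing reported tokens, so consumers can tell "no usage data" apart from "zero tokens". A small sketch of that contract with made-up numbers (the `Usage` alias is ours, for illustration only):

type Usage = { promptTokens: number; completionTokens: number; totalTokens: number };

// Made-up per-test records; `undefined` models a test whose executor
// reported no usage at all.
const usages: Array<Usage | undefined> = [
  { promptTokens: 100, completionTokens: 40, totalTokens: 140 },
  undefined,
];

const sum = usages.reduce<Usage>(
  (acc, u) => {
    if (u) {
      acc.promptTokens += u.promptTokens;
      acc.completionTokens += u.completionTokens;
      acc.totalTokens += u.totalTokens;
    }
    return acc;
  },
  { promptTokens: 0, completionTokens: 0, totalTokens: 0 },
);

// Mirrors the commit's `totalUsage.totalTokens > 0 ? totalUsage : undefined`:
const totalUsage = sum.totalTokens > 0 ? sum : undefined;
// Here: { promptTokens: 100, completionTokens: 40, totalTokens: 140 }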

packages/core/src/executors/VercelAIExecutor.ts

Lines changed: 26 additions & 3 deletions
@@ -143,14 +143,37 @@ export function createVercelAIExecutor(config: ExecutorConfig): Executor {
         output = resultObj.text || "";
       }
 
+      // Extract usage information with better error handling
+      let usage: ExecutionResult["usage"] | undefined;
+
+      if (resultObj.usage) {
+        const usageObj = resultObj.usage as Record<string, unknown>;
+
+        // Support both AI SDK 5.0 format (inputTokens/outputTokens) and legacy format (promptTokens/completionTokens)
+        usage = {
+          promptTokens:
+            typeof usageObj.inputTokens === "number"
+              ? usageObj.inputTokens
+              : typeof usageObj.promptTokens === "number"
+                ? usageObj.promptTokens
+                : 0,
+          completionTokens:
+            typeof usageObj.outputTokens === "number"
+              ? usageObj.outputTokens
+              : typeof usageObj.completionTokens === "number"
+                ? usageObj.completionTokens
+                : 0,
+          totalTokens:
+            typeof usageObj.totalTokens === "number" ? usageObj.totalTokens : 0,
+        };
+      }
+
       return {
         output,
         steps,
         finishReason: (result as { finishReason: string })
           .finishReason as ExecutionResult["finishReason"],
-        usage: (result as { usage?: unknown }).usage as
-          | ExecutionResult["usage"]
-          | undefined,
+        usage,
       };
     } catch (error) {
       // Handle timeout or other errors
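
The dual-format handling above can be read as a small normalization step. A standalone sketch of the same logic, useful for unit testing it in isolation (the `normalizeUsage` name and `TokenUsage` interface are ours, not part of the commit):

// Shape produced by the executor's usage extraction.
interface TokenUsage {
  promptTokens: number;
  completionTokens: number;
  totalTokens: number;
}

// Prefer AI SDK 5.0 field names, fall back to legacy ones, default to 0.
function normalizeUsage(raw: Record<string, unknown>): TokenUsage {
  const num = (v: unknown): number | undefined =>
    typeof v === "number" ? v : undefined;
  return {
    promptTokens: num(raw.inputTokens) ?? num(raw.promptTokens) ?? 0,
    completionTokens: num(raw.outputTokens) ?? num(raw.completionTokens) ?? 0,
    totalTokens: num(raw.totalTokens) ?? 0,
  };
}

// Both shapes normalize to the same result:
normalizeUsage({ inputTokens: 10, outputTokens: 5, totalTokens: 15 });
normalizeUsage({ promptTokens: 10, completionTokens: 5, totalTokens: 15 });
// → { promptTokens: 10, completionTokens: 5, totalTokens: 15 }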

packages/core/src/testing/PromptWithTests.ts

Lines changed: 16 additions & 0 deletions
@@ -174,13 +174,27 @@ export class PromptWithTests {
       }
     }
 
+    // Calculate total usage across all results
+    const totalUsage = allResults.reduce(
+      (acc, result) => {
+        if (result.usage) {
+          acc.promptTokens += result.usage.promptTokens;
+          acc.completionTokens += result.usage.completionTokens;
+          acc.totalTokens += result.usage.totalTokens;
+        }
+        return acc;
+      },
+      { promptTokens: 0, completionTokens: 0, totalTokens: 0 },
+    );
+
     const finalResults: TestResults = {
       total: allResults.length,
       passed: allResults.filter((r) => r.passed).length,
       failed: allResults.filter((r) => !r.passed).length,
       duration: Date.now() - startTime,
       results: allResults,
       executorResults,
+      totalUsage: totalUsage.totalTokens > 0 ? totalUsage : undefined,
     };
 
     // Track test run (fire-and-forget, non-blocking)
@@ -234,6 +248,7 @@ export class PromptWithTests {
         error: executionResult.error,
         expected: testCase.expect,
         steps: executionResult.steps,
+        usage: executionResult.usage,
         executor: executorMetadata,
       };
     }
@@ -252,6 +267,7 @@ export class PromptWithTests {
       execution_id: executionId,
       expected: testCase.expect,
      steps: executionResult.steps,
+      usage: executionResult.usage,
      executor: executorMetadata,
     };
   } catch (error) {

packages/core/src/testing/types.ts

Lines changed: 12 additions & 0 deletions
@@ -62,6 +62,12 @@ export interface EvalResult {
   expected?: unknown;
   /** Execution steps including tool calls */
   steps?: ExecutionStep[];
+  /** Token usage information */
+  usage?: {
+    promptTokens: number;
+    completionTokens: number;
+    totalTokens: number;
+  };
   /** Executor metadata (model and config) */
   executor?: ExecutorMetadata;
 }
@@ -89,6 +95,12 @@ export interface TestResults {
       results: EvalResult[];
     }
   >;
+  /** Aggregated token usage across all tests */
+  totalUsage?: {
+    promptTokens: number;
+    completionTokens: number;
+    totalTokens: number;
+  };
 }
 
 /**
