Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 94 additions & 19 deletions packages/cli/src/output/Reporter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import chalk from "chalk";
import ora from "ora";
import type { RunnerResults } from "../runner/TestRunner.js";
import type { EvalResult } from "@marrakesh/core";
import { formatDuration, formatDiff } from "./formatters.js";
import type { EvalResult, ExecutionStep } from "@marrakesh/core";
import { formatDuration, formatDiff, createBarChart } from "./formatters.js";

/**
* Extract tool names used from execution steps
Expand Down Expand Up @@ -155,25 +155,100 @@ export class Reporter {

console.log();

// Print executor summary
console.log(chalk.bold("Executor Summary:"));
for (const executorName of executorNames) {
const executorResult = testResults.executorResults[executorName];
const total = executorResult.passed + executorResult.failed;
const passRate =
total > 0
? ((executorResult.passed / total) * 100).toFixed(1)
: "0.0";
const color = executorResult.failed === 0 ? chalk.green : chalk.red;
if (executorNames.length > 1) {
// Display token usage bar chart for multi-executor comparison
console.log(chalk.bold("Token Usage Comparison:"));

// Prepare data for bar chart with input/output token breakdown
const tokenData = executorNames.map((name) => {
const executorResult = testResults.executorResults?.[name];
if (!executorResult) {
return {
label: name,
value: 0,
color: "gray",
formatValue: (value: number) =>
`${value.toLocaleString()} (0 in / 0 out)`,
};
}

console.log(
color(
` ${executorName}: ${executorResult.passed}/${total} passed (${passRate}%)`,
),
);
}
const executorUsage = executorResult.results.reduce(
(
acc: {
totalTokens: number;
promptTokens: number;
completionTokens: number;
},
result: EvalResult,
) => {
if (result.usage) {
acc.totalTokens += result.usage.totalTokens;
acc.promptTokens += result.usage.promptTokens;
acc.completionTokens += result.usage.completionTokens;
}
return acc;
},
{ totalTokens: 0, promptTokens: 0, completionTokens: 0 },
);

console.log();
// Use red color if there are failed tests, gray otherwise
const hasFailures = executorResult.failed > 0;
const color = hasFailures ? "red" : "gray";

return {
label: name,
value: executorUsage.totalTokens,
color,
formatValue: (value: number) =>
`${value.toLocaleString()} (${executorUsage.promptTokens} in / ${executorUsage.completionTokens} out)`,
};
});

// Display bar chart
const chart = createBarChart(tokenData, 40);
for (const line of chart) {
console.log(line);
}

// Add latency comparison chart
console.log();
console.log(chalk.bold("Latency Comparison:"));
const latencyData = executorNames.map((name) => {
const executorResult = testResults.executorResults?.[name];
if (!executorResult) {
return { label: name, value: 0, color: "gray" };
}

// Calculate average latency for this executor
const totalDuration = executorResult.results.reduce(
(acc: number, result: EvalResult) => {
return acc + result.duration;
},
0,
);
const averageLatency =
executorResult.results.length > 0
? totalDuration / executorResult.results.length
: 0;

// Use red color if there are failed tests, white otherwise
const hasFailures = executorResult.failed > 0;
const color = hasFailures ? "red" : "white";

return {
label: name,
value: averageLatency,
color,
formatValue: (value: number) => formatDuration(value),
};
});

const latencyChart = createBarChart(latencyData, 40);
for (const line of latencyChart) {
console.log(line);
}
console.log();
}
}

// Print overall summary
Expand Down
34 changes: 34 additions & 0 deletions packages/cli/src/output/formatters.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
* Formatters - Utility functions for formatting output
*/

import chalk from "chalk";
import type { TestResults } from "@marrakesh/core";

/**
Expand Down Expand Up @@ -58,3 +59,36 @@ export function formatError(error: Error): string {

return output;
}

/**
 * Generate a horizontal bar chart for comparison.
 *
 * Each entry is rendered as one line: padded label, colored bar scaled
 * against the largest value, formatted value, and percent of max.
 *
 * @param data - Array of { label, value, color?, formatValue? } entries
 * @param maxWidth - Maximum width of the bar in characters
 * @returns One formatted line per entry; empty array when no value is positive
 */
export function createBarChart(
  data: Array<{
    label: string;
    value: number;
    color?: string;
    formatValue?: (value: number) => string;
  }>,
  maxWidth = 40,
): string[] {
  const maxValue = Math.max(...data.map((d) => d.value));
  // Also covers empty input: Math.max() of nothing is -Infinity.
  if (maxValue <= 0) return [];

  return data.map(({ label, value, color, formatValue }) => {
    // Clamp to [0, maxWidth] so a negative value can't produce a negative
    // repeat count (String.repeat throws RangeError on negatives).
    const barLength = Math.min(
      maxWidth,
      Math.max(0, Math.round((value / maxValue) * maxWidth)),
    );
    // Use gradient character (▓) for 3D effect instead of solid block (█)
    const bar = "▓".repeat(barLength);
    const spaces = " ".repeat(maxWidth - barLength);
    const percentage = ((Math.max(0, value) / maxValue) * 100).toFixed(0);

    // chalk's default export has no string index signature, so a bare
    // chalk[color] fails under strict TS and throws at runtime for unknown
    // color names. Look the style up defensively and fall back to no color.
    const style =
      color !== undefined
        ? (chalk as unknown as Record<string, unknown>)[color]
        : undefined;
    const coloredBar =
      typeof style === "function" ? (style as (s: string) => string)(bar) : bar;

    const displayValue = formatValue
      ? formatValue(value)
      : value.toLocaleString();
    return ` ${label.padEnd(15)} ${coloredBar}${spaces} ${displayValue} (${percentage}%)`;
  });
}
21 changes: 21 additions & 0 deletions packages/cli/src/runner/TestRunner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ export interface RunnerResults {
promptName: string;
results: TestResults;
}>;
/** Aggregated token usage across all tests */
totalUsage?: {
promptTokens: number;
completionTokens: number;
totalTokens: number;
};
}

/**
Expand Down Expand Up @@ -183,12 +189,27 @@ export class TestRunner {
const passed = promptResults.reduce((sum, r) => sum + r.results.passed, 0);
const failed = promptResults.reduce((sum, r) => sum + r.results.failed, 0);

// Calculate total usage across all prompt results
const totalUsage = promptResults.reduce(
(acc, promptResult) => {
if (promptResult.results.totalUsage) {
acc.promptTokens += promptResult.results.totalUsage.promptTokens;
acc.completionTokens +=
promptResult.results.totalUsage.completionTokens;
acc.totalTokens += promptResult.results.totalUsage.totalTokens;
}
return acc;
},
{ promptTokens: 0, completionTokens: 0, totalTokens: 0 },
);

const finalResults = {
total,
passed,
failed,
duration: Date.now() - startTime,
promptResults,
totalUsage: totalUsage.totalTokens > 0 ? totalUsage : undefined,
};

return finalResults;
Expand Down
29 changes: 26 additions & 3 deletions packages/core/src/executors/VercelAIExecutor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -143,14 +143,37 @@ export function createVercelAIExecutor(config: ExecutorConfig): Executor {
output = resultObj.text || "";
}

// Extract usage information with better error handling
let usage: ExecutionResult["usage"] | undefined;

if (resultObj.usage) {
const usageObj = resultObj.usage as Record<string, unknown>;

// Support both AI SDK 5.0 format (inputTokens/outputTokens) and legacy format (promptTokens/completionTokens)
usage = {
promptTokens:
typeof usageObj.inputTokens === "number"
? usageObj.inputTokens
: typeof usageObj.promptTokens === "number"
? usageObj.promptTokens
: 0,
completionTokens:
typeof usageObj.outputTokens === "number"
? usageObj.outputTokens
: typeof usageObj.completionTokens === "number"
? usageObj.completionTokens
: 0,
totalTokens:
typeof usageObj.totalTokens === "number" ? usageObj.totalTokens : 0,
};
}

return {
output,
steps,
finishReason: (result as { finishReason: string })
.finishReason as ExecutionResult["finishReason"],
usage: (result as { usage?: unknown }).usage as
| ExecutionResult["usage"]
| undefined,
usage,
};
} catch (error) {
// Handle timeout or other errors
Expand Down
16 changes: 16 additions & 0 deletions packages/core/src/testing/PromptWithTests.ts
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,27 @@ export class PromptWithTests {
}
}

// Calculate total usage across all results
const totalUsage = allResults.reduce(
(acc, result) => {
if (result.usage) {
acc.promptTokens += result.usage.promptTokens;
acc.completionTokens += result.usage.completionTokens;
acc.totalTokens += result.usage.totalTokens;
}
return acc;
},
{ promptTokens: 0, completionTokens: 0, totalTokens: 0 },
);

const finalResults: TestResults = {
total: allResults.length,
passed: allResults.filter((r) => r.passed).length,
failed: allResults.filter((r) => !r.passed).length,
duration: Date.now() - startTime,
results: allResults,
executorResults,
totalUsage: totalUsage.totalTokens > 0 ? totalUsage : undefined,
};

// Track test run (fire-and-forget, non-blocking)
Expand Down Expand Up @@ -234,6 +248,7 @@ export class PromptWithTests {
error: executionResult.error,
expected: testCase.expect,
steps: executionResult.steps,
usage: executionResult.usage,
executor: executorMetadata,
};
}
Expand All @@ -252,6 +267,7 @@ export class PromptWithTests {
execution_id: executionId,
expected: testCase.expect,
steps: executionResult.steps,
usage: executionResult.usage,
executor: executorMetadata,
};
} catch (error) {
Expand Down
12 changes: 12 additions & 0 deletions packages/core/src/testing/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,12 @@ export interface EvalResult {
expected?: unknown;
/** Execution steps including tool calls */
steps?: ExecutionStep[];
/** Token usage information */
usage?: {
promptTokens: number;
completionTokens: number;
totalTokens: number;
};
/** Executor metadata (model and config) */
executor?: ExecutorMetadata;
}
Expand Down Expand Up @@ -89,6 +95,12 @@ export interface TestResults {
results: EvalResult[];
}
>;
/** Aggregated token usage across all tests */
totalUsage?: {
promptTokens: number;
completionTokens: number;
totalTokens: number;
};
}

/**
Expand Down