
Commit 86cd0bb

Implement token usage and latency comparison charts in eval Reporter (#15)
- Added a new `createBarChart` function for generating horizontal bar charts to visualize token usage and latency.
- Updated the `Reporter` class to display token usage and latency comparisons when multiple executors are present.
- Enhanced `TestRunner` and `PromptWithTests` to aggregate token usage data across tests and include it in the final results.
- Improved error handling in `VercelAIExecutor` for usage extraction, supporting both new and legacy formats.
1 parent: 75f0ba1 · commit: 86cd0bb

6 files changed: +203 additions, -22 deletions


packages/cli/src/output/Reporter.ts

Lines changed: 94 additions & 19 deletions
@@ -5,8 +5,8 @@
 import chalk from "chalk";
 import ora from "ora";
 import type { RunnerResults } from "../runner/TestRunner.js";
-import type { EvalResult } from "@marrakesh/core";
-import { formatDuration, formatDiff } from "./formatters.js";
+import type { EvalResult, ExecutionStep } from "@marrakesh/core";
+import { formatDuration, formatDiff, createBarChart } from "./formatters.js";
 
 /**
  * Extract tool names used from execution steps
@@ -155,25 +155,100 @@ export class Reporter {
 
     console.log();
 
-    // Print executor summary
-    console.log(chalk.bold("Executor Summary:"));
-    for (const executorName of executorNames) {
-      const executorResult = testResults.executorResults[executorName];
-      const total = executorResult.passed + executorResult.failed;
-      const passRate =
-        total > 0
-          ? ((executorResult.passed / total) * 100).toFixed(1)
-          : "0.0";
-      const color = executorResult.failed === 0 ? chalk.green : chalk.red;
+    if (executorNames.length > 1) {
+      // Display token usage bar chart for multi-executor comparison
+      console.log(chalk.bold("Token Usage Comparison:"));
+
+      // Prepare data for bar chart with input/output token breakdown
+      const tokenData = executorNames.map((name) => {
+        const executorResult = testResults.executorResults?.[name];
+        if (!executorResult) {
+          return {
+            label: name,
+            value: 0,
+            color: "gray",
+            formatValue: (value: number) =>
+              `${value.toLocaleString()} (0 in / 0 out)`,
+          };
+        }
 
-      console.log(
-        color(
-          ` ${executorName}: ${executorResult.passed}/${total} passed (${passRate}%)`,
-        ),
-      );
-    }
+        const executorUsage = executorResult.results.reduce(
+          (
+            acc: {
+              totalTokens: number;
+              promptTokens: number;
+              completionTokens: number;
+            },
+            result: EvalResult,
+          ) => {
+            if (result.usage) {
+              acc.totalTokens += result.usage.totalTokens;
+              acc.promptTokens += result.usage.promptTokens;
+              acc.completionTokens += result.usage.completionTokens;
+            }
+            return acc;
+          },
+          { totalTokens: 0, promptTokens: 0, completionTokens: 0 },
+        );
 
-    console.log();
+        // Use red color if there are failed tests, gray otherwise
+        const hasFailures = executorResult.failed > 0;
+        const color = hasFailures ? "red" : "gray";
+
+        return {
+          label: name,
+          value: executorUsage.totalTokens,
+          color,
+          formatValue: (value: number) =>
+            `${value.toLocaleString()} (${executorUsage.promptTokens} in / ${executorUsage.completionTokens} out)`,
+        };
+      });
+
+      // Display bar chart
+      const chart = createBarChart(tokenData, 40);
+      for (const line of chart) {
+        console.log(line);
+      }
+
+      // Add latency comparison chart
+      console.log();
+      console.log(chalk.bold("Latency Comparison:"));
+      const latencyData = executorNames.map((name) => {
+        const executorResult = testResults.executorResults?.[name];
+        if (!executorResult) {
+          return { label: name, value: 0, color: "gray" };
+        }
+
+        // Calculate average latency for this executor
+        const totalDuration = executorResult.results.reduce(
+          (acc: number, result: EvalResult) => {
+            return acc + result.duration;
+          },
+          0,
+        );
+        const averageLatency =
+          executorResult.results.length > 0
+            ? totalDuration / executorResult.results.length
+            : 0;
+
+        // Use red color if there are failed tests, white otherwise
+        const hasFailures = executorResult.failed > 0;
+        const color = hasFailures ? "red" : "white";
+
+        return {
+          label: name,
+          value: averageLatency,
+          color,
+          formatValue: (value: number) => formatDuration(value),
+        };
+      });
+
+      const latencyChart = createBarChart(latencyData, 40);
+      for (const line of latencyChart) {
+        console.log(line);
+      }
+      console.log();
+    }
   }
 
   // Print overall summary

packages/cli/src/output/formatters.ts

Lines changed: 34 additions & 0 deletions
@@ -2,6 +2,7 @@
  * Formatters - Utility functions for formatting output
  */
 
+import chalk from "chalk";
 import type { TestResults } from "@marrakesh/core";
 
 /**
@@ -58,3 +59,36 @@ export function formatError(error: Error): string {
 
   return output;
 }
+
+/**
+ * Generate a horizontal bar chart for comparison
+ * @param data - Array of { label, value, color, formatValue? } objects
+ * @param maxWidth - Maximum width of the bar in characters
+ * @returns Formatted bar chart string
+ */
+export function createBarChart(
+  data: Array<{
+    label: string;
+    value: number;
+    color?: string;
+    formatValue?: (value: number) => string;
+  }>,
+  maxWidth = 40,
+): string[] {
+  const maxValue = Math.max(...data.map((d) => d.value));
+  if (maxValue === 0) return [];
+
+  return data.map(({ label, value, color, formatValue }) => {
+    const barLength = Math.round((value / maxValue) * maxWidth);
+    // Use gradient character (▓) for 3D effect instead of solid block (█)
+    const bar = "▓".repeat(barLength);
+    const spaces = " ".repeat(maxWidth - barLength);
+    const percentage = ((value / maxValue) * 100).toFixed(0);
+
+    const coloredBar = color ? chalk[color](bar) : bar;
+    const displayValue = formatValue
+      ? formatValue(value)
+      : value.toLocaleString();
+    return ` ${label.padEnd(15)} ${coloredBar}${spaces} ${displayValue} (${percentage}%)`;
+  });
+}
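
For orientation, a minimal usage sketch of the new helper (the executor labels and token counts are invented, and the rendered output assumes an en-US locale for toLocaleString):

import { createBarChart } from "./formatters.js";

// Hypothetical values; a width of 20 keeps the example narrow.
const lines = createBarChart(
  [
    { label: "gpt-4o", value: 1200 },
    { label: "gpt-4o-mini", value: 800 },
  ],
  20,
);
lines.forEach((line) => console.log(line));
// gpt-4o fills all 20 cells (100%); gpt-4o-mini gets round(800/1200 * 20) = 13:
//  gpt-4o          ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ 1,200 (100%)
//  gpt-4o-mini     ▓▓▓▓▓▓▓▓▓▓▓▓▓        800 (67%)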

packages/cli/src/runner/TestRunner.ts

Lines changed: 21 additions & 0 deletions
@@ -19,6 +19,12 @@ export interface RunnerResults {
     promptName: string;
     results: TestResults;
   }>;
+  /** Aggregated token usage across all tests */
+  totalUsage?: {
+    promptTokens: number;
+    completionTokens: number;
+    totalTokens: number;
+  };
 }
 
 /**
@@ -183,12 +189,27 @@ export class TestRunner {
     const passed = promptResults.reduce((sum, r) => sum + r.results.passed, 0);
     const failed = promptResults.reduce((sum, r) => sum + r.results.failed, 0);
 
+    // Calculate total usage across all prompt results
+    const totalUsage = promptResults.reduce(
+      (acc, promptResult) => {
+        if (promptResult.results.totalUsage) {
+          acc.promptTokens += promptResult.results.totalUsage.promptTokens;
+          acc.completionTokens +=
+            promptResult.results.totalUsage.completionTokens;
+          acc.totalTokens += promptResult.results.totalUsage.totalTokens;
+        }
+        return acc;
+      },
+      { promptTokens: 0, completionTokens: 0, totalTokens: 0 },
+    );
+
     const finalResults = {
       total,
       passed,
       failed,
       duration: Date.now() - startTime,
       promptResults,
+      totalUsage: totalUsage.totalTokens > 0 ? totalUsage : undefined,
     };
 
     return finalResults;
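
One subtlety in the aggregation: `totalUsage` is left `undefined` rather than all-zero when nothing reported tokens, so consumers can tell "no usage data" apart from "zero tokens". A small sketch of that contract with made-up numbers (the `Usage` alias is ours, for illustration only):

type Usage = { promptTokens: number; completionTokens: number; totalTokens: number };

// Made-up per-test records; `undefined` models a test whose executor
// reported no usage at all.
const usages: Array<Usage | undefined> = [
  { promptTokens: 100, completionTokens: 40, totalTokens: 140 },
  undefined,
];

const sum = usages.reduce<Usage>(
  (acc, u) => {
    if (u) {
      acc.promptTokens += u.promptTokens;
      acc.completionTokens += u.completionTokens;
      acc.totalTokens += u.totalTokens;
    }
    return acc;
  },
  { promptTokens: 0, completionTokens: 0, totalTokens: 0 },
);

// Mirrors the commit's `totalUsage.totalTokens > 0 ? totalUsage : undefined`:
const totalUsage = sum.totalTokens > 0 ? sum : undefined;
// Here: { promptTokens: 100, completionTokens: 40, totalTokens: 140 }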

packages/core/src/executors/VercelAIExecutor.ts

Lines changed: 26 additions & 3 deletions
@@ -143,14 +143,37 @@ export function createVercelAIExecutor(config: ExecutorConfig): Executor {
         output = resultObj.text || "";
       }
 
+      // Extract usage information with better error handling
+      let usage: ExecutionResult["usage"] | undefined;
+
+      if (resultObj.usage) {
+        const usageObj = resultObj.usage as Record<string, unknown>;
+
+        // Support both AI SDK 5.0 format (inputTokens/outputTokens) and legacy format (promptTokens/completionTokens)
+        usage = {
+          promptTokens:
+            typeof usageObj.inputTokens === "number"
+              ? usageObj.inputTokens
+              : typeof usageObj.promptTokens === "number"
+                ? usageObj.promptTokens
+                : 0,
+          completionTokens:
+            typeof usageObj.outputTokens === "number"
+              ? usageObj.outputTokens
+              : typeof usageObj.completionTokens === "number"
+                ? usageObj.completionTokens
+                : 0,
+          totalTokens:
+            typeof usageObj.totalTokens === "number" ? usageObj.totalTokens : 0,
+        };
+      }
+
       return {
         output,
         steps,
         finishReason: (result as { finishReason: string })
           .finishReason as ExecutionResult["finishReason"],
-        usage: (result as { usage?: unknown }).usage as
-          | ExecutionResult["usage"]
-          | undefined,
+        usage,
       };
     } catch (error) {
       // Handle timeout or other errors
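
The dual-format handling above can be read as a small normalization step. A standalone sketch of the same logic, useful for unit testing it in isolation (the `normalizeUsage` name and `TokenUsage` interface are ours, not part of the commit):

// Shape produced by the executor's usage extraction.
interface TokenUsage {
  promptTokens: number;
  completionTokens: number;
  totalTokens: number;
}

// Prefer AI SDK 5.0 field names, fall back to legacy ones, default to 0.
function normalizeUsage(raw: Record<string, unknown>): TokenUsage {
  const num = (v: unknown): number | undefined =>
    typeof v === "number" ? v : undefined;
  return {
    promptTokens: num(raw.inputTokens) ?? num(raw.promptTokens) ?? 0,
    completionTokens: num(raw.outputTokens) ?? num(raw.completionTokens) ?? 0,
    totalTokens: num(raw.totalTokens) ?? 0,
  };
}

// Both shapes normalize to the same result:
normalizeUsage({ inputTokens: 10, outputTokens: 5, totalTokens: 15 });
normalizeUsage({ promptTokens: 10, completionTokens: 5, totalTokens: 15 });
// → { promptTokens: 10, completionTokens: 5, totalTokens: 15 }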

packages/core/src/testing/PromptWithTests.ts

Lines changed: 16 additions & 0 deletions
@@ -174,13 +174,27 @@ export class PromptWithTests {
       }
     }
 
+    // Calculate total usage across all results
+    const totalUsage = allResults.reduce(
+      (acc, result) => {
+        if (result.usage) {
+          acc.promptTokens += result.usage.promptTokens;
+          acc.completionTokens += result.usage.completionTokens;
+          acc.totalTokens += result.usage.totalTokens;
+        }
+        return acc;
+      },
+      { promptTokens: 0, completionTokens: 0, totalTokens: 0 },
+    );
+
     const finalResults: TestResults = {
       total: allResults.length,
       passed: allResults.filter((r) => r.passed).length,
       failed: allResults.filter((r) => !r.passed).length,
       duration: Date.now() - startTime,
       results: allResults,
       executorResults,
+      totalUsage: totalUsage.totalTokens > 0 ? totalUsage : undefined,
     };
 
     // Track test run (fire-and-forget, non-blocking)
@@ -234,6 +248,7 @@ export class PromptWithTests {
         error: executionResult.error,
         expected: testCase.expect,
         steps: executionResult.steps,
+        usage: executionResult.usage,
         executor: executorMetadata,
       };
     }
@@ -252,6 +267,7 @@ export class PromptWithTests {
       execution_id: executionId,
       expected: testCase.expect,
      steps: executionResult.steps,
+      usage: executionResult.usage,
      executor: executorMetadata,
     };
   } catch (error) {

packages/core/src/testing/types.ts

Lines changed: 12 additions & 0 deletions
@@ -62,6 +62,12 @@ export interface EvalResult {
   expected?: unknown;
   /** Execution steps including tool calls */
   steps?: ExecutionStep[];
+  /** Token usage information */
+  usage?: {
+    promptTokens: number;
+    completionTokens: number;
+    totalTokens: number;
+  };
   /** Executor metadata (model and config) */
   executor?: ExecutorMetadata;
 }
@@ -89,6 +95,12 @@ export interface TestResults {
       results: EvalResult[];
     }
   >;
+  /** Aggregated token usage across all tests */
+  totalUsage?: {
+    promptTokens: number;
+    completionTokens: number;
+    totalTokens: number;
+  };
 }
 
 /**
