From 759f68c30410c940129182d9e6bad8c2c37618e6 Mon Sep 17 00:00:00 2001 From: Andrew Ginns Date: Fri, 4 Jul 2025 15:42:49 +0000 Subject: [PATCH 1/3] feat: Swap to uv native build backend and add upgrade target in Makefile --- Makefile | 3 +++ pyproject.toml | 9 +++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 477824e..a47e39f 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,9 @@ install: uv sync npm install -g @mermaid-js/mermaid-cli +upgrade: + uv sync -U + lint: uv run ruff check . diff --git a/pyproject.toml b/pyproject.toml index 57b6f77..5b5c0cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,9 +25,10 @@ dependencies = [ "streamlit>=1.45.1", ] -[tool.hatch.build.targets.wheel] -packages = ["agents_mcp_usage"] +[tool.uv.build-backend] +module-name = "agents_mcp_usage" +module-root = "" [build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" +requires = ["uv_build>=0.7.19,<0.8.0"] +build-backend = "uv_build" From 5812b0ebdc6c8bb60cc900dd4e696284d22fe4b9 Mon Sep 17 00:00:00 2001 From: Andrew Ginns Date: Sun, 6 Jul 2025 07:34:27 +0000 Subject: [PATCH 2/3] chore: Update costs data --- .../evaluations/mermaid_evals/costs.json | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/agents_mcp_usage/evaluations/mermaid_evals/costs.json b/agents_mcp_usage/evaluations/mermaid_evals/costs.json index b3f7e0a..cea4117 100644 --- a/agents_mcp_usage/evaluations/mermaid_evals/costs.json +++ b/agents_mcp_usage/evaluations/mermaid_evals/costs.json @@ -195,6 +195,29 @@ ] } }, + "gemini-2.5-flash-preview-05-20": { + "friendly_name": "Gemini 2.5 Flash Preview (May)", + "input": [ + { + "up_to": "inf", + "price": 0.15 + } + ], + "output": { + "non_thinking": [ + { + "up_to": "inf", + "price": 0.6 + } + ], + "thinking": [ + { + "up_to": "inf", + "price": 3.5 + } + ] + } + }, "gemini-2.5-flash-preview": { "friendly_name": "Gemini 2.5 Flash Preview", "input": [ @@ -429,6 +452,74 @@ } ] } + }, + "bedrock:us.amazon.nova-micro-v1:0": { + "friendly_name": "Amazon Nova Micro", + "input": [ + { + "up_to": "inf", + "price": 0.035 + } + ], + "output": { + "default": [ + { + "up_to": "inf", + "price": 0.14 + } + ] + } + }, + "bedrock:us.amazon.nova-lite-v1:0": { + "friendly_name": "Amazon Nova Lite", + "input": [ + { + "up_to": "inf", + "price": 0.06 + } + ], + "output": { + "default": [ + { + "up_to": "inf", + "price": 0.24 + } + ] + } + }, + "bedrock:us.amazon.nova-pro-v1:0": { + "friendly_name": "Amazon Nova Pro", + "input": [ + { + "up_to": "inf", + "price": 0.80 + } + ], + "output": { + "default": [ + { + "up_to": "inf", + "price": 3.20 + } + ] + } + }, + "bedrock:us.amazon.nova-premier-v1:0": { + "friendly_name": "Amazon Nova Premier", + "input": [ + { + "up_to": "inf", + "price": 2.50 + } + ], + "output": { + "default": [ + { + "up_to": "inf", + "price": 12.50 + } + ] + } } } } \ No newline at end of file From 5c30203f1e39ffc1e1475e01e40b9d4c43ea564b Mon Sep 17 00:00:00 2001 From: Andrew Ginns Date: Sun, 6 Jul 2025 10:29:31 +0000 Subject: [PATCH 3/3] feat: Costs added to output json to remove runtime calculations --- .../mermaid_evals/results/docs/README.md | 105 +++++ .../docs/data_relationships_quickref.md | 143 ++++++ .../results/docs/output_json_schema.md | 285 ++++++++++++ .../results/docs/visualization_example.ts | 430 ++++++++++++++++++ .../scripts/preprocess_merbench_data.py | 150 +++++- 5 files changed, 1109 insertions(+), 4 deletions(-) create mode 100644 
agents_mcp_usage/evaluations/mermaid_evals/results/docs/README.md create mode 100644 agents_mcp_usage/evaluations/mermaid_evals/results/docs/data_relationships_quickref.md create mode 100644 agents_mcp_usage/evaluations/mermaid_evals/results/docs/output_json_schema.md create mode 100644 agents_mcp_usage/evaluations/mermaid_evals/results/docs/visualization_example.ts diff --git a/agents_mcp_usage/evaluations/mermaid_evals/results/docs/README.md b/agents_mcp_usage/evaluations/mermaid_evals/results/docs/README.md new file mode 100644 index 0000000..75d45c2 --- /dev/null +++ b/agents_mcp_usage/evaluations/mermaid_evals/results/docs/README.md @@ -0,0 +1,105 @@ +# Merbench Visualization Documentation + +This directory contains comprehensive documentation for understanding and using the Merbench evaluation output data to build visualizations. + +## Documentation Files + +### 1. [output_json_schema.md](./output_json_schema.md) +**Complete schema reference for the preprocessed JSON output** +- Detailed description of all data sections +- Filter system explanation (Model, Provider, Test Group) +- Implementation guidelines for AND logic filtering +- Visualization examples and best practices + +### 2. [visualization_example.ts](./visualization_example.ts) +**Practical TypeScript implementation example** +- Complete working example of a cost vs performance scatter plot +- Filter application with real code +- Statistics update implementation +- Event handler setup for interactive filtering + +### 3. [data_relationships_quickref.md](./data_relationships_quickref.md) +**Quick reference for data relationships and common operations** +- Key relationships between data sections +- Common query patterns +- Performance optimization tips +- Data validation checks + +## Key Concepts + +### Data Flow +``` +CSV Input → preprocess_merbench_data.py → JSON Output → Visualizations + ↓ + Cost Calculations + (from costs.json) +``` + +### Filter Types (AND Logic) +1. **Model Filter**: Select specific models +2. **Provider Filter**: Google, Amazon, OpenAI, Other +3. **Test Group Filter**: easy, medium, hard + +### Primary Data Sections +- **raw_data**: Source for all filtering and aggregation +- **leaderboard**: Pre-aggregated model rankings +- **pareto_data**: Performance vs efficiency metrics +- **test_groups_data**: Performance by difficulty +- **cost_breakdown_data**: Detailed cost analysis +- **failure_analysis_data**: Failure reason counts + +## Quick Start + +1. **Load the JSON data** + ```javascript + const data = await fetch('processed_results.json').then(r => r.json()); + ``` + +2. **Apply filters to raw_data** + ```javascript + const filtered = data.raw_data.filter(row => + row.provider === "Google" && row.test_group === "easy" + ); + ``` + +3. **Recalculate aggregates** + ```javascript + const modelStats = {}; + filtered.forEach(row => { + if (!modelStats[row.Model]) { + modelStats[row.Model] = {runs: 0, success: 0, cost: 0}; + } + modelStats[row.Model].runs++; + modelStats[row.Model].success += row.Score_MermaidDiagramValid; + modelStats[row.Model].cost += row.total_cost; + }); + ``` + +4. 
**Create visualizations** + - Use pre-aggregated data for initial views + - Recalculate from filtered raw_data when filters change + - Update all related visualizations together + +## Cost Calculation Notes + +- Costs are calculated per token using tiered pricing from `costs.json` +- Failed tests (Score_UsageLimitNotExceeded = 0) have $0 cost +- Input and output costs are tracked separately +- Thinking tokens may have different pricing than regular output tokens + +## Visualization Types + +1. **Leaderboard Table**: Model rankings by success rate +2. **Pareto Scatter Plot**: Performance vs cost/duration/tokens +3. **Grouped Bar Charts**: Performance by test difficulty +4. **Stacked Bar Charts**: Failure reasons, cost breakdown +5. **Heatmaps**: Model × difficulty performance matrix + +## Tips for Developers + +- Always start filtering from `raw_data` +- Cache filter results for performance +- Use the `provider` field for color coding +- Show active filters in the UI +- Handle empty filter results gracefully +- Consider log scale for cost axes due to wide ranges \ No newline at end of file diff --git a/agents_mcp_usage/evaluations/mermaid_evals/results/docs/data_relationships_quickref.md b/agents_mcp_usage/evaluations/mermaid_evals/results/docs/data_relationships_quickref.md new file mode 100644 index 0000000..a463870 --- /dev/null +++ b/agents_mcp_usage/evaluations/mermaid_evals/results/docs/data_relationships_quickref.md @@ -0,0 +1,143 @@ +# Data Relationships Quick Reference + +## Key Relationships + +### Primary Keys and Groupings + +1. **Model** - Primary identifier across all data sections +2. **test_group** - Secondary grouping (easy, medium, hard) +3. **provider** - Derived from Model name (Google, Amazon, etc.) + +### Data Section Dependencies + +``` +raw_data (source) + ↓ +├── leaderboard (group by Model) +├── pareto_data (group by Model) +├── test_groups_data (group by Model + test_group) +├── failure_analysis_data (group by Model, count failures) +└── cost_breakdown_data (group by Model + test_group) +``` + +## Common Queries and Aggregations + +### 1. Get Model Performance Summary +```javascript +// From raw_data +const modelSummary = rawData + .filter(r => r.Model === "gemini-2.5-pro") + .reduce((acc, r) => ({ + successRate: acc.successRate + r.Score_MermaidDiagramValid, + totalCost: acc.totalCost + r.total_cost, + count: acc.count + 1 + }), {successRate: 0, totalCost: 0, count: 0}); + +modelSummary.avgSuccessRate = modelSummary.successRate / modelSummary.count * 100; +``` + +### 2. Filter by Multiple Conditions +```javascript +// Get Amazon models on hard tests that succeeded +const filtered = rawData.filter(r => + r.provider === "Amazon" && + r.test_group === "hard" && + r.Score_MermaidDiagramValid === 1 +); +``` + +### 3. Calculate Cost Breakdown by Test Group +```javascript +// Group costs by difficulty +const costByDifficulty = {}; +["easy", "medium", "hard"].forEach(group => { + const groupData = rawData.filter(r => r.test_group === group); + costByDifficulty[group] = { + avgCost: groupData.reduce((sum, r) => sum + r.total_cost, 0) / groupData.length, + totalCost: groupData.reduce((sum, r) => sum + r.total_cost, 0) + }; +}); +``` + +## Pre-Aggregated vs. 
Raw Data Usage + +### Use Pre-Aggregated Data When: +- Displaying initial unfiltered views +- Performance is critical +- Standard aggregations are sufficient + +### Recalculate from Raw Data When: +- Filters are applied +- Custom aggregations needed +- Combining multiple filter conditions + +## Filter Application Order + +1. **Start with raw_data** +2. **Apply filters** (Model AND Provider AND TestGroup) +3. **Recalculate aggregations** +4. **Update visualizations** + +## Cost Calculation Rules + +- **Normal tests**: Cost = (input_tokens/1M × input_price) + (output_tokens/1M × output_price) +- **Failed tests** (Score_UsageLimitNotExceeded = 0): Cost = $0 +- **Tiered pricing**: Price depends on total token count + +## Data Validation Checks + +```javascript +// Ensure data consistency +function validateData(jsonData) { + // Check if model counts match + const rawModels = new Set(jsonData.raw_data.map(r => r.Model)); + const leaderboardModels = new Set(jsonData.leaderboard.map(l => l.Model)); + + console.assert(rawModels.size === leaderboardModels.size, + "Model count mismatch between raw and leaderboard"); + + // Verify cost calculations + jsonData.raw_data.forEach(row => { + if (row.Score_UsageLimitNotExceeded === 0) { + console.assert(row.total_cost === 0, + `Failed test should have 0 cost: ${row.Model}`); + } + }); +} +``` + +## Performance Optimization Tips + +1. **Cache Filter Results** + ```javascript + const filterCache = new Map(); + function getCachedFilter(filterKey, rawData, filters) { + if (!filterCache.has(filterKey)) { + filterCache.set(filterKey, applyFilters(rawData, filters)); + } + return filterCache.get(filterKey); + } + ``` + +2. **Use Indexed Lookups** + ```javascript + // Pre-index by model for fast lookups + const modelIndex = {}; + rawData.forEach(row => { + if (!modelIndex[row.Model]) modelIndex[row.Model] = []; + modelIndex[row.Model].push(row); + }); + ``` + +3. **Batch Updates** + ```javascript + // Update all visualizations at once + function updateAllVisualizations(filteredData) { + requestAnimationFrame(() => { + updateLeaderboard(filteredData); + updateParetoPlot(filteredData); + updateCostBreakdown(filteredData); + updateFailureAnalysis(filteredData); + }); + } + ``` \ No newline at end of file diff --git a/agents_mcp_usage/evaluations/mermaid_evals/results/docs/output_json_schema.md b/agents_mcp_usage/evaluations/mermaid_evals/results/docs/output_json_schema.md new file mode 100644 index 0000000..c5bbc18 --- /dev/null +++ b/agents_mcp_usage/evaluations/mermaid_evals/results/docs/output_json_schema.md @@ -0,0 +1,285 @@ +# Merbench Output JSON Schema Documentation + +## Overview + +The preprocessed JSON output from `preprocess_merbench_data.py` contains structured evaluation data for LLM models tested on Mermaid diagram generation tasks. This document describes the schema and how to build visualizations with filtering capabilities. + +## Filtering System + +The visualization system supports three filter dimensions that work as logical AND conditions: + +1. **Test Group Filter**: `easy`, `medium`, `hard` +2. **Provider Filter**: `Google`, `Amazon`, `OpenAI`, `Other` +3. **Model Filter**: Individual model selection (e.g., `gemini-2.5-pro`, `bedrock:us.amazon.nova-premier-v1:0`) + +When filters are applied, only data matching ALL selected criteria is included in visualizations. + +## JSON Schema Structure + +### 1. 
Stats Object +```json +{ + "stats": { + "total_runs": 180, + "models_evaluated": 12, + "test_cases": 3, + "test_groups": ["easy", "medium", "hard"], + "providers": ["Google", "Amazon", "OpenAI"], + "models": ["model1", "model2", ...], + "total_cost": 11.899786, + "avg_cost_per_run": 0.066110 + } +} +``` + +**Usage**: Display overall evaluation metrics. Update these counts when filters are applied by recalculating from filtered raw_data. + +### 2. Leaderboard Array +```json +{ + "leaderboard": [ + { + "Model": "gemini-2.5-pro-preview-06-05", + "Success_Rate": 40.0, + "Avg_Duration": 46.887859, + "Avg_Tokens": 8693.733333, + "Avg_Cost": 0.045514, + "Avg_Input_Cost": 0.005918, + "Avg_Output_Cost": 0.039596, + "Runs": 15, + "Provider": "Google" + } + ] +} +``` + +**Usage**: Main leaderboard visualization. When filters are applied: +1. Filter `raw_data` based on selected criteria +2. Recalculate aggregates (group by Model) +3. Sort by Success_Rate descending + +**Visualization**: Table or bar chart showing model rankings. + +### 3. Pareto Data Array +```json +{ + "pareto_data": [ + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Success_Rate": 4.0, + "Duration": 14.524447, + "total_tokens": 2350.2, + "total_cost": 0.000080, + "input_cost": 0.000034, + "output_cost": 0.000046, + "Metric_request_tokens": 1337.8, + "Metric_response_tokens": 1012.4 + } + ] +} +``` + +**Usage**: Create Pareto frontier plots showing trade-offs between: +- Success Rate vs Cost +- Success Rate vs Duration +- Success Rate vs Tokens + +**Filtering**: Recalculate from filtered raw_data, then plot. + +### 4. Test Groups Data Array +```json +{ + "test_groups_data": [ + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "test_group": "easy", + "Score_MermaidDiagramValid": 0.2, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_cost": 0.000114, + "input_cost": 0.000049, + "output_cost": 0.000066, + "total_tokens": 3377.2 + } + ] +} +``` + +**Usage**: Create grouped bar charts showing performance by difficulty level. + +**Filtering**: +- Filter by Model and Provider +- Group remaining data by test_group +- Show comparison across difficulty levels + +### 5. Failure Analysis Data Array +```json +{ + "failure_analysis_data": [ + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Invalid Diagram": 13, + "MCP Tool Failure": 0, + "Usage Limit Exceeded": 0 + } + ] +} +``` + +**Usage**: Stacked bar chart showing failure reasons by model. + +**Filtering**: When filters are applied, recalculate failure counts from filtered raw_data. + +### 6. Cost Breakdown Data Array +```json +{ + "cost_breakdown_data": [ + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "test_group": "easy", + "avg_total_cost": 0.000114, + "sum_total_cost": 0.00057, + "run_count": 5, + "avg_input_cost": 0.000049, + "sum_input_cost": 0.000243, + "avg_output_cost": 0.000066, + "sum_output_cost": 0.000328 + } + ] +} +``` + +**Usage**: Detailed cost analysis by model and test difficulty. + +**Visualizations**: +- Stacked bar chart of input vs output costs +- Heatmap of costs by model × difficulty +- Cost comparison across test groups + +### 7. 
Raw Data Array +```json +{ + "raw_data": [ + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "simple_easy", + "test_group": "easy", + "Duration": 18.090633, + "Score_MermaidDiagramValid": 0, + "Score_UsageLimitNotExceeded": 0, + "Score_UsedBothMCPTools": 1, + "total_tokens": 127419, + "provider": "Google", + "Metric_request_tokens": 25483, + "Metric_response_tokens": 101936, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + } + ] +} +``` + +**Usage**: Source data for all filtering and aggregation operations. + +**Key fields for filtering**: +- `Model`: For model filter +- `provider`: For provider filter +- `test_group`: For difficulty filter +- `Score_UsageLimitNotExceeded`: When 0, cost is 0 (failed tests) + +## Implementing Filters + +### Filter Logic (Pseudo-code) +```javascript +function applyFilters(rawData, filters) { + return rawData.filter(row => { + // All conditions must be true (AND logic) + const modelMatch = !filters.models.length || + filters.models.includes(row.Model); + const providerMatch = !filters.providers.length || + filters.providers.includes(row.provider); + const testGroupMatch = !filters.testGroups.length || + filters.testGroups.includes(row.test_group); + + return modelMatch && providerMatch && testGroupMatch; + }); +} +``` + +### Updating Visualizations After Filtering + +1. **Filter raw_data** using the AND logic +2. **Recalculate aggregates** from filtered data: + - Leaderboard: Group by Model, calculate means + - Test groups: Group by Model + test_group + - Failure analysis: Count failures per type + - Cost breakdown: Sum and average costs +3. **Update visualizations** with new data + +## Visualization Examples + +### 1. Cost vs Performance Scatter Plot +- X-axis: Average cost (from pareto_data) +- Y-axis: Success rate (from pareto_data) +- Color: Provider +- Size: Number of runs +- Filters affect which models appear + +### 2. Performance by Difficulty Grouped Bar Chart +- X-axis: Models +- Y-axis: Success rate +- Groups: easy, medium, hard (different bars) +- Filters reduce which models/groups are shown + +### 3. Cost Breakdown Stacked Bar +- X-axis: Models +- Y-axis: Cost +- Stack: Input cost vs Output cost +- Facet by: test_group (optional) + +### 4. Failure Analysis Heatmap +- Rows: Models +- Columns: Failure types +- Values: Failure counts +- Color intensity: Number of failures + +## Configuration Object +```json +{ + "config": { + "title": "🧜‍♀️ Merbench - LLM Evaluation", + "description": "...", + "primary_metric": { + "name": "Success_Rate", + "label": "Success Rate (%)" + } + } +} +``` + +**Usage**: UI configuration and metric definitions for consistent labeling. + +## Best Practices + +1. **Always filter on raw_data first**, then recalculate aggregates +2. **Cache filtered results** to avoid recalculation on every interaction +3. **Show filter status** in UI (e.g., "Showing 5 of 12 models") +4. **Handle empty results** gracefully when filters exclude all data +5. **Cost considerations**: Remember that failed tests (Score_UsageLimitNotExceeded = 0) have $0 cost + +## Example Filter Combinations + +1. **"Show only Amazon models on hard tests"** + - Provider filter: ["Amazon"] + - Test group filter: ["hard"] + - Result: Only Amazon model performance on hard difficulty + +2. **"Compare Google models across all difficulties"** + - Provider filter: ["Google"] + - Test group filter: [] (empty = all) + - Result: All Google models, all difficulties + +3. 
**"Show specific model performance"** + - Model filter: ["gemini-2.5-pro"] + - Result: Single model data across all test groups \ No newline at end of file diff --git a/agents_mcp_usage/evaluations/mermaid_evals/results/docs/visualization_example.ts b/agents_mcp_usage/evaluations/mermaid_evals/results/docs/visualization_example.ts new file mode 100644 index 0000000..7995a56 --- /dev/null +++ b/agents_mcp_usage/evaluations/mermaid_evals/results/docs/visualization_example.ts @@ -0,0 +1,430 @@ +/** + * Example: Creating a Cost vs Performance Scatter Plot with Filtering (TypeScript) + * This example demonstrates how to use the JSON output data with filters + */ + +// Type definitions for the data structures + +interface RawDataRow { + Model: string; + Case: string; + test_group: 'easy' | 'medium' | 'hard'; + Duration: number; + Score_MermaidDiagramValid: 0 | 1; + Score_UsageLimitNotExceeded: 0 | 1; + Score_UsedBothMCPTools: 0 | 1; + total_tokens: number; + provider: 'Google' | 'Amazon' | 'OpenAI' | 'Other'; + Metric_request_tokens: number; + Metric_response_tokens: number; + total_cost: number; + input_cost: number; + output_cost: number; +} + +interface LeaderboardEntry { + Model: string; + Success_Rate: number; + Avg_Duration: number; + Avg_Tokens: number; + Avg_Cost: number; + Avg_Input_Cost: number; + Avg_Output_Cost: number; + Runs: number; + Provider: string; +} + +interface ParetoDataEntry { + Model: string; + Success_Rate: number; + Duration: number; + total_tokens: number; + total_cost: number; + input_cost: number; + output_cost: number; + Metric_request_tokens: number; + Metric_response_tokens: number; +} + +interface TestGroupDataEntry { + Model: string; + test_group: 'easy' | 'medium' | 'hard'; + Score_MermaidDiagramValid: number; + Score_UsageLimitNotExceeded: number; + Score_UsedBothMCPTools: number; + total_cost: number; + input_cost: number; + output_cost: number; + total_tokens: number; +} + +interface FailureAnalysisEntry { + Model: string; + 'Invalid Diagram': number; + 'MCP Tool Failure': number; + 'Usage Limit Exceeded': number; +} + +interface CostBreakdownEntry { + Model: string; + test_group: 'easy' | 'medium' | 'hard'; + avg_total_cost: number; + sum_total_cost: number; + run_count: number; + avg_input_cost: number; + sum_input_cost: number; + avg_output_cost: number; + sum_output_cost: number; +} + +interface Stats { + total_runs: number; + models_evaluated: number; + test_cases: number; + test_groups: Array<'easy' | 'medium' | 'hard'>; + providers: Array<'Google' | 'Amazon' | 'OpenAI' | 'Other'>; + models: string[]; + total_cost: number; + avg_cost_per_run: number; +} + +interface Config { + title: string; + description: string; + primary_metric: { + name: string; + label: string; + }; +} + +interface JsonData { + stats: Stats; + leaderboard: LeaderboardEntry[]; + pareto_data: ParetoDataEntry[]; + test_groups_data: TestGroupDataEntry[]; + failure_analysis_data: FailureAnalysisEntry[]; + cost_breakdown_data: CostBreakdownEntry[]; + raw_data: RawDataRow[]; + config: Config; +} + +interface Filters { + models: string[]; + providers: Array<'Google' | 'Amazon' | 'OpenAI' | 'Other'>; + testGroups: Array<'easy' | 'medium' | 'hard'>; +} + +interface AggregatedModel { + model: string; + provider: string; + avgCost: number; + avgTokens: number; + successRate: number; + avgDuration: number; + runCount: number; +} + +interface ModelGroup { + model: string; + provider: string; + runs: number; + totalCost: number; + totalTokens: number; + successCount: number; + totalDuration: 
number;
+}
+
+interface PlotlyTrace {
+  x: number[];
+  y: number[];
+  text: string[];
+  marker: {
+    size: number[];
+  };
+  mode: string;
+  type: string;
+  name: string;
+  textposition: string;
+}
+
+interface PlotlyLayout {
+  title: string;
+  xaxis: {
+    title: string;
+    type?: string;
+    tickformat?: string;
+  };
+  yaxis: {
+    title: string;
+    range?: number[];
+  };
+  hovermode: string;
+  showlegend: boolean;
+  annotations?: Array<{
+    x: number;
+    y: number;
+    xref: string;
+    yref: string;
+    text: string;
+    showarrow: boolean;
+    font: {
+      size: number;
+      color: string;
+    };
+  }>;
+}
+
+interface StatsDisplay {
+  totalRuns: number;
+  modelsShown: number;
+  totalCost: number;
+  avgCostPerRun: number;
+}
+
+// Sample filter state
+const filters: Filters = {
+  models: [], // Empty array means "all models"
+  providers: ["Google", "Amazon"], // Only Google and Amazon models
+  testGroups: ["easy", "medium"] // Exclude "hard" tests
+};
+
+// Function to apply filters to raw data
+function applyFilters(rawData: RawDataRow[], filters: Filters): RawDataRow[] {
+  return rawData.filter((row: RawDataRow): boolean => {
+    // Check each filter condition (AND logic)
+    const modelMatch: boolean = filters.models.length === 0 ||
+      filters.models.includes(row.Model);
+
+    const providerMatch: boolean = filters.providers.length === 0 ||
+      filters.providers.includes(row.provider);
+
+    const testGroupMatch: boolean = filters.testGroups.length === 0 ||
+      filters.testGroups.includes(row.test_group);
+
+    return modelMatch && providerMatch && testGroupMatch;
+  });
+}
+
+// Function to calculate aggregated data from filtered raw data
+function calculateAggregates(filteredData: RawDataRow[]): AggregatedModel[] {
+  // Group by model
+  const modelGroups: Record<string, ModelGroup> = {};
+
+  filteredData.forEach((row: RawDataRow): void => {
+    if (!modelGroups[row.Model]) {
+      modelGroups[row.Model] = {
+        model: row.Model,
+        provider: row.provider,
+        runs: 0,
+        totalCost: 0,
+        totalTokens: 0,
+        successCount: 0,
+        totalDuration: 0
+      };
+    }
+
+    const group: ModelGroup = modelGroups[row.Model];
+    group.runs++;
+    group.totalCost += row.total_cost;
+    group.totalTokens += row.total_tokens;
+    group.successCount += row.Score_MermaidDiagramValid;
+    group.totalDuration += row.Duration;
+  });
+
+  // Calculate averages
+  return Object.values(modelGroups).map((group: ModelGroup): AggregatedModel => ({
+    model: group.model,
+    provider: group.provider,
+    avgCost: group.totalCost / group.runs,
+    avgTokens: group.totalTokens / group.runs,
+    successRate: (group.successCount / group.runs) * 100,
+    avgDuration: group.totalDuration / group.runs,
+    runCount: group.runs
+  }));
+}
+
+// Declare Plotly for global access
+declare const Plotly: {
+  newPlot(elementId: string, traces: PlotlyTrace[], layout: PlotlyLayout): void;
+};
+
+// Function to create Plotly scatter plot
+function createCostVsPerformancePlot(jsonData: JsonData, filters: Filters): void {
+  // Step 1: Apply filters to raw data
+  const filteredRawData: RawDataRow[] = applyFilters(jsonData.raw_data, filters);
+
+  // Step 2: Calculate aggregates
+  const aggregatedData: AggregatedModel[] = calculateAggregates(filteredRawData);
+
+  // Step 3: Prepare data for Plotly
+  const traces: Record<string, PlotlyTrace> = {};
+
+  // Group by provider for different colors
+  aggregatedData.forEach((model: AggregatedModel): void => {
+    if (!traces[model.provider]) {
+      traces[model.provider] = {
+        x: [],
+        y: [],
+        text: [],
+        marker: { size: [] },
+        mode: 'markers+text',
+        type: 'scatter',
+        name: model.provider,
+        textposition: 'top center'
+      
};
+    }
+
+    traces[model.provider].x.push(model.avgCost);
+    traces[model.provider].y.push(model.successRate);
+    traces[model.provider].text.push(model.model);
+    traces[model.provider].marker.size.push(Math.sqrt(model.runCount) * 10);
+  });
+
+  // Step 4: Create the plot
+  const layout: PlotlyLayout = {
+    title: 'Model Performance vs Cost (Filtered)',
+    xaxis: {
+      title: 'Average Cost ($)',
+      type: 'log', // Log scale for better cost distribution
+      tickformat: '$.4f'
+    },
+    yaxis: {
+      title: 'Success Rate (%)',
+      range: [0, 105]
+    },
+    hovermode: 'closest',
+    showlegend: true,
+    annotations: [{
+      x: 0.5,
+      y: -0.15,
+      xref: 'paper',
+      yref: 'paper',
+      text: `Filters: ${filters.providers.join(', ')} providers | ${filters.testGroups.join(', ')} tests`,
+      showarrow: false,
+      font: { size: 12, color: 'gray' }
+    }]
+  };
+
+  Plotly.newPlot('cost-performance-plot', Object.values(traces), layout);
+}
+
+// Function to update summary statistics
+function updateStats(jsonData: JsonData, filteredRawData: RawDataRow[]): void {
+  const stats: StatsDisplay = {
+    totalRuns: filteredRawData.length,
+    modelsShown: new Set(filteredRawData.map((r: RawDataRow) => r.Model)).size,
+    totalCost: filteredRawData.reduce((sum: number, r: RawDataRow) => sum + r.total_cost, 0),
+    avgCostPerRun: 0
+  };
+
+  stats.avgCostPerRun = stats.totalCost / stats.totalRuns || 0;
+
+  // Update UI elements (with null checks)
+  const totalRunsElement = document.getElementById('stat-total-runs');
+  const modelsElement = document.getElementById('stat-models');
+  const totalCostElement = document.getElementById('stat-total-cost');
+  const avgCostElement = document.getElementById('stat-avg-cost');
+
+  if (totalRunsElement) {
+    totalRunsElement.textContent = stats.totalRuns.toString();
+  }
+
+  if (modelsElement) {
+    modelsElement.textContent = `${stats.modelsShown} of ${jsonData.stats.models_evaluated}`;
+  }
+
+  if (totalCostElement) {
+    totalCostElement.textContent = `$${stats.totalCost.toFixed(2)}`;
+  }
+
+  if (avgCostElement) {
+    avgCostElement.textContent = `$${stats.avgCostPerRun.toFixed(4)}`;
+  }
+}
+
+// Helper function to get selected values from multi-select element
+function getSelectedValues(element: HTMLSelectElement): string[] {
+  return Array.from(element.selectedOptions).map((option: HTMLOptionElement) => option.value);
+}
+
+// Main function to initialize visualization
+async function initializeVisualization(): Promise<void> {
+  try {
+    // Load the JSON data
+    const response: Response = await fetch('path/to/processed_results.json');
+
+    if (!response.ok) {
+      throw new Error(`HTTP error! 
status: ${response.status}`); + } + + const jsonData: JsonData = await response.json(); + + // Apply initial filters and create plot + createCostVsPerformancePlot(jsonData, filters); + + // Update statistics + const filteredData: RawDataRow[] = applyFilters(jsonData.raw_data, filters); + updateStats(jsonData, filteredData); + + // Set up filter change handlers + const providerFilterElement = document.getElementById('provider-filter') as HTMLSelectElement; + const testGroupFilterElement = document.getElementById('test-group-filter') as HTMLSelectElement; + + if (providerFilterElement) { + providerFilterElement.addEventListener('change', (e: Event): void => { + const target = e.target as HTMLSelectElement; + filters.providers = getSelectedValues(target) as Array<'Google' | 'Amazon' | 'OpenAI' | 'Other'>; + createCostVsPerformancePlot(jsonData, filters); + updateStats(jsonData, applyFilters(jsonData.raw_data, filters)); + }); + } + + if (testGroupFilterElement) { + testGroupFilterElement.addEventListener('change', (e: Event): void => { + const target = e.target as HTMLSelectElement; + filters.testGroups = getSelectedValues(target) as Array<'easy' | 'medium' | 'hard'>; + createCostVsPerformancePlot(jsonData, filters); + updateStats(jsonData, applyFilters(jsonData.raw_data, filters)); + }); + } + + // Set up model filter if it exists + const modelFilterElement = document.getElementById('model-filter') as HTMLSelectElement; + if (modelFilterElement) { + modelFilterElement.addEventListener('change', (e: Event): void => { + const target = e.target as HTMLSelectElement; + filters.models = getSelectedValues(target); + createCostVsPerformancePlot(jsonData, filters); + updateStats(jsonData, applyFilters(jsonData.raw_data, filters)); + }); + } + + } catch (error) { + console.error('Error initializing visualization:', error); + + // Show error message to user + const errorElement = document.getElementById('error-message'); + if (errorElement) { + errorElement.textContent = `Failed to load data: ${error instanceof Error ? 
error.message : 'Unknown error'}`; + errorElement.style.display = 'block'; + } + } +} + +// Initialize on page load +document.addEventListener('DOMContentLoaded', initializeVisualization); + +// Export types for use in other modules +export type { + RawDataRow, + JsonData, + Filters, + AggregatedModel, + LeaderboardEntry, + ParetoDataEntry, + TestGroupDataEntry, + FailureAnalysisEntry, + CostBreakdownEntry, + Stats, + Config +}; \ No newline at end of file diff --git a/agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py b/agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py index a71d13a..8def71d 100644 --- a/agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py +++ b/agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py @@ -5,6 +5,7 @@ import argparse from datetime import datetime from pathlib import Path +from typing import Dict, List, Any # Add parent directory to path to import modules sys.path.append(str(Path(__file__).parent.parent)) @@ -13,6 +14,105 @@ from agents_mcp_usage.evaluations.mermaid_evals.schemas import DashboardConfig from agents_mcp_usage.utils import get_project_root +def load_model_costs(file_path: Path) -> Dict[str, Any]: + """Load model costs from JSON file.""" + with open(file_path, 'r') as f: + data = json.load(f) + + # Convert "inf" strings to actual infinity + def convert_inf_strings(obj: Any) -> Any: + if isinstance(obj, dict): + return {k: convert_inf_strings(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [convert_inf_strings(v) for v in obj] + elif obj == "inf": + return float("inf") + return obj + + # Extract model_costs from the loaded data + model_costs = data.get('model_costs', {}) + return convert_inf_strings(model_costs) + +def get_price_for_tokens(token_count: int, price_tiers: List[Dict]) -> float: + """Find the correct price for a given number of tokens from a list of tiers.""" + for tier in price_tiers: + if token_count <= tier["up_to"]: + return tier["price"] + return price_tiers[-1]["price"] # Fallback to the highest tier price + +def calculate_costs(df: pd.DataFrame, cost_config: Dict, config: DashboardConfig) -> pd.DataFrame: + """Calculate input, output, and total costs for each run based on tiered pricing. + + Sets cost to 0 if Score_UsageLimitNotExceeded == 0. 
+ """ + df_with_costs = df.copy() + + # Get cost calculation config from dashboard config + cost_calc_config = config.cost_calculation + input_token_cols = cost_calc_config.input_token_cols + output_token_cols = cost_calc_config.output_token_cols + + # Initialize cost columns + df_with_costs["input_cost"] = 0.0 + df_with_costs["output_cost"] = 0.0 + df_with_costs["total_cost"] = 0.0 + + for idx, row in df_with_costs.iterrows(): + # Check if usage limit was exceeded - if so, cost is 0 + if row.get("Score_UsageLimitNotExceeded", 1) == 0: + continue + + model = row.get("Model") + model_costs = cost_config.get(model) + + if not model_costs: + continue + + try: + # Calculate token counts + input_tokens = sum(row.get(col, 0) or 0 for col in input_token_cols) + output_tokens = sum(row.get(col, 0) or 0 for col in output_token_cols) + thinking_tokens = row.get("thinking_tokens", 0) or 0 + non_thinking_output_tokens = output_tokens - thinking_tokens + + total_tokens = input_tokens + output_tokens + + # Calculate input cost + input_price_tiers = model_costs.get("input", []) + if input_price_tiers: + input_price = get_price_for_tokens(total_tokens, input_price_tiers) + input_cost = (input_tokens / 1_000_000) * input_price + else: + input_cost = 0.0 + + # Calculate output cost + output_cost = 0.0 + output_pricing = model_costs.get("output", {}) + + if "thinking" in output_pricing and thinking_tokens > 0: + thinking_price_tiers = output_pricing["thinking"] + thinking_price = get_price_for_tokens(total_tokens, thinking_price_tiers) + output_cost += (thinking_tokens / 1_000_000) * thinking_price + + if "non_thinking" in output_pricing and non_thinking_output_tokens > 0: + non_thinking_price_tiers = output_pricing["non_thinking"] + non_thinking_price = get_price_for_tokens(total_tokens, non_thinking_price_tiers) + output_cost += (non_thinking_output_tokens / 1_000_000) * non_thinking_price + + elif "default" in output_pricing: + default_price_tiers = output_pricing["default"] + default_price = get_price_for_tokens(total_tokens, default_price_tiers) + output_cost += (output_tokens / 1_000_000) * default_price + + df_with_costs.at[idx, "input_cost"] = input_cost + df_with_costs.at[idx, "output_cost"] = output_cost + df_with_costs.at[idx, "total_cost"] = input_cost + output_cost + + except (TypeError, KeyError, IndexError) as e: + print(f"Cost calculation error for model {model} at row {idx}: {e}") + + return df_with_costs + def parse_metric_details(metric_details_str): """Safely parse JSON string from Metric_details column.""" if pd.isna(metric_details_str) or not metric_details_str: @@ -51,6 +151,11 @@ def process_csv_for_static_site(csv_path): # Load configuration config = DashboardConfig(**DEFAULT_CONFIG) + # Load cost configuration + project_root = get_project_root() + costs_json_path = project_root / "agents_mcp_usage" / "evaluations" / "mermaid_evals" / "costs.json" + cost_config = load_model_costs(costs_json_path) if costs_json_path.exists() else {} + # Read CSV df = pd.read_csv(csv_path) @@ -95,16 +200,22 @@ def extract_provider(model_name): df["provider"] = df["Model"].apply(extract_provider) + # Calculate costs for each row + df = calculate_costs(df, cost_config, config) + # Create leaderboard data leaderboard = df.groupby("Model").agg({ "Success_Rate": "mean", "Duration": "mean", "total_tokens": "mean", + "total_cost": "mean", + "input_cost": "mean", + "output_cost": "mean", "Case": "count", # Number of runs "provider": "first" }).reset_index() - leaderboard.columns = ["Model", "Success_Rate", 
"Avg_Duration", "Avg_Tokens", "Runs", "Provider"] + leaderboard.columns = ["Model", "Success_Rate", "Avg_Duration", "Avg_Tokens", "Avg_Cost", "Avg_Input_Cost", "Avg_Output_Cost", "Runs", "Provider"] leaderboard = leaderboard.sort_values("Success_Rate", ascending=False) # Create data for Pareto frontier plot @@ -112,6 +223,9 @@ def extract_provider(model_name): "Success_Rate": "mean", "Duration": "mean", "total_tokens": "mean", + "total_cost": "mean", + "input_cost": "mean", + "output_cost": "mean", "Metric_request_tokens": lambda x: x[x > 0].mean() if any(x > 0) else 0, "Metric_response_tokens": lambda x: x[x > 0].mean() if any(x > 0) else 0 }).reset_index() @@ -123,12 +237,36 @@ def extract_provider(model_name): test_groups_data = df.groupby(["Model", "test_group"]).agg({ "Score_MermaidDiagramValid": "mean", "Score_UsageLimitNotExceeded": "mean", - "Score_UsedBothMCPTools": "mean" + "Score_UsedBothMCPTools": "mean", + "total_cost": "mean", + "input_cost": "mean", + "output_cost": "mean", + "total_tokens": "mean" }).reset_index() # Calculate failure analysis data failure_analysis_data = calculate_failure_analysis_data(df) + # Calculate cost breakdown by model and test group + cost_breakdown_data = df.groupby(["Model", "test_group"]).agg({ + "total_cost": ["mean", "sum", "count"], + "input_cost": ["mean", "sum"], + "output_cost": ["mean", "sum"] + }).round(6) + + # Flatten the multi-level columns + cost_breakdown_data.columns = ['_'.join(col).strip() for col in cost_breakdown_data.columns.values] + cost_breakdown_data = cost_breakdown_data.reset_index() + cost_breakdown_data = cost_breakdown_data.rename(columns={ + "total_cost_mean": "avg_total_cost", + "total_cost_sum": "sum_total_cost", + "total_cost_count": "run_count", + "input_cost_mean": "avg_input_cost", + "input_cost_sum": "sum_input_cost", + "output_cost_mean": "avg_output_cost", + "output_cost_sum": "sum_output_cost" + }) + # Calculate aggregate statistics stats = { "total_runs": len(df), @@ -136,7 +274,9 @@ def extract_provider(model_name): "test_cases": df["Case"].nunique(), "test_groups": sorted(df["test_group"].unique().tolist()), "providers": sorted(df["provider"].unique().tolist()), - "models": sorted(df["Model"].unique().tolist()) + "models": sorted(df["Model"].unique().tolist()), + "total_cost": df["total_cost"].sum(), + "avg_cost_per_run": df["total_cost"].mean() } # Create final data structure @@ -146,11 +286,13 @@ def extract_provider(model_name): "pareto_data": pareto_data.to_dict(orient="records"), "test_groups_data": test_groups_data.to_dict(orient="records"), "failure_analysis_data": failure_analysis_data, + "cost_breakdown_data": cost_breakdown_data.to_dict("records"), "raw_data": df[[ "Model", "Case", "test_group", "Duration", "Score_MermaidDiagramValid", "Score_UsageLimitNotExceeded", "Score_UsedBothMCPTools", "total_tokens", "provider", - "Metric_request_tokens", "Metric_response_tokens" + "Metric_request_tokens", "Metric_response_tokens", + "total_cost", "input_cost", "output_cost" ]].to_dict(orient="records"), "config": { "title": config.title,