From 759f68c30410c940129182d9e6bad8c2c37618e6 Mon Sep 17 00:00:00 2001 From: Andrew Ginns Date: Fri, 4 Jul 2025 15:42:49 +0000 Subject: [PATCH 1/3] feat: Swap to uv native build backend and add upgrade target in Makefile --- Makefile | 3 +++ pyproject.toml | 9 +++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 477824e..a47e39f 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,9 @@ install: uv sync npm install -g @mermaid-js/mermaid-cli +upgrade: + uv sync -U + lint: uv run ruff check . diff --git a/pyproject.toml b/pyproject.toml index 57b6f77..5b5c0cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,9 +25,10 @@ dependencies = [ "streamlit>=1.45.1", ] -[tool.hatch.build.targets.wheel] -packages = ["agents_mcp_usage"] +[tool.uv.build-backend] +module-name = "agents_mcp_usage" +module-root = "" [build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" +requires = ["uv_build>=0.7.19,<0.8.0"] +build-backend = "uv_build" From 5812b0ebdc6c8bb60cc900dd4e696284d22fe4b9 Mon Sep 17 00:00:00 2001 From: Andrew Ginns Date: Sun, 6 Jul 2025 07:34:27 +0000 Subject: [PATCH 2/3] chore: Update costs data --- .../evaluations/mermaid_evals/costs.json | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/agents_mcp_usage/evaluations/mermaid_evals/costs.json b/agents_mcp_usage/evaluations/mermaid_evals/costs.json index b3f7e0a..cea4117 100644 --- a/agents_mcp_usage/evaluations/mermaid_evals/costs.json +++ b/agents_mcp_usage/evaluations/mermaid_evals/costs.json @@ -195,6 +195,29 @@ ] } }, + "gemini-2.5-flash-preview-05-20": { + "friendly_name": "Gemini 2.5 Flash Preview (May)", + "input": [ + { + "up_to": "inf", + "price": 0.15 + } + ], + "output": { + "non_thinking": [ + { + "up_to": "inf", + "price": 0.6 + } + ], + "thinking": [ + { + "up_to": "inf", + "price": 3.5 + } + ] + } + }, "gemini-2.5-flash-preview": { "friendly_name": "Gemini 2.5 Flash Preview", "input": [ @@ -429,6 +452,74 @@ } ] } + }, + "bedrock:us.amazon.nova-micro-v1:0": { + "friendly_name": "Amazon Nova Micro", + "input": [ + { + "up_to": "inf", + "price": 0.035 + } + ], + "output": { + "default": [ + { + "up_to": "inf", + "price": 0.14 + } + ] + } + }, + "bedrock:us.amazon.nova-lite-v1:0": { + "friendly_name": "Amazon Nova Lite", + "input": [ + { + "up_to": "inf", + "price": 0.06 + } + ], + "output": { + "default": [ + { + "up_to": "inf", + "price": 0.24 + } + ] + } + }, + "bedrock:us.amazon.nova-pro-v1:0": { + "friendly_name": "Amazon Nova Pro", + "input": [ + { + "up_to": "inf", + "price": 0.80 + } + ], + "output": { + "default": [ + { + "up_to": "inf", + "price": 3.20 + } + ] + } + }, + "bedrock:us.amazon.nova-premier-v1:0": { + "friendly_name": "Amazon Nova Premier", + "input": [ + { + "up_to": "inf", + "price": 2.50 + } + ], + "output": { + "default": [ + { + "up_to": "inf", + "price": 12.50 + } + ] + } } } } \ No newline at end of file From 5c30203f1e39ffc1e1475e01e40b9d4c43ea564b Mon Sep 17 00:00:00 2001 From: Andrew Ginns Date: Sun, 6 Jul 2025 10:29:31 +0000 Subject: [PATCH 3/3] feat: Costs added to output json to remove runtime calculations --- .../mermaid_evals/results/docs/README.md | 105 +++++ .../docs/data_relationships_quickref.md | 143 ++++++ .../results/docs/output_json_schema.md | 285 ++++++++++++ .../results/docs/visualization_example.ts | 430 ++++++++++++++++++ .../scripts/preprocess_merbench_data.py | 150 +++++- 5 files changed, 1109 insertions(+), 4 deletions(-) create mode 100644 
agents_mcp_usage/evaluations/mermaid_evals/results/docs/README.md create mode 100644 agents_mcp_usage/evaluations/mermaid_evals/results/docs/data_relationships_quickref.md create mode 100644 agents_mcp_usage/evaluations/mermaid_evals/results/docs/output_json_schema.md create mode 100644 agents_mcp_usage/evaluations/mermaid_evals/results/docs/visualization_example.ts diff --git a/agents_mcp_usage/evaluations/mermaid_evals/results/docs/README.md b/agents_mcp_usage/evaluations/mermaid_evals/results/docs/README.md new file mode 100644 index 0000000..75d45c2 --- /dev/null +++ b/agents_mcp_usage/evaluations/mermaid_evals/results/docs/README.md @@ -0,0 +1,105 @@ +# Merbench Visualization Documentation + +This directory contains comprehensive documentation for understanding and using the Merbench evaluation output data to build visualizations. + +## Documentation Files + +### 1. [output_json_schema.md](./output_json_schema.md) +**Complete schema reference for the preprocessed JSON output** +- Detailed description of all data sections +- Filter system explanation (Model, Provider, Test Group) +- Implementation guidelines for AND logic filtering +- Visualization examples and best practices + +### 2. [visualization_example.ts](./visualization_example.ts) +**Practical TypeScript implementation example** +- Complete working example of a cost vs performance scatter plot +- Filter application with real code +- Statistics update implementation +- Event handler setup for interactive filtering + +### 3. [data_relationships_quickref.md](./data_relationships_quickref.md) +**Quick reference for data relationships and common operations** +- Key relationships between data sections +- Common query patterns +- Performance optimization tips +- Data validation checks + +## Key Concepts + +### Data Flow +``` +CSV Input → preprocess_merbench_data.py → JSON Output → Visualizations + ↓ + Cost Calculations + (from costs.json) +``` + +### Filter Types (AND Logic) +1. **Model Filter**: Select specific models +2. **Provider Filter**: Google, Amazon, OpenAI, Other +3. **Test Group Filter**: easy, medium, hard + +### Primary Data Sections +- **raw_data**: Source for all filtering and aggregation +- **leaderboard**: Pre-aggregated model rankings +- **pareto_data**: Performance vs efficiency metrics +- **test_groups_data**: Performance by difficulty +- **cost_breakdown_data**: Detailed cost analysis +- **failure_analysis_data**: Failure reason counts + +## Quick Start + +1. **Load the JSON data** + ```javascript + const data = await fetch('processed_results.json').then(r => r.json()); + ``` + +2. **Apply filters to raw_data** + ```javascript + const filtered = data.raw_data.filter(row => + row.provider === "Google" && row.test_group === "easy" + ); + ``` + +3. **Recalculate aggregates** + ```javascript + const modelStats = {}; + filtered.forEach(row => { + if (!modelStats[row.Model]) { + modelStats[row.Model] = {runs: 0, success: 0, cost: 0}; + } + modelStats[row.Model].runs++; + modelStats[row.Model].success += row.Score_MermaidDiagramValid; + modelStats[row.Model].cost += row.total_cost; + }); + ``` + +4. 
**Create visualizations** + - Use pre-aggregated data for initial views + - Recalculate from filtered raw_data when filters change + - Update all related visualizations together + +## Cost Calculation Notes + +- Costs are calculated per token using tiered pricing from `costs.json` +- Failed tests (Score_UsageLimitNotExceeded = 0) have $0 cost +- Input and output costs are tracked separately +- Thinking tokens may have different pricing than regular output tokens + +## Visualization Types + +1. **Leaderboard Table**: Model rankings by success rate +2. **Pareto Scatter Plot**: Performance vs cost/duration/tokens +3. **Grouped Bar Charts**: Performance by test difficulty +4. **Stacked Bar Charts**: Failure reasons, cost breakdown +5. **Heatmaps**: Model × difficulty performance matrix + +## Tips for Developers + +- Always start filtering from `raw_data` +- Cache filter results for performance +- Use the `provider` field for color coding +- Show active filters in the UI +- Handle empty filter results gracefully +- Consider log scale for cost axes due to wide ranges \ No newline at end of file diff --git a/agents_mcp_usage/evaluations/mermaid_evals/results/docs/data_relationships_quickref.md b/agents_mcp_usage/evaluations/mermaid_evals/results/docs/data_relationships_quickref.md new file mode 100644 index 0000000..a463870 --- /dev/null +++ b/agents_mcp_usage/evaluations/mermaid_evals/results/docs/data_relationships_quickref.md @@ -0,0 +1,143 @@ +# Data Relationships Quick Reference + +## Key Relationships + +### Primary Keys and Groupings + +1. **Model** - Primary identifier across all data sections +2. **test_group** - Secondary grouping (easy, medium, hard) +3. **provider** - Derived from Model name (Google, Amazon, etc.) + +### Data Section Dependencies + +``` +raw_data (source) + ↓ +├── leaderboard (group by Model) +├── pareto_data (group by Model) +├── test_groups_data (group by Model + test_group) +├── failure_analysis_data (group by Model, count failures) +└── cost_breakdown_data (group by Model + test_group) +``` + +## Common Queries and Aggregations + +### 1. Get Model Performance Summary +```javascript +// From raw_data +const modelSummary = rawData + .filter(r => r.Model === "gemini-2.5-pro") + .reduce((acc, r) => ({ + successRate: acc.successRate + r.Score_MermaidDiagramValid, + totalCost: acc.totalCost + r.total_cost, + count: acc.count + 1 + }), {successRate: 0, totalCost: 0, count: 0}); + +modelSummary.avgSuccessRate = modelSummary.successRate / modelSummary.count * 100; +``` + +### 2. Filter by Multiple Conditions +```javascript +// Get Amazon models on hard tests that succeeded +const filtered = rawData.filter(r => + r.provider === "Amazon" && + r.test_group === "hard" && + r.Score_MermaidDiagramValid === 1 +); +``` + +### 3. Calculate Cost Breakdown by Test Group +```javascript +// Group costs by difficulty +const costByDifficulty = {}; +["easy", "medium", "hard"].forEach(group => { + const groupData = rawData.filter(r => r.test_group === group); + costByDifficulty[group] = { + avgCost: groupData.reduce((sum, r) => sum + r.total_cost, 0) / groupData.length, + totalCost: groupData.reduce((sum, r) => sum + r.total_cost, 0) + }; +}); +``` + +## Pre-Aggregated vs. 
Raw Data Usage + +### Use Pre-Aggregated Data When: +- Displaying initial unfiltered views +- Performance is critical +- Standard aggregations are sufficient + +### Recalculate from Raw Data When: +- Filters are applied +- Custom aggregations needed +- Combining multiple filter conditions + +## Filter Application Order + +1. **Start with raw_data** +2. **Apply filters** (Model AND Provider AND TestGroup) +3. **Recalculate aggregations** +4. **Update visualizations** + +## Cost Calculation Rules + +- **Normal tests**: Cost = (input_tokens/1M × input_price) + (output_tokens/1M × output_price) +- **Failed tests** (Score_UsageLimitNotExceeded = 0): Cost = $0 +- **Tiered pricing**: Price depends on total token count + +## Data Validation Checks + +```javascript +// Ensure data consistency +function validateData(jsonData) { + // Check if model counts match + const rawModels = new Set(jsonData.raw_data.map(r => r.Model)); + const leaderboardModels = new Set(jsonData.leaderboard.map(l => l.Model)); + + console.assert(rawModels.size === leaderboardModels.size, + "Model count mismatch between raw and leaderboard"); + + // Verify cost calculations + jsonData.raw_data.forEach(row => { + if (row.Score_UsageLimitNotExceeded === 0) { + console.assert(row.total_cost === 0, + `Failed test should have 0 cost: ${row.Model}`); + } + }); +} +``` + +## Performance Optimization Tips + +1. **Cache Filter Results** + ```javascript + const filterCache = new Map(); + function getCachedFilter(filterKey, rawData, filters) { + if (!filterCache.has(filterKey)) { + filterCache.set(filterKey, applyFilters(rawData, filters)); + } + return filterCache.get(filterKey); + } + ``` + +2. **Use Indexed Lookups** + ```javascript + // Pre-index by model for fast lookups + const modelIndex = {}; + rawData.forEach(row => { + if (!modelIndex[row.Model]) modelIndex[row.Model] = []; + modelIndex[row.Model].push(row); + }); + ``` + +3. **Batch Updates** + ```javascript + // Update all visualizations at once + function updateAllVisualizations(filteredData) { + requestAnimationFrame(() => { + updateLeaderboard(filteredData); + updateParetoPlot(filteredData); + updateCostBreakdown(filteredData); + updateFailureAnalysis(filteredData); + }); + } + ``` \ No newline at end of file diff --git a/agents_mcp_usage/evaluations/mermaid_evals/results/docs/output_json_schema.md b/agents_mcp_usage/evaluations/mermaid_evals/results/docs/output_json_schema.md new file mode 100644 index 0000000..c5bbc18 --- /dev/null +++ b/agents_mcp_usage/evaluations/mermaid_evals/results/docs/output_json_schema.md @@ -0,0 +1,285 @@ +# Merbench Output JSON Schema Documentation + +## Overview + +The preprocessed JSON output from `preprocess_merbench_data.py` contains structured evaluation data for LLM models tested on Mermaid diagram generation tasks. This document describes the schema and how to build visualizations with filtering capabilities. + +## Filtering System + +The visualization system supports three filter dimensions that work as logical AND conditions: + +1. **Test Group Filter**: `easy`, `medium`, `hard` +2. **Provider Filter**: `Google`, `Amazon`, `OpenAI`, `Other` +3. **Model Filter**: Individual model selection (e.g., `gemini-2.5-pro`, `bedrock:us.amazon.nova-premier-v1:0`) + +When filters are applied, only data matching ALL selected criteria is included in visualizations. + +## JSON Schema Structure + +### 1. 
Stats Object +```json +{ + "stats": { + "total_runs": 180, + "models_evaluated": 12, + "test_cases": 3, + "test_groups": ["easy", "medium", "hard"], + "providers": ["Google", "Amazon", "OpenAI"], + "models": ["model1", "model2", ...], + "total_cost": 11.899786, + "avg_cost_per_run": 0.066110 + } +} +``` + +**Usage**: Display overall evaluation metrics. Update these counts when filters are applied by recalculating from filtered raw_data. + +### 2. Leaderboard Array +```json +{ + "leaderboard": [ + { + "Model": "gemini-2.5-pro-preview-06-05", + "Success_Rate": 40.0, + "Avg_Duration": 46.887859, + "Avg_Tokens": 8693.733333, + "Avg_Cost": 0.045514, + "Avg_Input_Cost": 0.005918, + "Avg_Output_Cost": 0.039596, + "Runs": 15, + "Provider": "Google" + } + ] +} +``` + +**Usage**: Main leaderboard visualization. When filters are applied: +1. Filter `raw_data` based on selected criteria +2. Recalculate aggregates (group by Model) +3. Sort by Success_Rate descending + +**Visualization**: Table or bar chart showing model rankings. + +### 3. Pareto Data Array +```json +{ + "pareto_data": [ + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Success_Rate": 4.0, + "Duration": 14.524447, + "total_tokens": 2350.2, + "total_cost": 0.000080, + "input_cost": 0.000034, + "output_cost": 0.000046, + "Metric_request_tokens": 1337.8, + "Metric_response_tokens": 1012.4 + } + ] +} +``` + +**Usage**: Create Pareto frontier plots showing trade-offs between: +- Success Rate vs Cost +- Success Rate vs Duration +- Success Rate vs Tokens + +**Filtering**: Recalculate from filtered raw_data, then plot. + +### 4. Test Groups Data Array +```json +{ + "test_groups_data": [ + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "test_group": "easy", + "Score_MermaidDiagramValid": 0.2, + "Score_UsageLimitNotExceeded": 1.0, + "Score_UsedBothMCPTools": 1.0, + "total_cost": 0.000114, + "input_cost": 0.000049, + "output_cost": 0.000066, + "total_tokens": 3377.2 + } + ] +} +``` + +**Usage**: Create grouped bar charts showing performance by difficulty level. + +**Filtering**: +- Filter by Model and Provider +- Group remaining data by test_group +- Show comparison across difficulty levels + +### 5. Failure Analysis Data Array +```json +{ + "failure_analysis_data": [ + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "Invalid Diagram": 13, + "MCP Tool Failure": 0, + "Usage Limit Exceeded": 0 + } + ] +} +``` + +**Usage**: Stacked bar chart showing failure reasons by model. + +**Filtering**: When filters are applied, recalculate failure counts from filtered raw_data. + +### 6. Cost Breakdown Data Array +```json +{ + "cost_breakdown_data": [ + { + "Model": "bedrock:us.amazon.nova-lite-v1:0", + "test_group": "easy", + "avg_total_cost": 0.000114, + "sum_total_cost": 0.00057, + "run_count": 5, + "avg_input_cost": 0.000049, + "sum_input_cost": 0.000243, + "avg_output_cost": 0.000066, + "sum_output_cost": 0.000328 + } + ] +} +``` + +**Usage**: Detailed cost analysis by model and test difficulty. + +**Visualizations**: +- Stacked bar chart of input vs output costs +- Heatmap of costs by model × difficulty +- Cost comparison across test groups + +### 7. 
Raw Data Array +```json +{ + "raw_data": [ + { + "Model": "gemini-2.5-pro-preview-05-06", + "Case": "simple_easy", + "test_group": "easy", + "Duration": 18.090633, + "Score_MermaidDiagramValid": 0, + "Score_UsageLimitNotExceeded": 0, + "Score_UsedBothMCPTools": 1, + "total_tokens": 127419, + "provider": "Google", + "Metric_request_tokens": 25483, + "Metric_response_tokens": 101936, + "total_cost": 0.0, + "input_cost": 0.0, + "output_cost": 0.0 + } + ] +} +``` + +**Usage**: Source data for all filtering and aggregation operations. + +**Key fields for filtering**: +- `Model`: For model filter +- `provider`: For provider filter +- `test_group`: For difficulty filter +- `Score_UsageLimitNotExceeded`: When 0, cost is 0 (failed tests) + +## Implementing Filters + +### Filter Logic (Pseudo-code) +```javascript +function applyFilters(rawData, filters) { + return rawData.filter(row => { + // All conditions must be true (AND logic) + const modelMatch = !filters.models.length || + filters.models.includes(row.Model); + const providerMatch = !filters.providers.length || + filters.providers.includes(row.provider); + const testGroupMatch = !filters.testGroups.length || + filters.testGroups.includes(row.test_group); + + return modelMatch && providerMatch && testGroupMatch; + }); +} +``` + +### Updating Visualizations After Filtering + +1. **Filter raw_data** using the AND logic +2. **Recalculate aggregates** from filtered data: + - Leaderboard: Group by Model, calculate means + - Test groups: Group by Model + test_group + - Failure analysis: Count failures per type + - Cost breakdown: Sum and average costs +3. **Update visualizations** with new data + +## Visualization Examples + +### 1. Cost vs Performance Scatter Plot +- X-axis: Average cost (from pareto_data) +- Y-axis: Success rate (from pareto_data) +- Color: Provider +- Size: Number of runs +- Filters affect which models appear + +### 2. Performance by Difficulty Grouped Bar Chart +- X-axis: Models +- Y-axis: Success rate +- Groups: easy, medium, hard (different bars) +- Filters reduce which models/groups are shown + +### 3. Cost Breakdown Stacked Bar +- X-axis: Models +- Y-axis: Cost +- Stack: Input cost vs Output cost +- Facet by: test_group (optional) + +### 4. Failure Analysis Heatmap +- Rows: Models +- Columns: Failure types +- Values: Failure counts +- Color intensity: Number of failures + +## Configuration Object +```json +{ + "config": { + "title": "🧜‍♀️ Merbench - LLM Evaluation", + "description": "...", + "primary_metric": { + "name": "Success_Rate", + "label": "Success Rate (%)" + } + } +} +``` + +**Usage**: UI configuration and metric definitions for consistent labeling. + +## Best Practices + +1. **Always filter on raw_data first**, then recalculate aggregates +2. **Cache filtered results** to avoid recalculation on every interaction +3. **Show filter status** in UI (e.g., "Showing 5 of 12 models") +4. **Handle empty results** gracefully when filters exclude all data +5. **Cost considerations**: Remember that failed tests (Score_UsageLimitNotExceeded = 0) have $0 cost + +## Example Filter Combinations + +1. **"Show only Amazon models on hard tests"** + - Provider filter: ["Amazon"] + - Test group filter: ["hard"] + - Result: Only Amazon model performance on hard difficulty + +2. **"Compare Google models across all difficulties"** + - Provider filter: ["Google"] + - Test group filter: [] (empty = all) + - Result: All Google models, all difficulties + +3. 
**"Show specific model performance"** + - Model filter: ["gemini-2.5-pro"] + - Result: Single model data across all test groups \ No newline at end of file diff --git a/agents_mcp_usage/evaluations/mermaid_evals/results/docs/visualization_example.ts b/agents_mcp_usage/evaluations/mermaid_evals/results/docs/visualization_example.ts new file mode 100644 index 0000000..7995a56 --- /dev/null +++ b/agents_mcp_usage/evaluations/mermaid_evals/results/docs/visualization_example.ts @@ -0,0 +1,430 @@ +/** + * Example: Creating a Cost vs Performance Scatter Plot with Filtering (TypeScript) + * This example demonstrates how to use the JSON output data with filters + */ + +// Type definitions for the data structures + +interface RawDataRow { + Model: string; + Case: string; + test_group: 'easy' | 'medium' | 'hard'; + Duration: number; + Score_MermaidDiagramValid: 0 | 1; + Score_UsageLimitNotExceeded: 0 | 1; + Score_UsedBothMCPTools: 0 | 1; + total_tokens: number; + provider: 'Google' | 'Amazon' | 'OpenAI' | 'Other'; + Metric_request_tokens: number; + Metric_response_tokens: number; + total_cost: number; + input_cost: number; + output_cost: number; +} + +interface LeaderboardEntry { + Model: string; + Success_Rate: number; + Avg_Duration: number; + Avg_Tokens: number; + Avg_Cost: number; + Avg_Input_Cost: number; + Avg_Output_Cost: number; + Runs: number; + Provider: string; +} + +interface ParetoDataEntry { + Model: string; + Success_Rate: number; + Duration: number; + total_tokens: number; + total_cost: number; + input_cost: number; + output_cost: number; + Metric_request_tokens: number; + Metric_response_tokens: number; +} + +interface TestGroupDataEntry { + Model: string; + test_group: 'easy' | 'medium' | 'hard'; + Score_MermaidDiagramValid: number; + Score_UsageLimitNotExceeded: number; + Score_UsedBothMCPTools: number; + total_cost: number; + input_cost: number; + output_cost: number; + total_tokens: number; +} + +interface FailureAnalysisEntry { + Model: string; + 'Invalid Diagram': number; + 'MCP Tool Failure': number; + 'Usage Limit Exceeded': number; +} + +interface CostBreakdownEntry { + Model: string; + test_group: 'easy' | 'medium' | 'hard'; + avg_total_cost: number; + sum_total_cost: number; + run_count: number; + avg_input_cost: number; + sum_input_cost: number; + avg_output_cost: number; + sum_output_cost: number; +} + +interface Stats { + total_runs: number; + models_evaluated: number; + test_cases: number; + test_groups: Array<'easy' | 'medium' | 'hard'>; + providers: Array<'Google' | 'Amazon' | 'OpenAI' | 'Other'>; + models: string[]; + total_cost: number; + avg_cost_per_run: number; +} + +interface Config { + title: string; + description: string; + primary_metric: { + name: string; + label: string; + }; +} + +interface JsonData { + stats: Stats; + leaderboard: LeaderboardEntry[]; + pareto_data: ParetoDataEntry[]; + test_groups_data: TestGroupDataEntry[]; + failure_analysis_data: FailureAnalysisEntry[]; + cost_breakdown_data: CostBreakdownEntry[]; + raw_data: RawDataRow[]; + config: Config; +} + +interface Filters { + models: string[]; + providers: Array<'Google' | 'Amazon' | 'OpenAI' | 'Other'>; + testGroups: Array<'easy' | 'medium' | 'hard'>; +} + +interface AggregatedModel { + model: string; + provider: string; + avgCost: number; + avgTokens: number; + successRate: number; + avgDuration: number; + runCount: number; +} + +interface ModelGroup { + model: string; + provider: string; + runs: number; + totalCost: number; + totalTokens: number; + successCount: number; + totalDuration: 
number;
+}
+
+interface PlotlyTrace {
+  x: number[];
+  y: number[];
+  text: string[];
+  marker: {
+    size: number[];
+  };
+  mode: string;
+  type: string;
+  name: string;
+  textposition: string;
+}
+
+interface PlotlyLayout {
+  title: string;
+  xaxis: {
+    title: string;
+    type?: string;
+    tickformat?: string;
+  };
+  yaxis: {
+    title: string;
+    range?: number[];
+  };
+  hovermode: string;
+  showlegend: boolean;
+  annotations?: Array<{
+    x: number;
+    y: number;
+    xref: string;
+    yref: string;
+    text: string;
+    showarrow: boolean;
+    font: {
+      size: number;
+      color: string;
+    };
+  }>;
+}
+
+interface StatsDisplay {
+  totalRuns: number;
+  modelsShown: number;
+  totalCost: number;
+  avgCostPerRun: number;
+}
+
+// Sample filter state
+const filters: Filters = {
+  models: [], // Empty array means "all models"
+  providers: ["Google", "Amazon"], // Only Google and Amazon models
+  testGroups: ["easy", "medium"] // Exclude "hard" tests
+};
+
+// Function to apply filters to raw data
+function applyFilters(rawData: RawDataRow[], filters: Filters): RawDataRow[] {
+  return rawData.filter((row: RawDataRow): boolean => {
+    // Check each filter condition (AND logic)
+    const modelMatch: boolean = filters.models.length === 0 ||
+      filters.models.includes(row.Model);
+
+    const providerMatch: boolean = filters.providers.length === 0 ||
+      filters.providers.includes(row.provider);
+
+    const testGroupMatch: boolean = filters.testGroups.length === 0 ||
+      filters.testGroups.includes(row.test_group);
+
+    return modelMatch && providerMatch && testGroupMatch;
+  });
+}
+
+// Function to calculate aggregated data from filtered raw data
+function calculateAggregates(filteredData: RawDataRow[]): AggregatedModel[] {
+  // Group by model
+  const modelGroups: Record<string, ModelGroup> = {};
+
+  filteredData.forEach((row: RawDataRow): void => {
+    if (!modelGroups[row.Model]) {
+      modelGroups[row.Model] = {
+        model: row.Model,
+        provider: row.provider,
+        runs: 0,
+        totalCost: 0,
+        totalTokens: 0,
+        successCount: 0,
+        totalDuration: 0
+      };
+    }
+
+    const group: ModelGroup = modelGroups[row.Model];
+    group.runs++;
+    group.totalCost += row.total_cost;
+    group.totalTokens += row.total_tokens;
+    group.successCount += row.Score_MermaidDiagramValid;
+    group.totalDuration += row.Duration;
+  });
+
+  // Calculate averages
+  return Object.values(modelGroups).map((group: ModelGroup): AggregatedModel => ({
+    model: group.model,
+    provider: group.provider,
+    avgCost: group.totalCost / group.runs,
+    avgTokens: group.totalTokens / group.runs,
+    successRate: (group.successCount / group.runs) * 100,
+    avgDuration: group.totalDuration / group.runs,
+    runCount: group.runs
+  }));
+}
+
+// Declare Plotly for global access
+declare const Plotly: {
+  newPlot(elementId: string, traces: PlotlyTrace[], layout: PlotlyLayout): void;
+};
+
+// Function to create Plotly scatter plot
+function createCostVsPerformancePlot(jsonData: JsonData, filters: Filters): void {
+  // Step 1: Apply filters to raw data
+  const filteredRawData: RawDataRow[] = applyFilters(jsonData.raw_data, filters);
+
+  // Step 2: Calculate aggregates
+  const aggregatedData: AggregatedModel[] = calculateAggregates(filteredRawData);
+
+  // Step 3: Prepare data for Plotly
+  const traces: Record<string, PlotlyTrace> = {};
+
+  // Group by provider for different colors
+  aggregatedData.forEach((model: AggregatedModel): void => {
+    if (!traces[model.provider]) {
+      traces[model.provider] = {
+        x: [],
+        y: [],
+        text: [],
+        marker: { size: [] },
+        mode: 'markers+text',
+        type: 'scatter',
+        name: model.provider,
+        textposition: 'top center'
+      
};
+    }
+
+    traces[model.provider].x.push(model.avgCost);
+    traces[model.provider].y.push(model.successRate);
+    traces[model.provider].text.push(model.model);
+    traces[model.provider].marker.size.push(Math.sqrt(model.runCount) * 10);
+  });
+
+  // Step 4: Create the plot
+  const layout: PlotlyLayout = {
+    title: 'Model Performance vs Cost (Filtered)',
+    xaxis: {
+      title: 'Average Cost ($)',
+      type: 'log', // Log scale for better cost distribution
+      tickformat: '$.4f'
+    },
+    yaxis: {
+      title: 'Success Rate (%)',
+      range: [0, 105]
+    },
+    hovermode: 'closest',
+    showlegend: true,
+    annotations: [{
+      x: 0.5,
+      y: -0.15,
+      xref: 'paper',
+      yref: 'paper',
+      text: `Filters: ${filters.providers.join(', ')} providers | ${filters.testGroups.join(', ')} tests`,
+      showarrow: false,
+      font: { size: 12, color: 'gray' }
+    }]
+  };
+
+  Plotly.newPlot('cost-performance-plot', Object.values(traces), layout);
+}
+
+// Function to update summary statistics
+function updateStats(jsonData: JsonData, filteredRawData: RawDataRow[]): void {
+  const stats: StatsDisplay = {
+    totalRuns: filteredRawData.length,
+    modelsShown: new Set(filteredRawData.map((r: RawDataRow) => r.Model)).size,
+    totalCost: filteredRawData.reduce((sum: number, r: RawDataRow) => sum + r.total_cost, 0),
+    avgCostPerRun: 0
+  };
+
+  stats.avgCostPerRun = stats.totalCost / stats.totalRuns || 0;
+
+  // Update UI elements (with null checks)
+  const totalRunsElement = document.getElementById('stat-total-runs');
+  const modelsElement = document.getElementById('stat-models');
+  const totalCostElement = document.getElementById('stat-total-cost');
+  const avgCostElement = document.getElementById('stat-avg-cost');
+
+  if (totalRunsElement) {
+    totalRunsElement.textContent = stats.totalRuns.toString();
+  }
+
+  if (modelsElement) {
+    modelsElement.textContent = `${stats.modelsShown} of ${jsonData.stats.models_evaluated}`;
+  }
+
+  if (totalCostElement) {
+    totalCostElement.textContent = `$${stats.totalCost.toFixed(2)}`;
+  }
+
+  if (avgCostElement) {
+    avgCostElement.textContent = `$${stats.avgCostPerRun.toFixed(4)}`;
+  }
+}
+
+// Helper function to get selected values from multi-select element
+function getSelectedValues(element: HTMLSelectElement): string[] {
+  return Array.from(element.selectedOptions).map((option: HTMLOptionElement) => option.value);
+}
+
+// Main function to initialize visualization
+async function initializeVisualization(): Promise<void> {
+  try {
+    // Load the JSON data
+    const response: Response = await fetch('path/to/processed_results.json');
+
+    if (!response.ok) {
+      throw new Error(`HTTP error! 
status: ${response.status}`); + } + + const jsonData: JsonData = await response.json(); + + // Apply initial filters and create plot + createCostVsPerformancePlot(jsonData, filters); + + // Update statistics + const filteredData: RawDataRow[] = applyFilters(jsonData.raw_data, filters); + updateStats(jsonData, filteredData); + + // Set up filter change handlers + const providerFilterElement = document.getElementById('provider-filter') as HTMLSelectElement; + const testGroupFilterElement = document.getElementById('test-group-filter') as HTMLSelectElement; + + if (providerFilterElement) { + providerFilterElement.addEventListener('change', (e: Event): void => { + const target = e.target as HTMLSelectElement; + filters.providers = getSelectedValues(target) as Array<'Google' | 'Amazon' | 'OpenAI' | 'Other'>; + createCostVsPerformancePlot(jsonData, filters); + updateStats(jsonData, applyFilters(jsonData.raw_data, filters)); + }); + } + + if (testGroupFilterElement) { + testGroupFilterElement.addEventListener('change', (e: Event): void => { + const target = e.target as HTMLSelectElement; + filters.testGroups = getSelectedValues(target) as Array<'easy' | 'medium' | 'hard'>; + createCostVsPerformancePlot(jsonData, filters); + updateStats(jsonData, applyFilters(jsonData.raw_data, filters)); + }); + } + + // Set up model filter if it exists + const modelFilterElement = document.getElementById('model-filter') as HTMLSelectElement; + if (modelFilterElement) { + modelFilterElement.addEventListener('change', (e: Event): void => { + const target = e.target as HTMLSelectElement; + filters.models = getSelectedValues(target); + createCostVsPerformancePlot(jsonData, filters); + updateStats(jsonData, applyFilters(jsonData.raw_data, filters)); + }); + } + + } catch (error) { + console.error('Error initializing visualization:', error); + + // Show error message to user + const errorElement = document.getElementById('error-message'); + if (errorElement) { + errorElement.textContent = `Failed to load data: ${error instanceof Error ? 
error.message : 'Unknown error'}`; + errorElement.style.display = 'block'; + } + } +} + +// Initialize on page load +document.addEventListener('DOMContentLoaded', initializeVisualization); + +// Export types for use in other modules +export type { + RawDataRow, + JsonData, + Filters, + AggregatedModel, + LeaderboardEntry, + ParetoDataEntry, + TestGroupDataEntry, + FailureAnalysisEntry, + CostBreakdownEntry, + Stats, + Config +}; \ No newline at end of file diff --git a/agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py b/agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py index a71d13a..8def71d 100644 --- a/agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py +++ b/agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py @@ -5,6 +5,7 @@ import argparse from datetime import datetime from pathlib import Path +from typing import Dict, List, Any # Add parent directory to path to import modules sys.path.append(str(Path(__file__).parent.parent)) @@ -13,6 +14,105 @@ from agents_mcp_usage.evaluations.mermaid_evals.schemas import DashboardConfig from agents_mcp_usage.utils import get_project_root +def load_model_costs(file_path: Path) -> Dict[str, Any]: + """Load model costs from JSON file.""" + with open(file_path, 'r') as f: + data = json.load(f) + + # Convert "inf" strings to actual infinity + def convert_inf_strings(obj: Any) -> Any: + if isinstance(obj, dict): + return {k: convert_inf_strings(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [convert_inf_strings(v) for v in obj] + elif obj == "inf": + return float("inf") + return obj + + # Extract model_costs from the loaded data + model_costs = data.get('model_costs', {}) + return convert_inf_strings(model_costs) + +def get_price_for_tokens(token_count: int, price_tiers: List[Dict]) -> float: + """Find the correct price for a given number of tokens from a list of tiers.""" + for tier in price_tiers: + if token_count <= tier["up_to"]: + return tier["price"] + return price_tiers[-1]["price"] # Fallback to the highest tier price + +def calculate_costs(df: pd.DataFrame, cost_config: Dict, config: DashboardConfig) -> pd.DataFrame: + """Calculate input, output, and total costs for each run based on tiered pricing. + + Sets cost to 0 if Score_UsageLimitNotExceeded == 0. 
+ """ + df_with_costs = df.copy() + + # Get cost calculation config from dashboard config + cost_calc_config = config.cost_calculation + input_token_cols = cost_calc_config.input_token_cols + output_token_cols = cost_calc_config.output_token_cols + + # Initialize cost columns + df_with_costs["input_cost"] = 0.0 + df_with_costs["output_cost"] = 0.0 + df_with_costs["total_cost"] = 0.0 + + for idx, row in df_with_costs.iterrows(): + # Check if usage limit was exceeded - if so, cost is 0 + if row.get("Score_UsageLimitNotExceeded", 1) == 0: + continue + + model = row.get("Model") + model_costs = cost_config.get(model) + + if not model_costs: + continue + + try: + # Calculate token counts + input_tokens = sum(row.get(col, 0) or 0 for col in input_token_cols) + output_tokens = sum(row.get(col, 0) or 0 for col in output_token_cols) + thinking_tokens = row.get("thinking_tokens", 0) or 0 + non_thinking_output_tokens = output_tokens - thinking_tokens + + total_tokens = input_tokens + output_tokens + + # Calculate input cost + input_price_tiers = model_costs.get("input", []) + if input_price_tiers: + input_price = get_price_for_tokens(total_tokens, input_price_tiers) + input_cost = (input_tokens / 1_000_000) * input_price + else: + input_cost = 0.0 + + # Calculate output cost + output_cost = 0.0 + output_pricing = model_costs.get("output", {}) + + if "thinking" in output_pricing and thinking_tokens > 0: + thinking_price_tiers = output_pricing["thinking"] + thinking_price = get_price_for_tokens(total_tokens, thinking_price_tiers) + output_cost += (thinking_tokens / 1_000_000) * thinking_price + + if "non_thinking" in output_pricing and non_thinking_output_tokens > 0: + non_thinking_price_tiers = output_pricing["non_thinking"] + non_thinking_price = get_price_for_tokens(total_tokens, non_thinking_price_tiers) + output_cost += (non_thinking_output_tokens / 1_000_000) * non_thinking_price + + elif "default" in output_pricing: + default_price_tiers = output_pricing["default"] + default_price = get_price_for_tokens(total_tokens, default_price_tiers) + output_cost += (output_tokens / 1_000_000) * default_price + + df_with_costs.at[idx, "input_cost"] = input_cost + df_with_costs.at[idx, "output_cost"] = output_cost + df_with_costs.at[idx, "total_cost"] = input_cost + output_cost + + except (TypeError, KeyError, IndexError) as e: + print(f"Cost calculation error for model {model} at row {idx}: {e}") + + return df_with_costs + def parse_metric_details(metric_details_str): """Safely parse JSON string from Metric_details column.""" if pd.isna(metric_details_str) or not metric_details_str: @@ -51,6 +151,11 @@ def process_csv_for_static_site(csv_path): # Load configuration config = DashboardConfig(**DEFAULT_CONFIG) + # Load cost configuration + project_root = get_project_root() + costs_json_path = project_root / "agents_mcp_usage" / "evaluations" / "mermaid_evals" / "costs.json" + cost_config = load_model_costs(costs_json_path) if costs_json_path.exists() else {} + # Read CSV df = pd.read_csv(csv_path) @@ -95,16 +200,22 @@ def extract_provider(model_name): df["provider"] = df["Model"].apply(extract_provider) + # Calculate costs for each row + df = calculate_costs(df, cost_config, config) + # Create leaderboard data leaderboard = df.groupby("Model").agg({ "Success_Rate": "mean", "Duration": "mean", "total_tokens": "mean", + "total_cost": "mean", + "input_cost": "mean", + "output_cost": "mean", "Case": "count", # Number of runs "provider": "first" }).reset_index() - leaderboard.columns = ["Model", "Success_Rate", 
"Avg_Duration", "Avg_Tokens", "Runs", "Provider"] + leaderboard.columns = ["Model", "Success_Rate", "Avg_Duration", "Avg_Tokens", "Avg_Cost", "Avg_Input_Cost", "Avg_Output_Cost", "Runs", "Provider"] leaderboard = leaderboard.sort_values("Success_Rate", ascending=False) # Create data for Pareto frontier plot @@ -112,6 +223,9 @@ def extract_provider(model_name): "Success_Rate": "mean", "Duration": "mean", "total_tokens": "mean", + "total_cost": "mean", + "input_cost": "mean", + "output_cost": "mean", "Metric_request_tokens": lambda x: x[x > 0].mean() if any(x > 0) else 0, "Metric_response_tokens": lambda x: x[x > 0].mean() if any(x > 0) else 0 }).reset_index() @@ -123,12 +237,36 @@ def extract_provider(model_name): test_groups_data = df.groupby(["Model", "test_group"]).agg({ "Score_MermaidDiagramValid": "mean", "Score_UsageLimitNotExceeded": "mean", - "Score_UsedBothMCPTools": "mean" + "Score_UsedBothMCPTools": "mean", + "total_cost": "mean", + "input_cost": "mean", + "output_cost": "mean", + "total_tokens": "mean" }).reset_index() # Calculate failure analysis data failure_analysis_data = calculate_failure_analysis_data(df) + # Calculate cost breakdown by model and test group + cost_breakdown_data = df.groupby(["Model", "test_group"]).agg({ + "total_cost": ["mean", "sum", "count"], + "input_cost": ["mean", "sum"], + "output_cost": ["mean", "sum"] + }).round(6) + + # Flatten the multi-level columns + cost_breakdown_data.columns = ['_'.join(col).strip() for col in cost_breakdown_data.columns.values] + cost_breakdown_data = cost_breakdown_data.reset_index() + cost_breakdown_data = cost_breakdown_data.rename(columns={ + "total_cost_mean": "avg_total_cost", + "total_cost_sum": "sum_total_cost", + "total_cost_count": "run_count", + "input_cost_mean": "avg_input_cost", + "input_cost_sum": "sum_input_cost", + "output_cost_mean": "avg_output_cost", + "output_cost_sum": "sum_output_cost" + }) + # Calculate aggregate statistics stats = { "total_runs": len(df), @@ -136,7 +274,9 @@ def extract_provider(model_name): "test_cases": df["Case"].nunique(), "test_groups": sorted(df["test_group"].unique().tolist()), "providers": sorted(df["provider"].unique().tolist()), - "models": sorted(df["Model"].unique().tolist()) + "models": sorted(df["Model"].unique().tolist()), + "total_cost": df["total_cost"].sum(), + "avg_cost_per_run": df["total_cost"].mean() } # Create final data structure @@ -146,11 +286,13 @@ def extract_provider(model_name): "pareto_data": pareto_data.to_dict(orient="records"), "test_groups_data": test_groups_data.to_dict(orient="records"), "failure_analysis_data": failure_analysis_data, + "cost_breakdown_data": cost_breakdown_data.to_dict("records"), "raw_data": df[[ "Model", "Case", "test_group", "Duration", "Score_MermaidDiagramValid", "Score_UsageLimitNotExceeded", "Score_UsedBothMCPTools", "total_tokens", "provider", - "Metric_request_tokens", "Metric_response_tokens" + "Metric_request_tokens", "Metric_response_tokens", + "total_cost", "input_cost", "output_cost" ]].to_dict(orient="records"), "config": { "title": config.title,