Merged
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -2,6 +2,9 @@ install:
uv sync
npm install -g @mermaid-js/mermaid-cli

upgrade:
uv sync -U

lint:
uv run ruff check .

91 changes: 91 additions & 0 deletions agents_mcp_usage/evaluations/mermaid_evals/costs.json
@@ -195,6 +195,29 @@
]
}
},
"gemini-2.5-flash-preview-05-20": {
"friendly_name": "Gemini 2.5 Flash Preview (May)",
"input": [
{
"up_to": "inf",
"price": 0.15
}
],
"output": {
"non_thinking": [
{
"up_to": "inf",
"price": 0.6
}
],
"thinking": [
{
"up_to": "inf",
"price": 3.5
}
]
}
},
"gemini-2.5-flash-preview": {
"friendly_name": "Gemini 2.5 Flash Preview",
"input": [
@@ -429,6 +452,74 @@
}
]
}
},
"bedrock:us.amazon.nova-micro-v1:0": {
"friendly_name": "Amazon Nova Micro",
"input": [
{
"up_to": "inf",
"price": 0.035
}
],
"output": {
"default": [
{
"up_to": "inf",
"price": 0.14
}
]
}
},
"bedrock:us.amazon.nova-lite-v1:0": {
"friendly_name": "Amazon Nova Lite",
"input": [
{
"up_to": "inf",
"price": 0.06
}
],
"output": {
"default": [
{
"up_to": "inf",
"price": 0.24
}
]
}
},
"bedrock:us.amazon.nova-pro-v1:0": {
"friendly_name": "Amazon Nova Pro",
"input": [
{
"up_to": "inf",
"price": 0.80
}
],
"output": {
"default": [
{
"up_to": "inf",
"price": 3.20
}
]
}
},
"bedrock:us.amazon.nova-premier-v1:0": {
"friendly_name": "Amazon Nova Premier",
"input": [
{
"up_to": "inf",
"price": 2.50
}
],
"output": {
"default": [
{
"up_to": "inf",
"price": 12.50
}
]
}
}
}
}
105 changes: 105 additions & 0 deletions agents_mcp_usage/evaluations/mermaid_evals/results/docs/README.md
@@ -0,0 +1,105 @@
# Merbench Visualization Documentation

This directory contains comprehensive documentation for understanding and using the Merbench evaluation output data to build visualizations.

## Documentation Files

### 1. [output_json_schema.md](./output_json_schema.md)
**Complete schema reference for the preprocessed JSON output**
- Detailed description of all data sections
- Filter system explanation (Model, Provider, Test Group)
- Implementation guidelines for AND logic filtering
- Visualization examples and best practices

### 2. [visualization_example.ts](./visualization_example.ts)
**Practical TypeScript implementation example**
- Complete working example of a cost vs performance scatter plot
- Filter application with real code
- Statistics update implementation
- Event handler setup for interactive filtering

### 3. [data_relationships_quickref.md](./data_relationships_quickref.md)
**Quick reference for data relationships and common operations**
- Key relationships between data sections
- Common query patterns
- Performance optimization tips
- Data validation checks

## Key Concepts

### Data Flow
```
CSV Input → preprocess_merbench_data.py → JSON Output → Visualizations
                        ↑
                Cost Calculations
                (from costs.json)
```

### Filter Types (AND Logic)
1. **Model Filter**: Select specific models
2. **Provider Filter**: Google, Amazon, OpenAI, Other
3. **Test Group Filter**: easy, medium, hard
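
A minimal sketch of combining the three filters with AND logic. The `activeFilters` shape and helper names are illustrative, not part of the schema:

```javascript
// Hypothetical filter state; an empty array means "no filter on that axis".
const activeFilters = {
  models: [],
  providers: ["Google"],
  testGroups: ["easy", "medium"],
};

// A row passes only if it matches every non-empty filter (AND logic).
function passesFilters(row, filters) {
  const matches = (selected, value) =>
    selected.length === 0 || selected.includes(value);
  return (
    matches(filters.models, row.Model) &&
    matches(filters.providers, row.provider) &&
    matches(filters.testGroups, row.test_group)
  );
}

// Example rows shaped like raw_data entries.
const rows = [
  { Model: "gemini-2.5-pro", provider: "Google", test_group: "easy" },
  { Model: "nova-micro", provider: "Amazon", test_group: "easy" },
];
const filtered = rows.filter((r) => passesFilters(r, activeFilters));
```

Clearing a filter is just resetting its array to empty, so one predicate serves every filter combination.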

### Primary Data Sections
- **raw_data**: Source for all filtering and aggregation
- **leaderboard**: Pre-aggregated model rankings
- **pareto_data**: Performance vs efficiency metrics
- **test_groups_data**: Performance by difficulty
- **cost_breakdown_data**: Detailed cost analysis
- **failure_analysis_data**: Failure reason counts

## Quick Start

1. **Load the JSON data**
```javascript
const data = await fetch('processed_results.json').then(r => r.json());
```

2. **Apply filters to raw_data**
```javascript
const filtered = data.raw_data.filter(row =>
row.provider === "Google" && row.test_group === "easy"
);
```

3. **Recalculate aggregates**
```javascript
const modelStats = {};
filtered.forEach(row => {
if (!modelStats[row.Model]) {
modelStats[row.Model] = {runs: 0, success: 0, cost: 0};
}
modelStats[row.Model].runs++;
modelStats[row.Model].success += row.Score_MermaidDiagramValid;
modelStats[row.Model].cost += row.total_cost;
});
```

4. **Create visualizations**
- Use pre-aggregated data for initial views
- Recalculate from filtered raw_data when filters change
- Update all related visualizations together

## Cost Calculation Notes

- Costs are calculated per token using tiered pricing from `costs.json`
- Failed tests (Score_UsageLimitNotExceeded = 0) have $0 cost
- Input and output costs are tracked separately
- Thinking tokens may have different pricing than regular output tokens
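
A sketch of the tier lookup, assuming the `{up_to, price}` tier shape from `costs.json` and flat (non-marginal) pricing at the selected tier; field names like `input_tokens` are assumptions about the row shape:

```javascript
// Illustrative tier list in the costs.json shape ("inf" caps the last tier).
const tiers = [
  { up_to: 200000, price: 1.25 },
  { up_to: "inf", price: 2.5 },
];

// Price all tokens at the first tier whose threshold the count does not exceed.
function tokenCost(tokens, tierList) {
  const tier = tierList.find((t) => t.up_to === "inf" || tokens <= t.up_to);
  return (tokens / 1_000_000) * tier.price;
}

// Failed runs (Score_UsageLimitNotExceeded === 0) cost $0.
function runCost(row, inputTiers, outputTiers) {
  if (row.Score_UsageLimitNotExceeded === 0) return 0;
  return (
    tokenCost(row.input_tokens, inputTiers) +
    tokenCost(row.output_tokens, outputTiers)
  );
}
```

Whether the preprocessor prices tiers flatly or marginally is not shown in this diff; adjust `tokenCost` to match `preprocess_merbench_data.py`.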

## Visualization Types

1. **Leaderboard Table**: Model rankings by success rate
2. **Pareto Scatter Plot**: Performance vs cost/duration/tokens
3. **Grouped Bar Charts**: Performance by test difficulty
4. **Stacked Bar Charts**: Failure reasons, cost breakdown
5. **Heatmaps**: Model × difficulty performance matrix

## Tips for Developers

- Always start filtering from `raw_data`
- Cache filter results for performance
- Use the `provider` field for color coding
- Show active filters in the UI
- Handle empty filter results gracefully
- Consider log scale for cost axes due to wide ranges
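
For the empty-results tip, one pattern is to short-circuit before computing averages so the UI never shows `NaN` (the `summarize` helper is illustrative):

```javascript
// Return a placeholder instead of NaN averages when nothing matches.
function summarize(rows) {
  if (rows.length === 0) {
    return { empty: true, message: "No runs match the active filters." };
  }
  const success = rows.reduce((s, r) => s + r.Score_MermaidDiagramValid, 0);
  return { empty: false, successRate: (success / rows.length) * 100 };
}
```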
@@ -0,0 +1,143 @@
# Data Relationships Quick Reference

## Key Relationships

### Primary Keys and Groupings

1. **Model** - Primary identifier across all data sections
2. **test_group** - Secondary grouping (easy, medium, hard)
3. **provider** - Derived from Model name (Google, Amazon, etc.)

### Data Section Dependencies

```
raw_data (source)
├── leaderboard (group by Model)
├── pareto_data (group by Model)
├── test_groups_data (group by Model + test_group)
├── failure_analysis_data (group by Model, count failures)
└── cost_breakdown_data (group by Model + test_group)
```

## Common Queries and Aggregations

### 1. Get Model Performance Summary
```javascript
// From raw_data
const modelSummary = rawData
.filter(r => r.Model === "gemini-2.5-pro")
.reduce((acc, r) => ({
successRate: acc.successRate + r.Score_MermaidDiagramValid,
totalCost: acc.totalCost + r.total_cost,
count: acc.count + 1
}), {successRate: 0, totalCost: 0, count: 0});

modelSummary.avgSuccessRate = modelSummary.successRate / modelSummary.count * 100;
```

### 2. Filter by Multiple Conditions
```javascript
// Get Amazon models on hard tests that succeeded
const filtered = rawData.filter(r =>
r.provider === "Amazon" &&
r.test_group === "hard" &&
r.Score_MermaidDiagramValid === 1
);
```

### 3. Calculate Cost Breakdown by Test Group
```javascript
// Group costs by difficulty
const costByDifficulty = {};
["easy", "medium", "hard"].forEach(group => {
  const groupData = rawData.filter(r => r.test_group === group);
  const totalCost = groupData.reduce((sum, r) => sum + r.total_cost, 0);
  costByDifficulty[group] = {
    avgCost: groupData.length ? totalCost / groupData.length : 0,
    totalCost
  };
});
```

## Pre-Aggregated vs. Raw Data Usage

### Use Pre-Aggregated Data When:
- Displaying initial unfiltered views
- Performance is critical
- Standard aggregations are sufficient

### Recalculate from Raw Data When:
- Filters are applied
- Custom aggregations are needed
- Multiple filter conditions are combined

## Filter Application Order

1. **Start with raw_data**
2. **Apply filters** (Model AND Provider AND TestGroup)
3. **Recalculate aggregations**
4. **Update visualizations**
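
The four steps above can be sketched end to end; the aggregation mirrors the README's Quick Start, and field names follow `raw_data`:

```javascript
// Step 1: start from raw_data (two illustrative rows).
const rawData = [
  { Model: "m1", provider: "Google", test_group: "easy",
    Score_MermaidDiagramValid: 1, total_cost: 0.01 },
  { Model: "m1", provider: "Google", test_group: "hard",
    Score_MermaidDiagramValid: 0, total_cost: 0.02 },
];

// Step 2: apply filters (here: provider AND test_group).
const filtered = rawData.filter(
  (r) => r.provider === "Google" && r.test_group === "easy"
);

// Step 3: recalculate aggregations per model.
const stats = {};
filtered.forEach((r) => {
  if (!stats[r.Model]) stats[r.Model] = { runs: 0, success: 0, cost: 0 };
  stats[r.Model].runs++;
  stats[r.Model].success += r.Score_MermaidDiagramValid;
  stats[r.Model].cost += r.total_cost;
});

// Step 4 would hand `stats` to the chart-update functions.
```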

## Cost Calculation Rules

- **Normal tests**: Cost = (input_tokens/1M × input_price) + (output_tokens/1M × output_price)
- **Failed tests** (Score_UsageLimitNotExceeded = 0): Cost = $0
- **Tiered pricing**: Price depends on total token count
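
Plugging illustrative numbers into the normal-test formula (prices here are made up, not taken from `costs.json`):

```javascript
const inputPrice = 0.15;  // $ per 1M input tokens (illustrative)
const outputPrice = 0.6;  // $ per 1M output tokens (illustrative)

const inputTokens = 12000;
const outputTokens = 4000;

// Cost = (input_tokens/1M × input_price) + (output_tokens/1M × output_price)
const cost =
  (inputTokens / 1_000_000) * inputPrice +
  (outputTokens / 1_000_000) * outputPrice;
// 0.0018 + 0.0024 ≈ $0.0042
```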

## Data Validation Checks

```javascript
// Ensure data consistency
function validateData(jsonData) {
// Check if model counts match
const rawModels = new Set(jsonData.raw_data.map(r => r.Model));
const leaderboardModels = new Set(jsonData.leaderboard.map(l => l.Model));

console.assert(rawModels.size === leaderboardModels.size,
"Model count mismatch between raw and leaderboard");

// Verify cost calculations
jsonData.raw_data.forEach(row => {
if (row.Score_UsageLimitNotExceeded === 0) {
console.assert(row.total_cost === 0,
`Failed test should have 0 cost: ${row.Model}`);
}
});
}
```

## Performance Optimization Tips

1. **Cache Filter Results**
```javascript
const filterCache = new Map();
function getCachedFilter(filterKey, rawData, filters) {
if (!filterCache.has(filterKey)) {
filterCache.set(filterKey, applyFilters(rawData, filters));
}
return filterCache.get(filterKey);
}
```

2. **Use Indexed Lookups**
```javascript
// Pre-index by model for fast lookups
const modelIndex = {};
rawData.forEach(row => {
if (!modelIndex[row.Model]) modelIndex[row.Model] = [];
modelIndex[row.Model].push(row);
});
```

3. **Batch Updates**
```javascript
// Update all visualizations at once
function updateAllVisualizations(filteredData) {
requestAnimationFrame(() => {
updateLeaderboard(filteredData);
updateParetoPlot(filteredData);
updateCostBreakdown(filteredData);
updateFailureAnalysis(filteredData);
});
}
```