Skip to content

Commit b8dfad4

Browse files
committed
feat: update model benchmark schema and data
- Add required fields for all benchmark metrics in model schema
- Update all model manifests with complete benchmark data
- Add mmmu, mmmuPro, webDevArena fields
- Update existing benchmark values (terminalBench, etc.)
- Update cursor IDE manifest
1 parent 9397383 commit b8dfad4

28 files changed

+123
-36
lines changed

manifests/$schemas/model.schema.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,15 @@
9999
"description": "LiveCodeBench Pass@1 score (percentage)"
100100
}
101101
},
102+
"required": [
103+
"sweBench",
104+
"terminalBench",
105+
"mmmu",
106+
"mmmuPro",
107+
"webDevArena",
108+
"sciCode",
109+
"liveCodeBench"
110+
],
102111
"additionalProperties": false
103112
},
104113
"platformUrls": {

manifests/ides/cursor.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@
8181
"relatedProducts": [
8282
{
8383
"type": "cli",
84-
"productId": "cursor-agent"
84+
"productId": "cursor-cli"
8585
}
8686
],
8787
"platforms": [

manifests/models/claude-haiku-4-5.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,12 @@
2525
"capabilities": ["function-calling", "tool-choice", "structured-outputs", "reasoning"],
2626
"benchmarks": {
2727
"sweBench": null,
28-
"terminalBench": 0.298,
28+
"terminalBench": 0.355,
2929
"sciCode": null,
30-
"liveCodeBench": null
30+
"liveCodeBench": null,
31+
"mmmu": null,
32+
"mmmuPro": null,
33+
"webDevArena": null
3134
},
3235
"platformUrls": {
3336
"huggingface": null,

manifests/models/claude-opus-4-1.json

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,13 @@
2424
"inputModalities": ["text"],
2525
"capabilities": ["function-calling", "tool-choice", "structured-outputs"],
2626
"benchmarks": {
27-
"sweBench": null,
28-
"terminalBench": 0.38,
27+
"sweBench": 74.4,
28+
"terminalBench": 0.631,
2929
"sciCode": null,
30-
"liveCodeBench": 46.9
30+
"liveCodeBench": 46.9,
31+
"mmmu": null,
32+
"mmmuPro": null,
33+
"webDevArena": null
3134
},
3235
"platformUrls": {
3336
"huggingface": null,

manifests/models/claude-opus-4.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,13 @@
2424
"inputModalities": ["text"],
2525
"capabilities": ["function-calling", "tool-choice", "structured-outputs"],
2626
"benchmarks": {
27-
"sweBench": null,
27+
"sweBench": 67.6,
2828
"terminalBench": 0.578,
2929
"sciCode": null,
30-
"liveCodeBench": 56.6
30+
"liveCodeBench": 56.6,
31+
"mmmu": null,
32+
"mmmuPro": null,
33+
"webDevArena": null
3134
},
3235
"platformUrls": {
3336
"huggingface": null,

manifests/models/claude-sonnet-4-5.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,13 @@
2424
"inputModalities": ["text"],
2525
"capabilities": ["function-calling", "tool-choice", "structured-outputs"],
2626
"benchmarks": {
27-
"sweBench": null,
27+
"sweBench": 70.6,
2828
"terminalBench": 0.428,
2929
"sciCode": null,
30-
"liveCodeBench": 47.1
30+
"liveCodeBench": 47.1,
31+
"mmmu": null,
32+
"mmmuPro": null,
33+
"webDevArena": null
3134
},
3235
"platformUrls": {
3336
"huggingface": null,

manifests/models/claude-sonnet-4.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,13 @@
2424
"inputModalities": ["text"],
2525
"capabilities": ["function-calling", "tool-choice", "structured-outputs", "reasoning"],
2626
"benchmarks": {
27-
"sweBench": null,
27+
"sweBench": 64.93,
2828
"terminalBench": 0.428,
2929
"sciCode": null,
30-
"liveCodeBench": 55.9
30+
"liveCodeBench": 55.9,
31+
"mmmu": null,
32+
"mmmuPro": null,
33+
"webDevArena": null
3134
},
3235
"platformUrls": {
3336
"huggingface": null,

manifests/models/composer.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,10 @@
2727
"sweBench": null,
2828
"terminalBench": null,
2929
"sciCode": null,
30-
"liveCodeBench": null
30+
"liveCodeBench": null,
31+
"mmmu": null,
32+
"mmmuPro": null,
33+
"webDevArena": null
3134
},
3235
"platformUrls": {
3336
"huggingface": null,

manifests/models/deepseek-r1.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,10 @@
2727
"sweBench": null,
2828
"terminalBench": null,
2929
"sciCode": 4.6,
30-
"liveCodeBench": 73.1
30+
"liveCodeBench": 73.1,
31+
"mmmu": null,
32+
"mmmuPro": null,
33+
"webDevArena": null
3134
},
3235
"platformUrls": {
3336
"huggingface": "https://huggingface.co/deepseek-ai/DeepSeek-R1",

manifests/models/deepseek-v3-terminus.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,10 @@
2727
"sweBench": null,
2828
"terminalBench": null,
2929
"sciCode": 3.1,
30-
"liveCodeBench": 27.2
30+
"liveCodeBench": 27.2,
31+
"mmmu": null,
32+
"mmmuPro": null,
33+
"webDevArena": null
3134
},
3235
"platformUrls": {
3336
"huggingface": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1-Terminus",

0 commit comments

Comments
 (0)