Skip to content

Commit 186704d

Browse files
FatPigeorz and ganler
authored
feat: evalperf leaderboard (#23)
* feat: evalperf * add llama-3.1-nemontron * add winrate * fix winrate * add deepseek chat link * a bunch of fixes * front page --------- Co-authored-by: ganler <[email protected]>
1 parent c9defa4 commit 186704d

File tree

29 files changed

+615
-126
lines changed

29 files changed

+615
-126
lines changed

.pre-commit-config.yaml

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,17 @@
11
repos:
22
- repo: https://github.com/pycqa/isort
3-
rev: 5.12.0
3+
rev: 5.13.2
44
hooks:
55
- id: isort
66
name: isort (python)
77
args: ["--profile", "black"]
88
- repo: https://github.com/psf/black
9-
rev: 22.6.0
9+
rev: 24.10.0
1010
hooks:
1111
- id: black
1212
- repo: https://github.com/pre-commit/pre-commit-hooks
13-
rev: v4.5.0
13+
rev: v5.0.0
1414
hooks:
1515
- id: check-yaml
1616
- id: end-of-file-fixer
1717
- id: trailing-whitespace
18-
- repo: https://github.com/pre-commit/mirrors-prettier
19-
rev: "v3.1.0"
20-
hooks:
21-
- id: prettier

evalperf.html

Lines changed: 284 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,284 @@
1+
<!doctype html>
<html lang="en">

<head>
  <meta charset="UTF-8" />
  <!-- Bootstrap's responsive layout expects a viewport declaration. -->
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>
    EvalPerf: Evaluating Language Models for Efficient Code Generation
  </title>

  <!-- Web font (JetBrains Mono): connection hints first, then the stylesheet.
       These belong inside <head>, not between <html> and <head>. -->
  <link rel="preconnect" href="https://fonts.googleapis.com" />
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
  <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@100;400&display=swap" rel="stylesheet" />

  <script src="https://cdnjs.cloudflare.com/ajax/libs/PapaParse/5.3.0/papaparse.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/echarts.min.js"></script>
  <link rel="icon" href="https://images.emojiterra.com/google/noto-emoji/unicode-15/color/1024px/1f9d1-1f4bb.png" />
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" />

  <!-- Prism (theme, core, autoloader plugin) for the language-bash snippet in the body. -->
  <link href="https://cdn.jsdelivr.net/npm/[email protected]/themes/prism.css" rel="stylesheet" />
  <script src="https://cdn.jsdelivr.net/npm/[email protected]/components/prism-core.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/[email protected]/plugins/autoloader/prism-autoloader.min.js"></script>

  <style>
    body {
      font-family: "JetBrains Mono", monospace;
      background-color: #ffffff;
      color: #000000;
    }

    th,
    td {
      text-align: left;
      width: fit-content;
      font-size: larger;
    }

    #notes h3 {
      margin-top: 1em;
      font-size: 2em;
      text-align: center;
    }
  </style>
</head>
43+
44+
<body>
45+
<div id="content" class="container d-flex flex-column align-items-center gap-3">
46+
<h1 class="text-nowrap mt-5" style="font-size: xx-large;">
47+
<b>Evaluating LLMs for Efficient Code Generation</b>
48+
</h1>
49+
<!-- Badge row: paper, source repository, and PyPI package. Each image is the
     only content of its link, so its alt text is the link's accessible name. -->
<div class="d-flex flex-row justify-content-center gap-3">
  <a href="https://openreview.net/forum?id=IBCBMeAhmC"><img
      src="https://img.shields.io/badge/Paper-COLM'24-a55fed.svg?style=for-the-badge"
      alt="Paper - COLM'24" class="img-fluid" /></a>
  <a href="https://github.com/evalplus/evalplus"><img
      src="https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white"
      alt="github" class="img-fluid" /></a>
  <a href="https://pypi.org/project/evalplus"><img alt="PyPI - Version"
      src="https://img.shields.io/pypi/v/evalplus?style=for-the-badge&labelColor=black" class="img-fluid" />
  </a>
</div>
59+
<div class="container d-flex flex-row flex-nowrap fs-5">
60+
61+
62+
<div class="container d-flex flex-column align-items-center">
63+
<div>
64+
🚀 Code Efficiency Evaluation requires:
65+
<ul>
66+
<li><strong>Performance-exercising tasks & inputs</strong>
67+
</li>
68+
<li><strong>Meaningful compound metric:</strong>
69+
</li>
70+
</ul>
71+
<p>Based on <strong>Differential Performance Evaluation</strong>, the EvalPerf dataset (current
72+
version 20240328) includes:</p>
73+
<ul>
74+
<li>118 performance-exercising tasks</li>
75+
<li>Each task is equipped with a <i>computationally challenging test input</i> generated by the SaS
76+
generator</li>
77+
<li>Differential Performance Score (DPS): <i>"DPS=80"</i> means <i>"submissions can outperform 80%
78+
of LLM solutions..."</i></li>
79+
<li>Pairwise comparison of LLMs' code efficiency over common passing tasks to ablate correctness impact
80+
</li>
81+
</ul>
82+
Check out our <a href="https://jw-liu.xyz/assets/pdf/jiawei-colm-evalperf-poster.pdf">COLM'24 poster</a> for
83+
a more detailed overview!
84+
</div>
85+
86+
<pre style="padding-top: 0; padding-bottom: 0;">
87+
<code class="language-bash">
88+
pip install --upgrade "evalplus[perf,vllm] @ git+https://github.com/evalplus/evalplus"
89+
# Or `pip install "evalplus[perf,vllm]" --upgrade` for the latest stable release
90+
91+
sudo sh -c 'echo 0 > /proc/sys/kernel/perf_event_paranoid' # Enable perf
92+
evalplus.evalperf --model "ise-uiuc/Magicoder-S-DS-6.7B" \
93+
--backend vllm</code>
94+
</pre>
95+
<br />
96+
<table id="leaderboard"
97+
class="table table-responsive table-striped table-bordered flex-shrink-1 border border-5">
98+
</table>
99+
<h2 id="sponsor" class="text-nowrap mt-5">🤗 Acknowledgment</h2>
100+
<p>
101+
We thank
102+
<a href="https://openai.com/form/researcher-access-program/">OpenAI Researcher Access Program</a> for
103+
providing part of the compute.
104+
</p>
105+
</div>
106+
</div>
107+
</div>
108+
109+
<script>
110+
// Table element the leaderboard is rendered into.
const contextTable = document.getElementById("leaderboard");
// Display name -> URL used for the model's link in the table.
// Filled below while walking the loaded results.
const linkMapping = new Map([]);
const hfLinkPrefix = "https://huggingface.co/";
const dataUrlPrefix = "results/evalperf";

// Models without an "org--name" id are matched by id prefix.
// Entries are checked in order and the first matching prefix wins.
const apiModelLinks = [
  ["gpt-4-", "https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4"],
  ["gpt-3.5-", "https://platform.openai.com/docs/models/gpt-3-5-turbo"],
  ["claude-3-", "https://www.anthropic.com/news/claude-3-family"],
  [
    "gemini-1.5-pro",
    "https://blog.google/technology/ai/google-gemini-next-generation-model-february-2024/#sundar-note",
  ],
  ["gemini-1.5-flash", "https://deepmind.google/technologies/gemini/flash/"],
  ["gpt-4o-", "https://openai.com/index/hello-gpt-4o/"],
  ["deepseek-chat", "https://chat.deepseek.com/"],
];

// Load data
var data = null;
var dataUrl = dataUrlPrefix + "/COMBINED-RESULTS.json";
var xhr = new XMLHttpRequest();
xhr.open("GET", dataUrl, false); // false makes the request synchronous
try {
  xhr.send();
} catch (err) {
  // A synchronous send() throws on a network error; xhr.status then stays 0,
  // so the failure branch below reports it instead of aborting the script.
}

if (xhr.status === 200) {
  var results = JSON.parse(xhr.responseText);
  // modelId -> Map of that model's fields (both levels converted to Map).
  data = new Map(
    Object.entries(results).map(([modelId, value]) => [
      modelId,
      new Map(Object.entries(value)),
    ]),
  );
  data.forEach((_value, modelId) => {
    // add link to model
    if (modelId.includes("--")) {
      // "org--name" ids link to the Hugging Face page, keyed by the bare name.
      const [modelOrg, modelName] = modelId.split("--");
      linkMapping.set(modelName, hfLinkPrefix + modelOrg + "/" + modelName);
      return;
    }
    const known = apiModelLinks.find(([prefix]) => modelId.startsWith(prefix));
    if (known) {
      linkMapping.set(modelId, known[1]);
    }
  });
} else {
  alert(
    "Failed to load data from " + dataUrl + ". Please try again later.",
  );
  // Keep the rest of the script usable: an empty result set renders an
  // empty table instead of failing on a null value later.
  data = new Map();
}
const globalData = data;
const winrate_tag = "🏆 Win Rate (%)";

// each row represents a model
const theaders = [
  "#", // rank
  "Model", // model name
  "DPS",
  // "DPS Norm",
  "pass@1",
  winrate_tag, // computed over the same set of passing solutions
];
185+
186+
/**
 * Render the leaderboard into `table`.
 *
 * Appends a <thead> built from `theaders` and a <tbody> with one row per
 * entry of `globalData`, ordered by "win_rate" descending. Models whose win
 * rate is equal after rounding to three decimals share a rank: only the first
 * row of such a group shows the rank number, and the next distinct win rate
 * skips ahead by the size of the group. Ranks 1-3 get a medal prefix.
 *
 * @param {HTMLTableElement} table - element that receives the header and body.
 */
const displayTable = (table) => {
  // Append a <td> holding `text` to `tr` and hand the cell back.
  const appendCell = (tr, text) => {
    const td = document.createElement("td");
    td.textContent = text;
    tr.appendChild(td);
    return td;
  };

  // headers
  const thead = document.createElement("thead");
  const headerRow = document.createElement("tr");
  theaders.forEach((header) => {
    const th = document.createElement("th");
    th.scope = "col";
    th.classList.add("text-nowrap");
    th.textContent = header;
    if (header === winrate_tag) {
      th.style.backgroundColor = "#EEFFEE";
    }
    headerRow.appendChild(th);
  });
  thead.appendChild(headerRow);
  table.appendChild(thead);

  // Flatten each [modelId, fields] entry into one Map per row. Local on
  // purpose: the loaded data is not overwritten. A missing data set (failed
  // load) is treated as empty so only the header is rendered.
  const rows = Array.from(
    globalData || [],
    ([modelId, value]) => new Map([["modelId", modelId], ...value]),
  );
  rows.sort((a, b) => b.get("win_rate") - a.get("win_rate"));

  const medals = ["🥇 ", "🥈 ", "🥉 "];
  const tbody = document.createElement("tbody");
  let rank = 0;
  let lastWinRate = null; // rounded win rate of the current rank group
  let groupSize = 1; // number of rows sharing the current rank
  rows.forEach((row) => {
    const dataRow = document.createElement("tr");
    // rank (left blank for every row of a tie except the first)
    const rankCell = appendCell(dataRow, "");

    const modelId = row.get("modelId");
    const modelName = modelId.includes("--") ? modelId.split("--")[1] : modelId;

    const winRate = row.get("win_rate");
    const roundedWinRate = winRate.toFixed(3);
    if (roundedWinRate !== lastWinRate) {
      rank += groupSize;
      lastWinRate = roundedWinRate;
      groupSize = 1;
      rankCell.textContent = rank;
    } else {
      groupSize += 1;
    }

    // model name, linked when a URL is known for it
    const modelLink = document.createElement("a");
    modelLink.textContent = (medals[rank - 1] || "") + modelName;
    if (linkMapping.has(modelName)) {
      modelLink.href = linkMapping.get(modelName);
    }
    modelLink.classList.add("link-underline-primary", "text-nowrap");
    const modelCell = document.createElement("td");
    modelCell.appendChild(modelLink);
    dataRow.appendChild(modelCell);

    appendCell(dataRow, row.get("dps").toFixed(1));
    // appendCell(dataRow, row.get("dps_norm").toFixed(1));
    appendCell(dataRow, row.get("pass@1").toFixed(1));
    const winRateCell = appendCell(dataRow, (winRate * 100).toFixed(1));
    winRateCell.style.backgroundColor = "#EEFFEE";

    tbody.appendChild(dataRow);
  });
  table.appendChild(tbody);
};
270+
271+
// Drop whatever header/body the leaderboard table currently holds.
const clearTable = function () {
  contextTable.innerHTML = "";
};

// Page entry point: reset the table, then render it from the loaded results.
const main = function () {
  clearTable();
  displayTable(contextTable);
};

// Render once, immediately; the table element is declared earlier in the page.
main();
281+
</script>
282+
</body>
283+
284+
</html>

0 commit comments

Comments
 (0)