Skip to content

Commit 8920704

Browse files
committed
repoqa: update models and add ack
1 parent 323aca4 commit 8920704

File tree

2 files changed

+106
-0
lines changed

2 files changed

+106
-0
lines changed

repoqa.html

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,28 @@ <h3 id="limit" class="text-nowrap mt-5">Known limitations</h3>
337337
(Thanks @chrisgorgo for the suggestion!)
338338
</li>
339339
</ul>
340+
<h2 id="sponsor" class="text-nowrap mt-5">🤗 Acknowledgment</h2>
341+
<p>
342+
Running long-context evaluations can be costly -- we thank
343+
<a href="https://deepmind.google/">Google DeepMind</a>
344+
and
345+
<a href="https://openai.com/form/researcher-access-program/"
346+
>OpenAI Researcher Access Program</a
347+
>
348+
for their generous API credits!
349+
</p>
350+
<p>
351+
Meanwhile, note that RepoQA is a transparent research project
352+
started by students at UIUC. We ensure the reproducibility and
353+
fairness of the evaluation as well as the independence of our
354+
benchmark design; none of these will be optimized or compromised
355+
for models from specific organizations. The outputs and results of
356+
evaluated models can be found at our
357+
<a
358+
href="https://github.com/evalplus/repoqa/releases/tag/dev-results"
359+
>GitHub release page</a
360+
>.
361+
</p>
340362
</div>
341363
</div>
342364
</div>

results/repoqa/COMBINED-RESULTS.json

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2015,6 +2015,90 @@
20152015
}
20162016
}
20172017
},
2018+
"microsoft/Phi-3-small-128k-instruct": {
2019+
"eval_date": "2024-05-23 02:31:23.338528",
2020+
"train_size": "128k",
2021+
"scores": {
2022+
"all": {
2023+
"0.0": { "pass@1": 0.792 },
2024+
"0.1": { "pass@1": 0.662 },
2025+
"0.2": { "pass@1": 0.612 },
2026+
"0.3": { "pass@1": 0.564 },
2027+
"0.4": { "pass@1": 0.526 },
2028+
"0.5": { "pass@1": 0.49 },
2029+
"0.6": { "pass@1": 0.466 },
2030+
"0.7": { "pass@1": 0.444 },
2031+
"0.8": { "pass@1": 0.396 },
2032+
"0.9": { "pass@1": 0.322 },
2033+
"1.0": { "pass@1": 0.242 }
2034+
},
2035+
"python": {
2036+
"0.0": { "pass@1": 0.74 },
2037+
"0.1": { "pass@1": 0.57 },
2038+
"0.2": { "pass@1": 0.52 },
2039+
"0.3": { "pass@1": 0.46 },
2040+
"0.4": { "pass@1": 0.41 },
2041+
"0.5": { "pass@1": 0.36 },
2042+
"0.6": { "pass@1": 0.34 },
2043+
"0.7": { "pass@1": 0.29 },
2044+
"0.8": { "pass@1": 0.25 },
2045+
"0.9": { "pass@1": 0.23 },
2046+
"1.0": { "pass@1": 0.21 }
2047+
},
2048+
"cpp": {
2049+
"0.0": { "pass@1": 0.83 },
2050+
"0.1": { "pass@1": 0.68 },
2051+
"0.2": { "pass@1": 0.62 },
2052+
"0.3": { "pass@1": 0.57 },
2053+
"0.4": { "pass@1": 0.55 },
2054+
"0.5": { "pass@1": 0.53 },
2055+
"0.6": { "pass@1": 0.52 },
2056+
"0.7": { "pass@1": 0.5 },
2057+
"0.8": { "pass@1": 0.48 },
2058+
"0.9": { "pass@1": 0.4 },
2059+
"1.0": { "pass@1": 0.35 }
2060+
},
2061+
"java": {
2062+
"0.0": { "pass@1": 0.83 },
2063+
"0.1": { "pass@1": 0.74 },
2064+
"0.2": { "pass@1": 0.69 },
2065+
"0.3": { "pass@1": 0.67 },
2066+
"0.4": { "pass@1": 0.6 },
2067+
"0.5": { "pass@1": 0.58 },
2068+
"0.6": { "pass@1": 0.55 },
2069+
"0.7": { "pass@1": 0.53 },
2070+
"0.8": { "pass@1": 0.46 },
2071+
"0.9": { "pass@1": 0.39 },
2072+
"1.0": { "pass@1": 0.3 }
2073+
},
2074+
"typescript": {
2075+
"0.0": { "pass@1": 0.9 },
2076+
"0.1": { "pass@1": 0.81 },
2077+
"0.2": { "pass@1": 0.78 },
2078+
"0.3": { "pass@1": 0.7 },
2079+
"0.4": { "pass@1": 0.68 },
2080+
"0.5": { "pass@1": 0.62 },
2081+
"0.6": { "pass@1": 0.58 },
2082+
"0.7": { "pass@1": 0.56 },
2083+
"0.8": { "pass@1": 0.49 },
2084+
"0.9": { "pass@1": 0.34 },
2085+
"1.0": { "pass@1": 0.14 }
2086+
},
2087+
"rust": {
2088+
"0.0": { "pass@1": 0.66 },
2089+
"0.1": { "pass@1": 0.51 },
2090+
"0.2": { "pass@1": 0.45 },
2091+
"0.3": { "pass@1": 0.42 },
2092+
"0.4": { "pass@1": 0.39 },
2093+
"0.5": { "pass@1": 0.36 },
2094+
"0.6": { "pass@1": 0.34 },
2095+
"0.7": { "pass@1": 0.34 },
2096+
"0.8": { "pass@1": 0.3 },
2097+
"0.9": { "pass@1": 0.25 },
2098+
"1.0": { "pass@1": 0.21 }
2099+
}
2100+
}
2101+
},
20182102
"gemini-1.5-flash-latest": {
20192103
"eval_date": "2024-05-19 04:32:12.200298",
20202104
"train_size": "1000k",

0 commit comments

Comments
 (0)