Skip to content

Commit cc75611

Browse files
authored
fix a few benchmarks so that importing any of them works properly (#127)
* fix a few benchmarks so that importing any of them works properly
* ran black formatter
* ran black[colorama]==23.1.0 formatting
* hand-formatted last issue
* restored default max_tokens when None
1 parent 81180cb commit cc75611
1 parent 81180cb commit cc75611

File tree

77 files changed

+242
-258
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

77 files changed

+242
-258
lines changed

eval/chat_benchmarks/AIME24/eval_instruct.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,9 @@ def __init__(
4444
super().__init__(logger=logger, system_instruction=system_instruction)
4545
self.data_file = data_file
4646
self.debug = debug
47-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768 # set higher to avoid truncation for reasoning models
47+
self.max_new_tokens = (
48+
max_tokens if max_tokens is not None else 32768
49+
) # set higher to avoid truncation for reasoning models
4850
self.seed = seed
4951
self.n_repeat = 10
5052

eval/chat_benchmarks/AIME25/eval_instruct.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def __init__(
4343
super().__init__(logger=logger, system_instruction=system_instruction)
4444
self.data_file = data_file
4545
self.debug = debug
46-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
46+
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
4747
self.seed = seed
4848
self.n_repeat = 10
4949

eval/chat_benchmarks/AIW/eval_instruct.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def __init__(
4141
super().__init__(logger=logger, system_instruction=system_instruction)
4242
self.data_file = data_file
4343
self.debug = debug
44-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
44+
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
4545
self.seed = seed
4646
self.n_trials = n_trials
4747

eval/chat_benchmarks/AMC23/eval_instruct.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def __init__(
4747
self.data_file = data_file
4848
self.debug = debug
4949
self.seed = seed
50-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
50+
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
5151
self.n_repeat = 10
5252

5353
def generate_responses(self, model: LM) -> Dict[str, Any]:

eval/chat_benchmarks/BigCodeBench/execution.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,7 @@ def check_correctness(
6060
def unsafe_execute(tmp_dir):
6161
random_id = random.randint(1, 100000)
6262
if "python" in language_type.lower():
63-
6463
with create_tempdir():
65-
6664
# These system calls are needed when cleaning up tempdir.
6765
import os
6866
import shutil

eval/chat_benchmarks/CodeElo/codeelo_utils.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,17 @@
1818

1919
import scipy.stats as stats
2020

21+
2122
def rating_to_difficulty(rating):
2223
if rating < 1000:
23-
return 'Easy'
24+
return "Easy"
2425
if rating < 1300:
25-
return 'Medium'
26+
return "Medium"
2627
if rating <= 3500:
27-
return 'Hard'
28+
return "Hard"
29+
30+
return "Easy"
2831

29-
return 'Easy'
3032

3133
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
3234
"""
@@ -250,11 +252,13 @@ def codeelo_run(problem, completion, timeout, is_extracted):
250252
outs = tc[1]
251253
testtype = "stdin"
252254

253-
test_cases.append({
254-
"input": ins,
255-
"output": outs,
256-
"testtype": testtype,
257-
})
255+
test_cases.append(
256+
{
257+
"input": ins,
258+
"output": outs,
259+
"testtype": testtype,
260+
}
261+
)
258262

259263
manager = multiprocessing.Manager()
260264
result = manager.list()

eval/chat_benchmarks/CodeElo/eval_instruct.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def __init__(
4848
self,
4949
debug: bool = False,
5050
seed: List[int] = [0, 1234, 1234, 1234],
51-
max_tokens: Optional[int] = None,
51+
max_tokens: Optional[int] = None,
5252
logger: Optional[logging.Logger] = None,
5353
system_instruction: Optional[str] = None,
5454
):
@@ -63,7 +63,9 @@ def __init__(
6363
"""
6464
super().__init__(logger=logger, system_instruction=system_instruction)
6565
self.debug = debug
66-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768 # set higher to avoid truncation for reasoning models
66+
self.max_new_tokens = (
67+
max_tokens if max_tokens is not None else 32768
68+
) # set higher to avoid truncation for reasoning models
6769
self.seed = seed
6870
self.n_repeat = 3
6971
self.filter_interaction_questions = True

eval/chat_benchmarks/CodeForces/codeforces_utils.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,19 @@
1818

1919
import scipy.stats as stats
2020

21+
2122
def rating_to_difficulty(rating):
2223
if not rating:
23-
return 'Easy'
24+
return "Easy"
2425
if rating < 1000:
25-
return 'Easy'
26+
return "Easy"
2627
if rating < 1300:
27-
return 'Medium'
28+
return "Medium"
2829
if rating <= 3500:
29-
return 'Hard'
30+
return "Hard"
31+
32+
return "Easy"
3033

31-
return 'Easy'
3234

3335
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
3436
"""
@@ -155,8 +157,8 @@ def run_test_std(completion, test_input, test_output):
155157
sys.stdin = io.StringIO(test_input)
156158
try:
157159
exec(f'__name__ = "__main__"\n{completion}' if '__name__ == "__main__"' in completion else completion, {})
158-
out = output.getvalue().strip().replace('\n',' ').replace('\r', '')
159-
expected = test_output.strip().replace('\n', ' ').replace('\r', '')
160+
out = output.getvalue().strip().replace("\n", " ").replace("\r", "")
161+
expected = test_output.strip().replace("\n", " ").replace("\r", "")
160162

161163
return out == expected, output.getvalue().strip()
162164
finally:
@@ -247,7 +249,6 @@ def run_tests_for_one_example(test_cases, completion, result_list, is_extracted)
247249
return
248250

249251

250-
251252
def codeforces_run(problem, completion, timeout, is_extracted):
252253
test_cases = problem["official_tests"]
253254
test_cases = [{**x, "testtype": "stdin"} for x in test_cases]

eval/chat_benchmarks/CodeForces/eval_instruct.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,7 @@ def __init__(
6262
"""
6363
super().__init__(logger=logger, system_instruction=system_instruction)
6464
self.debug = debug
65-
self.max_new_tokens = (
66-
max_tokens if max_tokens is not None else 32768
67-
)
65+
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
6866
self.seed = seed
6967
self.n_repeat = 3
7068
self.filter_interaction_questions = True

eval/chat_benchmarks/CruxEval/evaluation.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@ def evaluate_generations(
7272
sample_jsonl = stream_jsonl_all(input_file)
7373

7474
with ThreadPoolExecutor(max_workers=n_workers) as executor:
75-
7675
futures = []
7776
completion_id = Counter()
7877
n_samples = 0

0 commit comments

Comments (0)