Skip to content

Commit 6cd745f

Browse files
jmercat and neginraoof authored
default max tokens (#128)
* fix a few benchmarks such that importing any of them works properly * ran black formatter * ran black[colorama]==23.1.0 formatting * hand formatted last issue * set back default max_tokens when none * By default don't pass None arguments to benchmark init (which uses the default value instead of None). This avoids special handling of the max_tokens argument --------- Co-authored-by: Negin Raoof <neginmr@utexas.edu>
1 parent cc75611 commit 6cd745f

File tree

29 files changed

+71
-79
lines changed

29 files changed

+71
-79
lines changed

eval/chat_benchmarks/AIME24/eval_instruct.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def __init__(
2727
data_file: str = "eval/chat_benchmarks/AIME24/data/aime24.json",
2828
debug: bool = False,
2929
seed: List[int] = [0, 1234, 1234, 1234],
30-
max_tokens: Optional[int] = 32768,
30+
max_tokens: int = 32768,
3131
logger: Optional[logging.Logger] = None,
3232
system_instruction: Optional[str] = None,
3333
):
@@ -44,9 +44,7 @@ def __init__(
4444
super().__init__(logger=logger, system_instruction=system_instruction)
4545
self.data_file = data_file
4646
self.debug = debug
47-
self.max_new_tokens = (
48-
max_tokens if max_tokens is not None else 32768
49-
) # set higher to avoid truncation for reasoning models
47+
self.max_new_tokens = max_tokens
5048
self.seed = seed
5149
self.n_repeat = 10
5250

eval/chat_benchmarks/AIME25/eval_instruct.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def __init__(
4343
super().__init__(logger=logger, system_instruction=system_instruction)
4444
self.data_file = data_file
4545
self.debug = debug
46-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
46+
self.max_new_tokens = max_tokens
4747
self.seed = seed
4848
self.n_repeat = 10
4949

eval/chat_benchmarks/AIW/eval_instruct.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def __init__(
2323
data_file: str = "eval/chat_benchmarks/AIW/data/aiw_data.json",
2424
debug: bool = False,
2525
seed: List[int] = [0, 1234, 1234, 1234],
26-
max_tokens: Optional[int] = 32768,
26+
max_tokens: int = 32768,
2727
logger: Optional[logging.Logger] = None,
2828
system_instruction: Optional[str] = None,
2929
n_trials: int = 100, # Run 100 trials
@@ -41,7 +41,7 @@ def __init__(
4141
super().__init__(logger=logger, system_instruction=system_instruction)
4242
self.data_file = data_file
4343
self.debug = debug
44-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
44+
self.max_new_tokens = max_tokens
4545
self.seed = seed
4646
self.n_trials = n_trials
4747

eval/chat_benchmarks/AMC23/eval_instruct.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def __init__(
2929
data_file: str = "eval/chat_benchmarks/AMC23/data/amc23.json",
3030
debug: bool = False,
3131
seed: List[int] = [0, 1234, 1234, 1234],
32-
max_tokens: Optional[int] = 32768,
32+
max_tokens: int = 32768,
3333
logger: Optional[logging.Logger] = None,
3434
system_instruction: Optional[str] = None,
3535
):
@@ -47,7 +47,7 @@ def __init__(
4747
self.data_file = data_file
4848
self.debug = debug
4949
self.seed = seed
50-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
50+
self.max_new_tokens = max_tokens
5151
self.n_repeat = 10
5252

5353
def generate_responses(self, model: LM) -> Dict[str, Any]:

eval/chat_benchmarks/BigCodeBench/eval_instruct.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def __init__(
7070
self,
7171
language: str = "python",
7272
data_dir: str = BIGCODEBENCH_PATH,
73-
max_tokens: Optional[int] = 1280,
73+
max_tokens: int = 1280,
7474
num_workers: int = 32,
7575
timeout: float = 120,
7676
debug: bool = False,
@@ -98,7 +98,7 @@ def __init__(
9898
self.language = language
9999
os.makedirs(data_dir, exist_ok=True)
100100
self.data_dir = data_dir
101-
self.max_tokens = max_tokens if max_tokens is not None else 1280
101+
self.max_tokens = max_tokens
102102
self.num_workers = num_workers
103103
self.timeout = timeout
104104
self.debug = debug

eval/chat_benchmarks/CodeElo/eval_instruct.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def __init__(
4848
self,
4949
debug: bool = False,
5050
seed: List[int] = [0, 1234, 1234, 1234],
51-
max_tokens: Optional[int] = None,
51+
max_tokens: int = 32768,
5252
logger: Optional[logging.Logger] = None,
5353
system_instruction: Optional[str] = None,
5454
):
@@ -63,9 +63,7 @@ def __init__(
6363
"""
6464
super().__init__(logger=logger, system_instruction=system_instruction)
6565
self.debug = debug
66-
self.max_new_tokens = (
67-
max_tokens if max_tokens is not None else 32768
68-
) # set higher to avoid truncation for reasoning models
66+
self.max_new_tokens = max_tokens
6967
self.seed = seed
7068
self.n_repeat = 3
7169
self.filter_interaction_questions = True

eval/chat_benchmarks/CodeForces/eval_instruct.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def __init__(
4747
self,
4848
debug: bool = False,
4949
seed: List[int] = [0, 1234, 1234, 1234],
50-
max_tokens: Optional[int] = None,
50+
max_tokens: int = 32768,
5151
logger: Optional[logging.Logger] = None,
5252
system_instruction: Optional[str] = None,
5353
):
@@ -62,8 +62,7 @@ def __init__(
6262
"""
6363
super().__init__(logger=logger, system_instruction=system_instruction)
6464
self.debug = debug
65-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
66-
self.seed = seed
65+
self.max_new_tokens = max_tokens
6766
self.n_repeat = 3
6867
self.filter_interaction_questions = True
6968

eval/chat_benchmarks/CruxEval/eval_instruct.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ class CruxEvalBenchmark(BaseBenchmark):
132132
def __init__(
133133
self,
134134
data_dir: str = CruxEval_PATH,
135-
max_tokens: Optional[int] = 2048,
135+
max_tokens: int = 2048,
136136
num_workers: int = 32,
137137
timeout: float = 120,
138138
debug: bool = False,
@@ -155,7 +155,7 @@ def __init__(
155155
self.language = "python"
156156
os.makedirs(data_dir, exist_ok=True)
157157
self.data_dir = data_dir
158-
self.max_tokens = max_tokens if max_tokens is not None else 2048
158+
self.max_tokens = max_tokens
159159
self.num_workers = num_workers
160160
self.timeout = timeout
161161
self.debug = debug

eval/chat_benchmarks/GPQADiamond/eval_instruct.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def __init__(
3535
self,
3636
debug: bool = False,
3737
seed: List[int] = [0, 1234, 1234, 1234],
38-
max_tokens: Optional[int] = 32768,
38+
max_tokens: int = 32768,
3939
logger: Optional[logging.Logger] = None,
4040
system_instruction: Optional[str] = None,
4141
):
@@ -51,7 +51,7 @@ def __init__(
5151
self.dataset_name = "Idavidrein/gpqa"
5252
self.debug = debug
5353
self.seed = seed
54-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
54+
self.max_new_tokens = max_tokens
5555
self.n_repeat = 3
5656

5757
def generate_responses(self, model: LM) -> Dict[str, Any]:

eval/chat_benchmarks/HLE/eval_instruct.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def __init__(
6363
self,
6464
debug: bool = False,
6565
seed: List[int] = [0, 1234, 1234, 1234],
66-
max_tokens: Optional[int] = None,
66+
max_tokens: int = 32768,
6767
logger: Optional[logging.Logger] = None,
6868
system_instruction: Optional[str] = None,
6969
):
@@ -77,9 +77,7 @@ def __init__(
7777
"""
7878
super().__init__(logger=logger, system_instruction=system_instruction)
7979
self.debug = debug
80-
self.max_new_tokens = (
81-
max_tokens if max_tokens is not None else 32768
82-
) # set higher to avoid truncation for reasoning models
80+
self.max_new_tokens = max_tokens
8381
self.seed = seed
8482
self.n_repeat = 3
8583

0 commit comments

Comments (0)