Skip to content

Commit 6cd745f

Browse files
jmercat and neginraoof authored
default max tokens (#128)
* fix a few benchmarks such that importing any of them works properly * ran black formatter * ran black[colorama]==23.1.0 formatting * hand formatted last issue * set back default max_tokens when none * By default don't pass None arguments to benchmark init (which uses the default value instead of None). This avoids special handling of the max_tokens argument --------- Co-authored-by: Negin Raoof <neginmr@utexas.edu>
1 parent cc75611 commit 6cd745f

File tree

29 files changed

+71
-79
lines changed

29 files changed

+71
-79
lines changed

eval/chat_benchmarks/AIME24/eval_instruct.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def __init__(
2727
data_file: str = "eval/chat_benchmarks/AIME24/data/aime24.json",
2828
debug: bool = False,
2929
seed: List[int] = [0, 1234, 1234, 1234],
30-
max_tokens: Optional[int] = 32768,
30+
max_tokens: int = 32768,
3131
logger: Optional[logging.Logger] = None,
3232
system_instruction: Optional[str] = None,
3333
):
@@ -44,9 +44,7 @@ def __init__(
4444
super().__init__(logger=logger, system_instruction=system_instruction)
4545
self.data_file = data_file
4646
self.debug = debug
47-
self.max_new_tokens = (
48-
max_tokens if max_tokens is not None else 32768
49-
) # set higher to avoid truncation for reasoning models
47+
self.max_new_tokens = max_tokens
5048
self.seed = seed
5149
self.n_repeat = 10
5250

eval/chat_benchmarks/AIME25/eval_instruct.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def __init__(
4343
super().__init__(logger=logger, system_instruction=system_instruction)
4444
self.data_file = data_file
4545
self.debug = debug
46-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
46+
self.max_new_tokens = max_tokens
4747
self.seed = seed
4848
self.n_repeat = 10
4949

eval/chat_benchmarks/AIW/eval_instruct.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def __init__(
2323
data_file: str = "eval/chat_benchmarks/AIW/data/aiw_data.json",
2424
debug: bool = False,
2525
seed: List[int] = [0, 1234, 1234, 1234],
26-
max_tokens: Optional[int] = 32768,
26+
max_tokens: int = 32768,
2727
logger: Optional[logging.Logger] = None,
2828
system_instruction: Optional[str] = None,
2929
n_trials: int = 100, # Run 100 trials
@@ -41,7 +41,7 @@ def __init__(
4141
super().__init__(logger=logger, system_instruction=system_instruction)
4242
self.data_file = data_file
4343
self.debug = debug
44-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
44+
self.max_new_tokens = max_tokens
4545
self.seed = seed
4646
self.n_trials = n_trials
4747

eval/chat_benchmarks/AMC23/eval_instruct.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def __init__(
2929
data_file: str = "eval/chat_benchmarks/AMC23/data/amc23.json",
3030
debug: bool = False,
3131
seed: List[int] = [0, 1234, 1234, 1234],
32-
max_tokens: Optional[int] = 32768,
32+
max_tokens: int = 32768,
3333
logger: Optional[logging.Logger] = None,
3434
system_instruction: Optional[str] = None,
3535
):
@@ -47,7 +47,7 @@ def __init__(
4747
self.data_file = data_file
4848
self.debug = debug
4949
self.seed = seed
50-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
50+
self.max_new_tokens = max_tokens
5151
self.n_repeat = 10
5252

5353
def generate_responses(self, model: LM) -> Dict[str, Any]:

eval/chat_benchmarks/BigCodeBench/eval_instruct.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def __init__(
7070
self,
7171
language: str = "python",
7272
data_dir: str = BIGCODEBENCH_PATH,
73-
max_tokens: Optional[int] = 1280,
73+
max_tokens: int = 1280,
7474
num_workers: int = 32,
7575
timeout: float = 120,
7676
debug: bool = False,
@@ -98,7 +98,7 @@ def __init__(
9898
self.language = language
9999
os.makedirs(data_dir, exist_ok=True)
100100
self.data_dir = data_dir
101-
self.max_tokens = max_tokens if max_tokens is not None else 1280
101+
self.max_tokens = max_tokens
102102
self.num_workers = num_workers
103103
self.timeout = timeout
104104
self.debug = debug

eval/chat_benchmarks/CodeElo/eval_instruct.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def __init__(
4848
self,
4949
debug: bool = False,
5050
seed: List[int] = [0, 1234, 1234, 1234],
51-
max_tokens: Optional[int] = None,
51+
max_tokens: int = 32768,
5252
logger: Optional[logging.Logger] = None,
5353
system_instruction: Optional[str] = None,
5454
):
@@ -63,9 +63,7 @@ def __init__(
6363
"""
6464
super().__init__(logger=logger, system_instruction=system_instruction)
6565
self.debug = debug
66-
self.max_new_tokens = (
67-
max_tokens if max_tokens is not None else 32768
68-
) # set higher to avoid truncation for reasoning models
66+
self.max_new_tokens = max_tokens
6967
self.seed = seed
7068
self.n_repeat = 3
7169
self.filter_interaction_questions = True

eval/chat_benchmarks/CodeForces/eval_instruct.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def __init__(
4747
self,
4848
debug: bool = False,
4949
seed: List[int] = [0, 1234, 1234, 1234],
50-
max_tokens: Optional[int] = None,
50+
max_tokens: int = 32768,
5151
logger: Optional[logging.Logger] = None,
5252
system_instruction: Optional[str] = None,
5353
):
@@ -62,8 +62,7 @@ def __init__(
6262
"""
6363
super().__init__(logger=logger, system_instruction=system_instruction)
6464
self.debug = debug
65-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
66-
self.seed = seed
65+
self.max_new_tokens = max_tokens
6766
self.n_repeat = 3
6867
self.filter_interaction_questions = True
6968

eval/chat_benchmarks/CruxEval/eval_instruct.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ class CruxEvalBenchmark(BaseBenchmark):
132132
def __init__(
133133
self,
134134
data_dir: str = CruxEval_PATH,
135-
max_tokens: Optional[int] = 2048,
135+
max_tokens: int = 2048,
136136
num_workers: int = 32,
137137
timeout: float = 120,
138138
debug: bool = False,
@@ -155,7 +155,7 @@ def __init__(
155155
self.language = "python"
156156
os.makedirs(data_dir, exist_ok=True)
157157
self.data_dir = data_dir
158-
self.max_tokens = max_tokens if max_tokens is not None else 2048
158+
self.max_tokens = max_tokens
159159
self.num_workers = num_workers
160160
self.timeout = timeout
161161
self.debug = debug

eval/chat_benchmarks/GPQADiamond/eval_instruct.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def __init__(
3535
self,
3636
debug: bool = False,
3737
seed: List[int] = [0, 1234, 1234, 1234],
38-
max_tokens: Optional[int] = 32768,
38+
max_tokens: int = 32768,
3939
logger: Optional[logging.Logger] = None,
4040
system_instruction: Optional[str] = None,
4141
):
@@ -51,7 +51,7 @@ def __init__(
5151
self.dataset_name = "Idavidrein/gpqa"
5252
self.debug = debug
5353
self.seed = seed
54-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
54+
self.max_new_tokens = max_tokens
5555
self.n_repeat = 3
5656

5757
def generate_responses(self, model: LM) -> Dict[str, Any]:

eval/chat_benchmarks/HLE/eval_instruct.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def __init__(
6363
self,
6464
debug: bool = False,
6565
seed: List[int] = [0, 1234, 1234, 1234],
66-
max_tokens: Optional[int] = None,
66+
max_tokens: int = 32768,
6767
logger: Optional[logging.Logger] = None,
6868
system_instruction: Optional[str] = None,
6969
):
@@ -77,9 +77,7 @@ def __init__(
7777
"""
7878
super().__init__(logger=logger, system_instruction=system_instruction)
7979
self.debug = debug
80-
self.max_new_tokens = (
81-
max_tokens if max_tokens is not None else 32768
82-
) # set higher to avoid truncation for reasoning models
80+
self.max_new_tokens = max_tokens
8381
self.seed = seed
8482
self.n_repeat = 3
8583

0 commit comments

Comments (0)