Skip to content

Commit cc75611

Browse files
authored
fix a few benchmarks so that importing any of them works properly (#127)
* fix a few benchmarks so that importing any of them works properly
* ran black formatter
* ran black[colorama]==23.1.0 formatting
* hand-formatted last issue
* restored default max_tokens when None
1 parent 81180cb commit cc75611
1 parent 81180cb commit cc75611

File tree

77 files changed

+242
-258
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

77 files changed

+242
-258
lines changed

eval/chat_benchmarks/AIME24/eval_instruct.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,9 @@ def __init__(
4444
super().__init__(logger=logger, system_instruction=system_instruction)
4545
self.data_file = data_file
4646
self.debug = debug
47-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768 # set higher to avoid truncation for reasoning models
47+
self.max_new_tokens = (
48+
max_tokens if max_tokens is not None else 32768
49+
) # set higher to avoid truncation for reasoning models
4850
self.seed = seed
4951
self.n_repeat = 10
5052

eval/chat_benchmarks/AIME25/eval_instruct.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def __init__(
4343
super().__init__(logger=logger, system_instruction=system_instruction)
4444
self.data_file = data_file
4545
self.debug = debug
46-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
46+
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
4747
self.seed = seed
4848
self.n_repeat = 10
4949

eval/chat_benchmarks/AIW/eval_instruct.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def __init__(
4141
super().__init__(logger=logger, system_instruction=system_instruction)
4242
self.data_file = data_file
4343
self.debug = debug
44-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
44+
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
4545
self.seed = seed
4646
self.n_trials = n_trials
4747

eval/chat_benchmarks/AMC23/eval_instruct.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def __init__(
4747
self.data_file = data_file
4848
self.debug = debug
4949
self.seed = seed
50-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
50+
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
5151
self.n_repeat = 10
5252

5353
def generate_responses(self, model: LM) -> Dict[str, Any]:

eval/chat_benchmarks/BigCodeBench/execution.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,7 @@ def check_correctness(
6060
def unsafe_execute(tmp_dir):
6161
random_id = random.randint(1, 100000)
6262
if "python" in language_type.lower():
63-
6463
with create_tempdir():
65-
6664
# These system calls are needed when cleaning up tempdir.
6765
import os
6866
import shutil

eval/chat_benchmarks/CodeElo/codeelo_utils.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,17 @@
1818

1919
import scipy.stats as stats
2020

21+
2122
def rating_to_difficulty(rating):
2223
if rating < 1000:
23-
return 'Easy'
24+
return "Easy"
2425
if rating < 1300:
25-
return 'Medium'
26+
return "Medium"
2627
if rating <= 3500:
27-
return 'Hard'
28+
return "Hard"
29+
30+
return "Easy"
2831

29-
return 'Easy'
3032

3133
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
3234
"""
@@ -250,11 +252,13 @@ def codeelo_run(problem, completion, timeout, is_extracted):
250252
outs = tc[1]
251253
testtype = "stdin"
252254

253-
test_cases.append({
254-
"input": ins,
255-
"output": outs,
256-
"testtype": testtype,
257-
})
255+
test_cases.append(
256+
{
257+
"input": ins,
258+
"output": outs,
259+
"testtype": testtype,
260+
}
261+
)
258262

259263
manager = multiprocessing.Manager()
260264
result = manager.list()

eval/chat_benchmarks/CodeElo/eval_instruct.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def __init__(
4848
self,
4949
debug: bool = False,
5050
seed: List[int] = [0, 1234, 1234, 1234],
51-
max_tokens: Optional[int] = None,
51+
max_tokens: Optional[int] = None,
5252
logger: Optional[logging.Logger] = None,
5353
system_instruction: Optional[str] = None,
5454
):
@@ -63,7 +63,9 @@ def __init__(
6363
"""
6464
super().__init__(logger=logger, system_instruction=system_instruction)
6565
self.debug = debug
66-
self.max_new_tokens = max_tokens if max_tokens is not None else 32768 # set higher to avoid truncation for reasoning models
66+
self.max_new_tokens = (
67+
max_tokens if max_tokens is not None else 32768
68+
) # set higher to avoid truncation for reasoning models
6769
self.seed = seed
6870
self.n_repeat = 3
6971
self.filter_interaction_questions = True

eval/chat_benchmarks/CodeForces/codeforces_utils.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,19 @@
1818

1919
import scipy.stats as stats
2020

21+
2122
def rating_to_difficulty(rating):
2223
if not rating:
23-
return 'Easy'
24+
return "Easy"
2425
if rating < 1000:
25-
return 'Easy'
26+
return "Easy"
2627
if rating < 1300:
27-
return 'Medium'
28+
return "Medium"
2829
if rating <= 3500:
29-
return 'Hard'
30+
return "Hard"
31+
32+
return "Easy"
3033

31-
return 'Easy'
3234

3335
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
3436
"""
@@ -155,8 +157,8 @@ def run_test_std(completion, test_input, test_output):
155157
sys.stdin = io.StringIO(test_input)
156158
try:
157159
exec(f'__name__ = "__main__"\n{completion}' if '__name__ == "__main__"' in completion else completion, {})
158-
out = output.getvalue().strip().replace('\n',' ').replace('\r', '')
159-
expected = test_output.strip().replace('\n', ' ').replace('\r', '')
160+
out = output.getvalue().strip().replace("\n", " ").replace("\r", "")
161+
expected = test_output.strip().replace("\n", " ").replace("\r", "")
160162

161163
return out == expected, output.getvalue().strip()
162164
finally:
@@ -247,7 +249,6 @@ def run_tests_for_one_example(test_cases, completion, result_list, is_extracted)
247249
return
248250

249251

250-
251252
def codeforces_run(problem, completion, timeout, is_extracted):
252253
test_cases = problem["official_tests"]
253254
test_cases = [{**x, "testtype": "stdin"} for x in test_cases]

eval/chat_benchmarks/CodeForces/eval_instruct.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,7 @@ def __init__(
6262
"""
6363
super().__init__(logger=logger, system_instruction=system_instruction)
6464
self.debug = debug
65-
self.max_new_tokens = (
66-
max_tokens if max_tokens is not None else 32768
67-
)
65+
self.max_new_tokens = max_tokens if max_tokens is not None else 32768
6866
self.seed = seed
6967
self.n_repeat = 3
7068
self.filter_interaction_questions = True

eval/chat_benchmarks/CruxEval/evaluation.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@ def evaluate_generations(
7272
sample_jsonl = stream_jsonl_all(input_file)
7373

7474
with ThreadPoolExecutor(max_workers=n_workers) as executor:
75-
7675
futures = []
7776
completion_id = Counter()
7877
n_samples = 0

0 commit comments

Comments (0)