mlfoundations · dkimds · Jul 1, 2025
diff --git a/eval/chat_benchmarks/AIME24/eval_instruct.py b/eval/chat_benchmarks/AIME24/eval_instruct.py
@@ -161,6 +161,11 @@ def load_questions(self) -> List[Dict[str, str]]:
         """Load AIME24 questions from the data file."""
         with open(self.data_file, "r") as f:
             questions = [json.loads(x) for x in f]
+
+        if self.debug:
+            questions = questions[:2]
+            self.logger.info(f"Debug mode enabled. Using only {len(questions)} questions.")
+
         self.logger.info(f"Loaded {len(questions)} questions from {self.data_file}")
         return questions
 

diff --git a/eval/chat_benchmarks/AIME25/eval_instruct.py b/eval/chat_benchmarks/AIME25/eval_instruct.py
@@ -158,6 +158,11 @@ def load_questions(self) -> List[Dict[str, str]]:
         """Load AIME25 questions from the data file."""
         with open(self.data_file, "r") as f:
             questions = [json.loads(x) for x in f]
+
+        if self.debug:
+            questions = questions[:2]
+            self.logger.info(f"Debug mode enabled. Using only {len(questions)} questions.")
+
         self.logger.info(f"Loaded {len(questions)} questions from {self.data_file}")
         return questions
 

diff --git a/eval/chat_benchmarks/AIW/eval_instruct.py b/eval/chat_benchmarks/AIW/eval_instruct.py
@@ -112,6 +112,11 @@ def load_questions(self) -> List[Dict[str, str]]:
         """Load AIW questions from the data file."""
         with open(self.data_file, "r") as f:
             questions = json.load(f)
+
+        if self.debug:
+            questions = questions[:2]
+            self.logger.info(f"Debug mode enabled. Using only {len(questions)} questions.")
+
         self.logger.info(f"Loaded {len(questions)} questions from {self.data_file}")
         return questions
 

diff --git a/eval/chat_benchmarks/AMC23/eval_instruct.py b/eval/chat_benchmarks/AMC23/eval_instruct.py
@@ -168,6 +168,11 @@ def load_questions(self) -> List[Dict[str, str]]:
         """Load AMC23 questions from the data file."""
         with open(self.data_file, "r") as f:
             questions = [json.loads(x) for x in f]
+
+        if self.debug:
+            questions = questions[:2]
+            self.logger.info(f"Debug mode enabled. Using only {len(questions)} questions.")
+
         self.logger.info(f"Loaded {len(questions)} questions from {self.data_file}")
         return questions
 

diff --git a/eval/chat_benchmarks/HMMT/eval_instruct.py b/eval/chat_benchmarks/HMMT/eval_instruct.py
@@ -168,5 +168,10 @@ def evaluate_responses(self, results: Dict[str, Any]) -> Dict[str, float]:
     def load_questions(self) -> List[Dict[str, str]]:
         """Load HMMT questions from the data file."""
         dataset = load_dataset(self.dataset_name, split="train")
         questions = [dict(example) for example in dataset]
+
+        # Truncate only after `questions` is built; slicing it earlier raises UnboundLocalError.
+        if self.debug:
+            questions = questions[:2]
+            self.logger.info(f"Debug mode enabled. Using only {len(questions)} questions.")
         return questions
diff --git a/eval/chat_benchmarks/MATH500/eval_instruct.py b/eval/chat_benchmarks/MATH500/eval_instruct.py
@@ -131,6 +131,11 @@ def load_questions(self) -> List[Dict[str, str]]:
         """Load MATH500 questions from the data file."""
         with open(self.data_file, "r") as f:
             questions = [json.loads(x) for x in f]
+
+        if self.debug:
+            questions = questions[:2]
+            self.logger.info(f"Debug mode enabled. Using only {len(questions)} questions.")
+
         self.logger.info(f"Loaded {len(questions)} questions from {self.data_file}")
         return questions