From b319a8df17bce5e9b18a9301ec07e7c7942c03b6 Mon Sep 17 00:00:00 2001
From: Daeseong Kim <16497919+dkimds@users.noreply.github.com>
Date: Tue, 1 Jul 2025 15:50:14 +0900
Subject: [PATCH] Add debug mode to 6 benchmarks

AIME24, AIME25, AIW, AMC23, HMMT, MATH500

When self.debug is set, load_questions() keeps only the first two
questions and logs that debug mode is enabled.
---
Review note: the HMMT hunk now inserts the debug block after `questions`
is built (the exported patch sliced `questions` before assigning it, which
raises UnboundLocalError in debug mode). The `index` line for that file
still carries the post-image blob id from the original export.

 eval/chat_benchmarks/AIME24/eval_instruct.py  | 5 +++++
 eval/chat_benchmarks/AIME25/eval_instruct.py  | 5 +++++
 eval/chat_benchmarks/AIW/eval_instruct.py     | 5 +++++
 eval/chat_benchmarks/AMC23/eval_instruct.py   | 5 +++++
 eval/chat_benchmarks/HMMT/eval_instruct.py    | 5 +++++
 eval/chat_benchmarks/MATH500/eval_instruct.py | 5 +++++
 6 files changed, 30 insertions(+)

diff --git a/eval/chat_benchmarks/AIME24/eval_instruct.py b/eval/chat_benchmarks/AIME24/eval_instruct.py
index 2a96b88c..47ca5309 100644
--- a/eval/chat_benchmarks/AIME24/eval_instruct.py
+++ b/eval/chat_benchmarks/AIME24/eval_instruct.py
@@ -161,6 +161,11 @@ def load_questions(self) -> List[Dict[str, str]]:
         """Load AIME24 questions from the data file."""
         with open(self.data_file, "r") as f:
             questions = [json.loads(x) for x in f]
+
+        if self.debug:
+            questions = questions[:2]
+            self.logger.info(f"Debug mode enabled. Using only {len(questions)} questions.")
+
         self.logger.info(f"Loaded {len(questions)} questions from {self.data_file}")
         return questions
 
diff --git a/eval/chat_benchmarks/AIME25/eval_instruct.py b/eval/chat_benchmarks/AIME25/eval_instruct.py
index 9bfeaf02..61cb9fad 100644
--- a/eval/chat_benchmarks/AIME25/eval_instruct.py
+++ b/eval/chat_benchmarks/AIME25/eval_instruct.py
@@ -158,6 +158,11 @@ def load_questions(self) -> List[Dict[str, str]]:
         """Load AIME25 questions from the data file."""
         with open(self.data_file, "r") as f:
             questions = [json.loads(x) for x in f]
+
+        if self.debug:
+            questions = questions[:2]
+            self.logger.info(f"Debug mode enabled. Using only {len(questions)} questions.")
+
         self.logger.info(f"Loaded {len(questions)} questions from {self.data_file}")
         return questions
 
diff --git a/eval/chat_benchmarks/AIW/eval_instruct.py b/eval/chat_benchmarks/AIW/eval_instruct.py
index c3b54c86..90f2fb16 100644
--- a/eval/chat_benchmarks/AIW/eval_instruct.py
+++ b/eval/chat_benchmarks/AIW/eval_instruct.py
@@ -112,6 +112,11 @@ def load_questions(self) -> List[Dict[str, str]]:
         """Load AIW questions from the data file."""
         with open(self.data_file, "r") as f:
             questions = json.load(f)
+
+        if self.debug:
+            questions = questions[:2]
+            self.logger.info(f"Debug mode enabled. Using only {len(questions)} questions.")
+
         self.logger.info(f"Loaded {len(questions)} questions from {self.data_file}")
         return questions
 
diff --git a/eval/chat_benchmarks/AMC23/eval_instruct.py b/eval/chat_benchmarks/AMC23/eval_instruct.py
index 094c77b7..a26fcdaf 100644
--- a/eval/chat_benchmarks/AMC23/eval_instruct.py
+++ b/eval/chat_benchmarks/AMC23/eval_instruct.py
@@ -168,6 +168,11 @@ def load_questions(self) -> List[Dict[str, str]]:
         """Load AMC23 questions from the data file."""
         with open(self.data_file, "r") as f:
             questions = [json.loads(x) for x in f]
+
+        if self.debug:
+            questions = questions[:2]
+            self.logger.info(f"Debug mode enabled. Using only {len(questions)} questions.")
+
         self.logger.info(f"Loaded {len(questions)} questions from {self.data_file}")
         return questions
 
diff --git a/eval/chat_benchmarks/HMMT/eval_instruct.py b/eval/chat_benchmarks/HMMT/eval_instruct.py
index 22f76801..99f10146 100644
--- a/eval/chat_benchmarks/HMMT/eval_instruct.py
+++ b/eval/chat_benchmarks/HMMT/eval_instruct.py
@@ -168,5 +168,10 @@ def evaluate_responses(self, results: Dict[str, Any]) -> Dict[str, float]:
     def load_questions(self) -> List[Dict[str, str]]:
         """Load HMMT questions from the data file."""
         dataset = load_dataset(self.dataset_name, split="train")
         questions = [dict(example) for example in dataset]
+
+        if self.debug:
+            questions = questions[:2]
+            self.logger.info(f"Debug mode enabled. Using only {len(questions)} questions.")
+
         return questions
diff --git a/eval/chat_benchmarks/MATH500/eval_instruct.py b/eval/chat_benchmarks/MATH500/eval_instruct.py
index eec964f0..2d4a0d39 100644
--- a/eval/chat_benchmarks/MATH500/eval_instruct.py
+++ b/eval/chat_benchmarks/MATH500/eval_instruct.py
@@ -131,6 +131,11 @@ def load_questions(self) -> List[Dict[str, str]]:
         """Load MATH500 questions from the data file."""
         with open(self.data_file, "r") as f:
             questions = [json.loads(x) for x in f]
+
+        if self.debug:
+            questions = questions[:2]
+            self.logger.info(f"Debug mode enabled. Using only {len(questions)} questions.")
+
         self.logger.info(f"Loaded {len(questions)} questions from {self.data_file}")
         return questions
 