From caa45700f99ecac1b0acf737358caa41f2c7a3db Mon Sep 17 00:00:00 2001
From: Xavier Fernandes
Date: Mon, 14 Apr 2025 14:59:39 -0700
Subject: [PATCH 1/8] Add ARC-AGI-v1 dataset experiment pipeline

---
 .../arc_agi_templates/arc_agi_v1_basic.jinja |  3 +
 eureka_ml_insights/user_configs/__init__.py  |  3 +
 eureka_ml_insights/user_configs/arc_agi.py   | 94 +++++++++++++++++++
 3 files changed, 100 insertions(+)
 create mode 100644 eureka_ml_insights/prompt_templates/arc_agi_templates/arc_agi_v1_basic.jinja
 create mode 100644 eureka_ml_insights/user_configs/arc_agi.py

diff --git a/eureka_ml_insights/prompt_templates/arc_agi_templates/arc_agi_v1_basic.jinja b/eureka_ml_insights/prompt_templates/arc_agi_templates/arc_agi_v1_basic.jinja
new file mode 100644
index 00000000..e70db518
--- /dev/null
+++ b/eureka_ml_insights/prompt_templates/arc_agi_templates/arc_agi_v1_basic.jinja
@@ -0,0 +1,3 @@
+You are an intelligent assistant who is very good at answering test questions accurately.
+
+{{ prompt }}
\ No newline at end of file
diff --git a/eureka_ml_insights/user_configs/__init__.py b/eureka_ml_insights/user_configs/__init__.py
index ec419308..2224c461 100644
--- a/eureka_ml_insights/user_configs/__init__.py
+++ b/eureka_ml_insights/user_configs/__init__.py
@@ -5,6 +5,9 @@
     AIME_PIPELINE,
 )
 from .aime_seq import AIME_SEQ_PIPELINE
+from .arc_agi import (
+    ARC_AGI_v1_PIPELINE,
+)
 from .ba_calendar import (
     BA_Calendar_Parallel_PIPELINE,
     BA_Calendar_PIPELINE,
diff --git a/eureka_ml_insights/user_configs/arc_agi.py b/eureka_ml_insights/user_configs/arc_agi.py
new file mode 100644
index 00000000..77ec1a6d
--- /dev/null
+++ b/eureka_ml_insights/user_configs/arc_agi.py
@@ -0,0 +1,94 @@
+import os
+from typing import Any
+
+from eureka_ml_insights.core import Inference, PromptProcessing
+from eureka_ml_insights.core.data_processing import DataProcessing
+from eureka_ml_insights.core.eval_reporting import EvalReporting
+from eureka_ml_insights.data_utils.ba_calendar_utils import (
+    BA_Calendar_ExtractAnswer,
+)
+from eureka_ml_insights.data_utils.data import (
+    DataLoader,
+    DataReader,
+    HFDataReader,
+)
+from eureka_ml_insights.metrics.ba_calendar_metrics import BACalendarMetric
+from eureka_ml_insights.metrics.reports import (
+    AverageAggregator,
+    BiLevelCountAggregator,
+    BiLevelAggregator,
+    CountAggregator
+)
+
+from eureka_ml_insights.data_utils.transform import (
+    AddColumn,
+    AddColumnAndData,
+    ColumnRename,
+    CopyColumn,
+    ExtractUsageTransform,
+    MajorityVoteTransform,
+    MultiplyTransform,
+    RunPythonTransform,
+    SamplerTransform,
+    SequenceTransform,
+)
+from eureka_ml_insights.metrics.ba_calendar_metrics import BACalendarMetric
+
+from ..configs.config import (
+    AggregatorConfig,
+    DataProcessingConfig,
+    DataSetConfig,
+    EvalReportingConfig,
+    InferenceConfig,
+    MetricConfig,
+    ModelConfig,
+    PipelineConfig,
+    PromptProcessingConfig,
+)
+from ..configs.experiment_config import ExperimentConfig
+
+
+class ARC_AGI_v1_PIPELINE(ExperimentConfig):
+    """This class specifies the config for running the ARC-AGI-v1 benchmark on any model"""
+
+    def configure_pipeline(self, model_config=None, resume_from=None, resume_logdir=None, **kwargs) -> PipelineConfig:
+        # data preprocessing
+        self.data_processing_comp = PromptProcessingConfig(
+            component_type=PromptProcessing,
+            prompt_template_path=os.path.join(
+                os.path.dirname(__file__), "../prompt_templates/arc_agi_templates/arc_agi_v1_basic.jinja"
+            ),
+            data_reader_config=DataSetConfig(
+                HFDataReader,
+                {
+                    "path": "pxferna/ARC-AGI-v1",
+                    "split": "test",
+                }
+            ),
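+            # The reader above pulls the ARC-AGI-v1 test split from the Hugging
+            # Face hub; each row is expected to provide the "prompt" field that
+            # the jinja template renders.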
+            output_dir=os.path.join(self.log_dir, "data_processing_output"),
+        )
+
+        # inference component
+        self.inference_comp = InferenceConfig(
+            component_type=Inference,
+            model_config=model_config,
+            data_loader_config=DataSetConfig(
+                DataLoader,
+                {"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl")},
+            ),
+            output_dir=os.path.join(self.log_dir, "inference_result"),
+            resume_from=resume_from,
+            max_concurrent=1,
+        )
+
+        if resume_logdir:
+            self.log_dir = "/".join(resume_from.split("/")[:-1])
+
+        # Configure the pipeline
+        return PipelineConfig(
+            [
+                self.data_processing_comp,
+                self.inference_comp,
+            ],
+            self.log_dir,
+        )

From c6e6efda24c8bf197e6a7cfb5b67b33daba07410 Mon Sep 17 00:00:00 2001
From: Xavier Fernandes
Date: Tue, 15 Apr 2025 12:02:47 -0700
Subject: [PATCH 2/8] Add metrics and aggregators for ARC AGI pipeline

---
 .../data_utils/arc_agi_utils.py            | 39 +++++++++++++
 eureka_ml_insights/user_configs/arc_agi.py | 58 ++++++++++++++++++-
 2 files changed, 94 insertions(+), 3 deletions(-)
 create mode 100644 eureka_ml_insights/data_utils/arc_agi_utils.py

diff --git a/eureka_ml_insights/data_utils/arc_agi_utils.py b/eureka_ml_insights/data_utils/arc_agi_utils.py
new file mode 100644
index 00000000..cd0392d3
--- /dev/null
+++ b/eureka_ml_insights/data_utils/arc_agi_utils.py
@@ -0,0 +1,39 @@
+import re
+from dataclasses import dataclass
+
+import pandas as pd
+
+from .transform import DFTransformBase
+
+
+@dataclass
+class ARCAGI_ExtractAnswer(DFTransformBase):
+    model_output_column: str
+    model_answer_column: str
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        df[self.model_answer_column] = df[self.model_output_column].apply(self.parse_output_answer)
+        return df
+
+    @staticmethod
+    def parse_output_answer(response):
+        """
+        Parse the input string to extract the answer to a given ARC-AGI question.
+        Parameters:
+            response (str): Input string containing answer X in the form "<final_answer>X</final_answer>".
+        Returns:
+            answer (str): The final answer string with leading and trailing spaces stripped.
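+        Example (illustrative; assumes the model wraps its answer in <final_answer> tags):
+            parse_output_answer("reasoning... <final_answer>0 7 7</final_answer>") -> "0 7 7"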
+ """ + answer = "" + + if response is None: + return "" + elif response.find("") == -1 or response.find("") == -1: + return "" + + start_index = response.find("") + len("") + end_index = response.find("") + + answer = response[start_index:end_index].strip() + + return answer diff --git a/eureka_ml_insights/user_configs/arc_agi.py b/eureka_ml_insights/user_configs/arc_agi.py index 77ec1a6d..9abb980f 100644 --- a/eureka_ml_insights/user_configs/arc_agi.py +++ b/eureka_ml_insights/user_configs/arc_agi.py @@ -4,16 +4,17 @@ from eureka_ml_insights.core import Inference, PromptProcessing from eureka_ml_insights.core.data_processing import DataProcessing from eureka_ml_insights.core.eval_reporting import EvalReporting -from eureka_ml_insights.data_utils.ba_calendar_utils import ( - BA_Calendar_ExtractAnswer, +from eureka_ml_insights.data_utils.arc_agi_utils import ( + ARCAGI_ExtractAnswer, ) from eureka_ml_insights.data_utils.data import ( DataLoader, DataReader, HFDataReader, ) -from eureka_ml_insights.metrics.ba_calendar_metrics import BACalendarMetric +from eureka_ml_insights.metrics.metrics_base import ExactMatch from eureka_ml_insights.metrics.reports import ( + CountAggregator, AverageAggregator, BiLevelCountAggregator, BiLevelAggregator, @@ -84,11 +85,62 @@ def configure_pipeline(self, model_config=None, resume_from=None, resume_logdir= if resume_logdir: self.log_dir = resume_from.split("/")[0:len(resume_from.split("/")) - 1] + # Configure the evaluation and reporting component for evaluation and dataset level aggregation + self.evalreporting_comp = EvalReportingConfig( + component_type=EvalReporting, + data_reader_config=DataSetConfig( + DataReader, + { + "path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl"), + "format": ".jsonl", + "transform": SequenceTransform( + [ + ExtractUsageTransform(model_config), + ColumnRename( + name_mapping={ + "model_output": "raw_output", + } + ), + AddColumn("model_output"), + ARCAGI_ExtractAnswer("raw_output", "model_output"), + ] + ), + }, + ), + metric_config=MetricConfig(ExactMatch), + aggregator_configs=[ + AggregatorConfig( + CountAggregator, + { + "column_names": [ + "ExactMatch_result", + ], + "filename_base": "OverallMetrics_Separate_Runs_Grouped", + "normalize": True, + "group_by": "split", + }, + ), + # the next three reports take the average and std for all repeats + # the resulting numbers are the average and std of N pass@1 scores, where N is number of repeats + AggregatorConfig( + CountAggregator, + { + "column_names": [ + "ExactMatch_result", + ], + "normalize": True, + "filename_base": "OverallMetrics_Separate_Runs_Total", + }), + ], + output_dir=os.path.join(self.log_dir, "eval_report"), + ) + # Configure the pipeline return PipelineConfig( [ self.data_processing_comp, self.inference_comp, + self.evalreporting_comp, ], self.log_dir, ) From a8f0e4bec3f234eb4b85377391d8c9268ef71cbc Mon Sep 17 00:00:00 2001 From: Xavier Fernandes Date: Thu, 17 Apr 2025 08:23:07 -0700 Subject: [PATCH 3/8] Add ARC AGI 5run pipeline --- eureka_ml_insights/user_configs/__init__.py | 1 + eureka_ml_insights/user_configs/arc_agi.py | 82 +++++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/eureka_ml_insights/user_configs/__init__.py b/eureka_ml_insights/user_configs/__init__.py index 2224c461..c2aee9f8 100644 --- a/eureka_ml_insights/user_configs/__init__.py +++ b/eureka_ml_insights/user_configs/__init__.py @@ -7,6 +7,7 @@ from .aime_seq import AIME_SEQ_PIPELINE from .arc_agi import ( ARC_AGI_v1_PIPELINE, + 
+    ARC_AGI_v1_PIPELINE_5Run,
 )
 from .ba_calendar import (
     BA_Calendar_Parallel_PIPELINE,
diff --git a/eureka_ml_insights/user_configs/arc_agi.py b/eureka_ml_insights/user_configs/arc_agi.py
index 9abb980f..a723ff1a 100644
--- a/eureka_ml_insights/user_configs/arc_agi.py
+++ b/eureka_ml_insights/user_configs/arc_agi.py
@@ -29,6 +29,7 @@
     ExtractUsageTransform,
     MajorityVoteTransform,
     MultiplyTransform,
+    ReplaceStringsTransform,
     RunPythonTransform,
     SamplerTransform,
     SequenceTransform,
@@ -64,6 +65,11 @@ def configure_pipeline(self, model_config=None, resume_from=None, resume_logdir=
                 {
                     "path": "pxferna/ARC-AGI-v1",
                     "split": "test",
+                    "transform": SequenceTransform(
+                        [
+                            MultiplyTransform(n_repeats=1),
+                        ]
+                    ),
                 }
             ),
             output_dir=os.path.join(self.log_dir, "data_processing_output"),
@@ -135,12 +141,88 @@ def configure_pipeline(self, model_config=None, resume_from=None, resume_logdir=
             output_dir=os.path.join(self.log_dir, "eval_report"),
         )
 
+        self.posteval_data_post_processing_comp = DataProcessingConfig(
+            component_type=DataProcessing,
+            data_reader_config=DataSetConfig(
+                DataReader,
+                {
+                    "path": os.path.join(self.evalreporting_comp.output_dir, "metric_results.jsonl"),
+                    "format": ".jsonl",
+                    "transform": SequenceTransform(
+                        [
+                            CopyColumn(
+                                column_name_src="ExactMatch_result",
+                                column_name_dst="ExactMatch_result_numeric",
+                            ),
+                            ReplaceStringsTransform(
+                                columns=["ExactMatch_result_numeric"],
+                                mapping={'incorrect': '0', 'correct': '1', 'none': 'NaN'},
+                                case=False)
+                        ]
+                    ),
+                },
+            ),
+            output_dir=os.path.join(self.log_dir, "posteval_data_post_processing_output"),
+        )
+
+        self.best_of_n_evalreporting_comp = EvalReportingConfig(
+            component_type=EvalReporting,
+            data_reader_config=DataSetConfig(
+                DataReader,
+                {
+                    "path": os.path.join(self.posteval_data_post_processing_comp.output_dir, "transformed_data.jsonl"),
+                    "format": ".jsonl"
+                },
+            ),
+            aggregator_configs=[
+                AggregatorConfig(
+                    BiLevelAggregator,
+                    {
+                        "column_names": [
+                            "ExactMatch_result_numeric",
+                        ],
+                        "first_groupby": "uid",
+                        "filename_base": "ExactMatch_Total_BestOfN",
+                    }),
+                # the next report aggregates results by uid and takes the best out of N per split
+                AggregatorConfig(
+                    BiLevelAggregator,
+                    {
+                        "column_names": [
+                            "ExactMatch_result_numeric"
+                        ],
+                        "first_groupby": "uid",
+                        "second_groupby": "split",
+                        "filename_base": "ExactMatch_Grouped_BestOfN",
+                        "agg_fn": "max"
+                    },
+                ),
+            ],
+            output_dir=os.path.join(self.log_dir, "bestofn_eval_report"),
+        )
+
         # Configure the pipeline
         return PipelineConfig(
             [
                 self.data_processing_comp,
                 self.inference_comp,
                 self.evalreporting_comp,
+                self.posteval_data_post_processing_comp,
+                self.best_of_n_evalreporting_comp,
             ],
             self.log_dir,
         )
+
+
+class ARC_AGI_v1_PIPELINE_5Run(ARC_AGI_v1_PIPELINE):
+    """This class specifies the config for running the ARC-AGI-v1 benchmark 5 times"""
+
+    def configure_pipeline(
+        self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
+    ) -> PipelineConfig:
+        pipeline = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
+        # data preprocessing
+        self.data_processing_comp.data_reader_config.init_args["transform"].transforms.append(
+            MultiplyTransform(n_repeats=5)
+        )
+        return pipeline

From 4c7aa342e141dc06cc38f228811aadac05ac9725 Mon Sep 17 00:00:00 2001
From: Xavier Fernandes
Date: Fri, 18 Apr 2025 22:36:10 -0700
Subject: [PATCH 4/8] Add Phi-specific pipelines that filter out COT

---
 .../data_utils/arc_agi_utils.py             | 30 ++++++++++
 eureka_ml_insights/user_configs/__init__.py |  2 ++
 eureka_ml_insights/user_configs/arc_agi.py  | 55 ++++++++++++++++++---
 3 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/eureka_ml_insights/data_utils/arc_agi_utils.py b/eureka_ml_insights/data_utils/arc_agi_utils.py
index cd0392d3..29e5958f 100644
--- a/eureka_ml_insights/data_utils/arc_agi_utils.py
+++ b/eureka_ml_insights/data_utils/arc_agi_utils.py
@@ -37,3 +37,33 @@ def parse_output_answer(response):
         answer = response[start_index:end_index].strip()
 
         return answer
+
+
+@dataclass
+class ARCAGI_CleanCOTAnswer(DFTransformBase):
+    model_output_column: str
+    model_answer_column: str
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        df[self.model_answer_column] = df[self.model_output_column].apply(self.parse_output_answer)
+        return df
+
+    @staticmethod
+    def parse_output_answer(response):
+        """
+        Strip the chain-of-thought section from a response, keeping only the text
+        after the closing </think> tag, and replace None responses with an empty string.
+        Parameters:
+            response (str): Possibly None response string.
+        Returns:
+            answer (str): Response string without the chain-of-thought prefix; "" if the response was None.
+        """
+        if response is None:
+            return ""
+
+        start_index = response.find("</think>")
+        if start_index == -1:
+            return response
+
+        response = response[start_index + len("</think>"):]
+
+        return response
diff --git a/eureka_ml_insights/user_configs/__init__.py b/eureka_ml_insights/user_configs/__init__.py
index c2aee9f8..7a671786 100644
--- a/eureka_ml_insights/user_configs/__init__.py
+++ b/eureka_ml_insights/user_configs/__init__.py
@@ -8,6 +8,8 @@
 from .arc_agi import (
     ARC_AGI_v1_PIPELINE,
     ARC_AGI_v1_PIPELINE_5Run,
+    Phi_ARC_AGI_v1_PIPELINE,
+    Phi_ARC_AGI_v1_PIPELINE_5Run,
 )
 from .ba_calendar import (
     BA_Calendar_Parallel_PIPELINE,
diff --git a/eureka_ml_insights/user_configs/arc_agi.py b/eureka_ml_insights/user_configs/arc_agi.py
index a723ff1a..c9d40a6e 100644
--- a/eureka_ml_insights/user_configs/arc_agi.py
+++ b/eureka_ml_insights/user_configs/arc_agi.py
@@ -6,6 +6,7 @@
 from eureka_ml_insights.core.eval_reporting import EvalReporting
 from eureka_ml_insights.data_utils.arc_agi_utils import (
     ARCAGI_ExtractAnswer,
+    ARCAGI_CleanCOTAnswer,
 )
 from eureka_ml_insights.data_utils.data import (
     DataLoader,
@@ -91,13 +92,29 @@ def configure_pipeline(self, model_config=None, resume_from=None, resume_logdir=
         if resume_logdir:
             self.log_dir = "/".join(resume_from.split("/")[:-1])
 
+        # Configure the data post processing component.
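+        # The base pipeline leaves this transform empty; the Phi (later COT)
+        # subclasses below override it to strip chain-of-thought text before scoring.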
+        self.data_post_processing = DataProcessingConfig(
+            component_type=DataProcessing,
+            data_reader_config=DataSetConfig(
+                DataReader,
+                {
+                    "path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl"),
+                    "format": ".jsonl",
+                    "transform": SequenceTransform(
+                        []
+                    ),
+                },
+            ),
+            output_dir=os.path.join(self.log_dir, "data_post_processing_output"),
+        )
+
         # Configure the evaluation and reporting component for evaluation and dataset level aggregation
         self.evalreporting_comp = EvalReportingConfig(
             component_type=EvalReporting,
             data_reader_config=DataSetConfig(
                 DataReader,
                 {
-                    "path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl"),
+                    "path": os.path.join(self.data_post_processing.output_dir, "transformed_data.jsonl"),
                     "format": ".jsonl",
                     "transform": SequenceTransform(
                         [
@@ -126,8 +143,6 @@ def configure_pipeline(self, model_config=None, resume_from=None, resume_logdir=
                         "group_by": "split",
                     },
                 ),
-                # the next three reports take the average and std for all repeats
-                # the resulting numbers are the average and std of N pass@1 scores, where N is number of repeats
                 AggregatorConfig(
                     CountAggregator,
                     {
@@ -206,6 +221,7 @@ def configure_pipeline(self, model_config=None, resume_from=None, resume_logdir=
             [
                 self.data_processing_comp,
                 self.inference_comp,
+                self.data_post_processing,
                 self.evalreporting_comp,
                 self.posteval_data_post_processing_comp,
                 self.best_of_n_evalreporting_comp,
@@ -214,6 +230,25 @@ def configure_pipeline(self, model_config=None, resume_from=None, resume_logdir=
         )
 
 
+class Phi_ARC_AGI_v1_PIPELINE(ARC_AGI_v1_PIPELINE):
+    def configure_pipeline(self, model_config=None, resume_from=None, **kwargs):
+        config = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
+        self.data_post_processing.data_reader_config.init_args["transform"] = SequenceTransform(
+            [
+                ColumnRename(
+                    name_mapping={
+                        "model_output": "cot_model_output",
+                    }
+                ),
+                AddColumn("post_cot_model_output"),
+                # RunPythonTransform("df['post_cot_model_output'] = df['post_cot_model_output'].apply(lambda x: x.split('</think>')[-1] if '</think>' in x else x)"),
+                ARCAGI_CleanCOTAnswer("cot_model_output", "post_cot_model_output"),
+                CopyColumn("post_cot_model_output", "model_output"),
+            ]
+        )
+        return config
+
+
 class ARC_AGI_v1_PIPELINE_5Run(ARC_AGI_v1_PIPELINE):
     """This class specifies the config for running the ARC-AGI-v1 benchmark 5 times"""
 
     def configure_pipeline(
@@ -226,3 +261,17 @@ def configure_pipeline(
         self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
     ) -> PipelineConfig:
         pipeline = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
         # data preprocessing
         self.data_processing_comp.data_reader_config.init_args["transform"].transforms.append(
             MultiplyTransform(n_repeats=5)
         )
         return pipeline
+
+
+class Phi_ARC_AGI_v1_PIPELINE_5Run(Phi_ARC_AGI_v1_PIPELINE):
+    """This class specifies the config for running the ARC-AGI-v1 COT pipeline 5 times"""
+
+    def configure_pipeline(
+        self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
+    ) -> PipelineConfig:
+        pipeline = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
+        # data preprocessing
+        self.data_processing_comp.data_reader_config.init_args["transform"].transforms.append(
+            MultiplyTransform(n_repeats=5)
+        )
+        return pipeline

From 6e49d404660eaa895f97e66986ab49fe327523fb Mon Sep 17 00:00:00 2001
From: Xavier Fernandes
Date: Tue, 22 Apr 2025 10:04:38 -0700
Subject: [PATCH 5/8] Add DeepSeek R1 VLLM model config

---
 eureka_ml_insights/configs/model_configs.py | 19 +++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/eureka_ml_insights/configs/model_configs.py b/eureka_ml_insights/configs/model_configs.py
index 0f1cc3ce..8fbd0b5f 100644
--- a/eureka_ml_insights/configs/model_configs.py
+++ b/eureka_ml_insights/configs/model_configs.py
@@ -172,6 +172,15 @@
     },
 )
 
+OAI_GPT4O_2024_11_20_AZURE_CONFIG = ModelConfig(
+    AzureOpenAIModel,
+    {
+        "model_name": "gpt-4o",
+        "url": "https://eurekaevals.openai.azure.com/",
+        "api_version": "2025-01-01-preview",
+    },
+)
+
 # Gemini models
 GEMINI_SECRET_KEY_PARAMS = {
     "key_name": "your_gemini_secret_key_name",
@@ -343,6 +352,16 @@
     },
 )
 
+DEEPSEEK_R1_LOCAL_CONFIG = ModelConfig(
+    LocalVLLMModel,
+    {
+        # this name must match the vllm deployment name/path
+        "model_name": "Deepseek-R1",
+        # specify ports in case the model is already deployed
+        "ports": ["5001"],
+    },
+)
+
 # DeepSeek R1 Endpoints on Azure
 DEEPSEEK_R1_CONFIG = ModelConfig(
     DeepseekR1ServerlessAzureRestEndpointModel,

From c347ff31f8f6af227358e4b87341235fc6ae243b Mon Sep 17 00:00:00 2001
From: Xavier Fernandes
Date: Wed, 23 Apr 2025 14:03:31 -0700
Subject: [PATCH 6/8] Add prompt template with grid explanation

---
 .../arc_agi_v1_grid_explanation.jinja      | 21 +++++++++++++++
 eureka_ml_insights/user_configs/arc_agi.py |  2 +-
 2 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 eureka_ml_insights/prompt_templates/arc_agi_templates/arc_agi_v1_grid_explanation.jinja

diff --git a/eureka_ml_insights/prompt_templates/arc_agi_templates/arc_agi_v1_grid_explanation.jinja b/eureka_ml_insights/prompt_templates/arc_agi_templates/arc_agi_v1_grid_explanation.jinja
new file mode 100644
index 00000000..d87e3fec
--- /dev/null
+++ b/eureka_ml_insights/prompt_templates/arc_agi_templates/arc_agi_v1_grid_explanation.jinja
@@ -0,0 +1,21 @@
+You are an intelligent assistant who is very good at answering test questions accurately.
+In the examples that follow you will be shown grids of numbers.
+The numbers in the grids range from 0 through 9.
+Each grid can be rendered as a grid of squares.
+Each square in the grid is rendered as a colored square where the color of the square is derived from the number.
+The colors are decided as follows:
+
+0 - black
+1 - blue
+2 - red
+3 - green
+4 - yellow
+5 - grey
+6 - magenta
+7 - brown
+8 - cyan
+9 - maroon
+
+With that in mind, do your best to solve the question below.
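+{# The prompt below comes from the dataset's prompt column, which is expected
+   to contain the example grids and the test grid for a single ARC task. #}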
+
+{{ prompt }}
\ No newline at end of file
diff --git a/eureka_ml_insights/user_configs/arc_agi.py b/eureka_ml_insights/user_configs/arc_agi.py
index c9d40a6e..85221cf7 100644
--- a/eureka_ml_insights/user_configs/arc_agi.py
+++ b/eureka_ml_insights/user_configs/arc_agi.py
@@ -59,7 +59,7 @@ def configure_pipeline(self, model_config=None, resume_from=None, resume_logdir=
         self.data_processing_comp = PromptProcessingConfig(
             component_type=PromptProcessing,
             prompt_template_path=os.path.join(
-                os.path.dirname(__file__), "../prompt_templates/arc_agi_templates/arc_agi_v1_basic.jinja"
+                os.path.dirname(__file__), "../prompt_templates/arc_agi_templates/arc_agi_v1_grid_explanation.jinja"
             ),
             data_reader_config=DataSetConfig(
                 HFDataReader,

From 97301d321774e32eb9f7844e55c2325129148847 Mon Sep 17 00:00:00 2001
From: Xavier Fernandes
Date: Wed, 23 Apr 2025 14:06:22 -0700
Subject: [PATCH 7/8] Rename Phi experiment pipelines to be COT pipelines

---
 eureka_ml_insights/user_configs/arc_agi.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/eureka_ml_insights/user_configs/arc_agi.py b/eureka_ml_insights/user_configs/arc_agi.py
index 85221cf7..cb07b6c8 100644
--- a/eureka_ml_insights/user_configs/arc_agi.py
+++ b/eureka_ml_insights/user_configs/arc_agi.py
@@ -230,7 +230,7 @@ def configure_pipeline(self, model_config=None, resume_from=None, resume_logdir=
         )
 
 
-class Phi_ARC_AGI_v1_PIPELINE(ARC_AGI_v1_PIPELINE):
+class COT_ARC_AGI_v1_PIPELINE(ARC_AGI_v1_PIPELINE):
     def configure_pipeline(self, model_config=None, resume_from=None, **kwargs):
         config = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
         self.data_post_processing.data_reader_config.init_args["transform"] = SequenceTransform(
@@ -263,7 +263,7 @@ def configure_pipeline(
         return pipeline
 
 
-class Phi_ARC_AGI_v1_PIPELINE_5Run(Phi_ARC_AGI_v1_PIPELINE):
+class COT_ARC_AGI_v1_PIPELINE_5Run(COT_ARC_AGI_v1_PIPELINE):
     """This class specifies the config for running the ARC-AGI-v1 COT pipeline 5 times"""
 
     def configure_pipeline(

From 4198c42fbabbf96a52830c821477f962c3d36cec Mon Sep 17 00:00:00 2001
From: Xavier Fernandes
Date: Tue, 12 Aug 2025 11:31:08 -0700
Subject: [PATCH 8/8] Add unit tests

---
 eureka_ml_insights/user_configs/__init__.py |   4 +-
 tests/pipeline_tests.py                     | 140 ++++++++++++++++++++
 2 files changed, 142 insertions(+), 2 deletions(-)

diff --git a/eureka_ml_insights/user_configs/__init__.py b/eureka_ml_insights/user_configs/__init__.py
index 7a671786..dc20a49f 100644
--- a/eureka_ml_insights/user_configs/__init__.py
+++ b/eureka_ml_insights/user_configs/__init__.py
@@ -8,8 +8,8 @@
 from .arc_agi import (
     ARC_AGI_v1_PIPELINE,
     ARC_AGI_v1_PIPELINE_5Run,
-    Phi_ARC_AGI_v1_PIPELINE,
-    Phi_ARC_AGI_v1_PIPELINE_5Run,
+    COT_ARC_AGI_v1_PIPELINE,
+    COT_ARC_AGI_v1_PIPELINE_5Run,
 )
 from .ba_calendar import (
     BA_Calendar_Parallel_PIPELINE,
diff --git a/tests/pipeline_tests.py b/tests/pipeline_tests.py
index 2b1ca345..8229351a 100644
--- a/tests/pipeline_tests.py
+++ b/tests/pipeline_tests.py
@@ -24,6 +24,10 @@
 from eureka_ml_insights.user_configs import (
     AIME_PIPELINE,
     AIME_SEQ_PIPELINE,
+    ARC_AGI_v1_PIPELINE,
+    ARC_AGI_v1_PIPELINE_5Run,
+    COT_ARC_AGI_v1_PIPELINE,
+    COT_ARC_AGI_v1_PIPELINE_5Run,
     DNA_PIPELINE,
     GEOMETER_PIPELINE,
     GSM8K_PIPELINE,
@@ -400,6 +404,38 @@ def configure_pipeline(self):
         return config
 
 
+class TEST_ARC_AGI_v1_PIPELINE(ARC_AGI_v1_PIPELINE):
+    # Test config for the ARC-AGI-v1 benchmark with GenericTestModel and TestHFDataReader
+    def configure_pipeline(self):
+        config = super().configure_pipeline(model_config=ModelConfig(GenericTestModel, {}))
+        self.data_processing_comp.data_reader_config.class_name = TestHFDataReader
+        return config
+
+
+class TEST_ARC_AGI_v1_PIPELINE_5Run(ARC_AGI_v1_PIPELINE_5Run):
+    # Test config for the ARC-AGI-v1 5-run benchmark with GenericTestModel and TestHFDataReader
+    def configure_pipeline(self):
+        config = super().configure_pipeline(model_config=ModelConfig(GenericTestModel, {}))
+        self.data_processing_comp.data_reader_config.class_name = TestHFDataReader
+        return config
+
+
+class TEST_COT_ARC_AGI_v1_PIPELINE(COT_ARC_AGI_v1_PIPELINE):
+    # Test config for the ARC-AGI-v1 COT benchmark with GenericTestModel and TestHFDataReader
+    def configure_pipeline(self):
+        config = super().configure_pipeline(model_config=ModelConfig(GenericTestModel, {}))
+        self.data_processing_comp.data_reader_config.class_name = TestHFDataReader
+        return config
+
+
+class TEST_COT_ARC_AGI_v1_PIPELINE_5Run(COT_ARC_AGI_v1_PIPELINE_5Run):
+    # Test config for the ARC-AGI-v1 COT 5-run benchmark with GenericTestModel and TestHFDataReader
+    def configure_pipeline(self):
+        config = super().configure_pipeline(model_config=ModelConfig(GenericTestModel, {}))
+        self.data_processing_comp.data_reader_config.class_name = TestHFDataReader
+        return config
+
+
 class PipelineTest:
     def setUp(self) -> None:
         self.conf = self.get_config()
@@ -661,5 +697,109 @@ def get_config(self):
         return TEST_GSMSYMBOLIC_PIPELINE().pipeline_config
 
 
+class ARC_AGI_v1_PipelineTest(PipelineTest, unittest.TestCase):
+    def get_config(self):
+        self.test_pipeline = TEST_ARC_AGI_v1_PIPELINE()
+        self.config = self.test_pipeline.pipeline_config
+        return self.config
+
+    def setUp(self) -> None:
+        super().setUp()
+        self.eval_configs = [
+            self.test_pipeline.evalreporting_comp,
+            self.test_pipeline.best_of_n_evalreporting_comp
+        ]
+
+    def test_outputs_exist(self) -> None:
+        logging.info("Running test_outputs_exist test in PipelineTest")
+        self.assertTrue(any("transformed_data.jsonl" in str(file) for file in self.files))
+        if self.data_reader_config.prompt_template_path:
+            self.assertTrue(any("processed_prompts.jsonl" in str(file) for file in self.files))
+        self.assertTrue(any("inference_result.jsonl" in str(file) for file in self.files))
+        if self.eval_config.metric_config is not None:
+            self.assertTrue(any("metric_results.jsonl" in str(file) for file in self.files))
+        n_aggregators = len([config for eval_config in self.eval_configs for config in eval_config.aggregator_configs])
+        n_aggregator_files = len([file for file in self.files if "aggregator" in str(file)])
+        self.assertEqual(n_aggregators, n_aggregator_files)
+
+
+class ARC_AGI_v1_Pipeline_5RunTest(PipelineTest, unittest.TestCase):
+    def get_config(self):
+        self.test_pipeline = TEST_ARC_AGI_v1_PIPELINE_5Run()
+        self.config = self.test_pipeline.pipeline_config
+        return self.config
+
+    def setUp(self) -> None:
+        super().setUp()
+        self.eval_configs = [
+            self.test_pipeline.evalreporting_comp,
+            self.test_pipeline.best_of_n_evalreporting_comp
+        ]
+
+    def test_outputs_exist(self) -> None:
+        logging.info("Running test_outputs_exist test in PipelineTest")
+        self.assertTrue(any("transformed_data.jsonl" in str(file) for file in self.files))
+        if self.data_reader_config.prompt_template_path:
+            self.assertTrue(any("processed_prompts.jsonl" in str(file) for file in self.files))
+        self.assertTrue(any("inference_result.jsonl" in str(file) for file in self.files))
+        if self.eval_config.metric_config is not None:
+            self.assertTrue(any("metric_results.jsonl" in str(file) for file in self.files))
+        n_aggregators = len([config for eval_config in self.eval_configs for config in eval_config.aggregator_configs])
+        n_aggregator_files = len([file for file in self.files if "aggregator" in str(file)])
+        self.assertEqual(n_aggregators, n_aggregator_files)
+
+
+class COT_ARC_AGI_v1_PIPELINETest(PipelineTest, unittest.TestCase):
+    def get_config(self):
+        self.test_pipeline = TEST_COT_ARC_AGI_v1_PIPELINE()
+        self.config = self.test_pipeline.pipeline_config
+        return self.config
+
+    def setUp(self) -> None:
+        super().setUp()
+        self.eval_configs = [
+            self.test_pipeline.evalreporting_comp,
+            self.test_pipeline.best_of_n_evalreporting_comp
+        ]
+
+    def test_outputs_exist(self) -> None:
+        logging.info("Running test_outputs_exist test in PipelineTest")
+        self.assertTrue(any("transformed_data.jsonl" in str(file) for file in self.files))
+        if self.data_reader_config.prompt_template_path:
+            self.assertTrue(any("processed_prompts.jsonl" in str(file) for file in self.files))
+        self.assertTrue(any("inference_result.jsonl" in str(file) for file in self.files))
+        if self.eval_config.metric_config is not None:
+            self.assertTrue(any("metric_results.jsonl" in str(file) for file in self.files))
+        n_aggregators = len([config for eval_config in self.eval_configs for config in eval_config.aggregator_configs])
+        n_aggregator_files = len([file for file in self.files if "aggregator" in str(file)])
+        self.assertEqual(n_aggregators, n_aggregator_files)
+
+
+class COT_ARC_AGI_v1_PIPELINE_5RunTest(PipelineTest, unittest.TestCase):
+    def get_config(self):
+        self.test_pipeline = TEST_COT_ARC_AGI_v1_PIPELINE_5Run()
+        self.config = self.test_pipeline.pipeline_config
+        return self.config
+
+    def setUp(self) -> None:
+        super().setUp()
+        self.eval_configs = [
+            self.test_pipeline.evalreporting_comp,
+            self.test_pipeline.best_of_n_evalreporting_comp
+        ]
+
+    def test_outputs_exist(self) -> None:
+        logging.info("Running test_outputs_exist test in PipelineTest")
+        self.assertTrue(any("transformed_data.jsonl" in str(file) for file in self.files))
+        if self.data_reader_config.prompt_template_path:
+            self.assertTrue(any("processed_prompts.jsonl" in str(file) for file in self.files))
+        self.assertTrue(any("inference_result.jsonl" in str(file) for file in self.files))
+        if self.eval_config.metric_config is not None:
+            self.assertTrue(any("metric_results.jsonl" in str(file) for file in self.files))
+        n_aggregators = len([config for eval_config in self.eval_configs for config in eval_config.aggregator_configs])
+        n_aggregator_files = len([file for file in self.files if "aggregator" in str(file)])
+        self.assertEqual(n_aggregators, n_aggregator_files)
+
+
 if __name__ == "__main__":
     unittest.main()