
Commit 93c8caf (parent f6cc7e0)

Add Phi-specific pipelines that filter out COT

File tree: 3 files changed, 84 additions and 3 deletions

eureka_ml_insights/data_utils/arc_agi_utils.py

Lines changed: 30 additions & 0 deletions

@@ -37,3 +37,33 @@ def parse_output_answer(response):
         answer = response[start_index:end_index].strip()
 
         return answer
+
+
+@dataclass
+class ARCAGI_CleanCOTAnswer(DFTransformBase):
+    model_output_column: str
+    model_answer_column: str
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        df[self.model_answer_column] = df[self.model_output_column].apply(self.parse_output_answer)
+        return df
+
+    @staticmethod
+    def parse_output_answer(response):
+        """
+        Strip chain-of-thought text through the closing </think> tag and replace
+        None responses with an empty string.
+        Parameters:
+            response (str): Possibly None response string.
+        Returns:
+            answer (str): Text after </think>, "" for None, or the original
+                response when no tag is present.
+        """
+        if response is None:
+            return ""
+
+        # find() returns -1 when the tag is absent; check before adding the
+        # offset, otherwise the missing-tag case can never be detected.
+        tag_index = response.find("</think>")
+        if tag_index == -1:
+            return response
+
+        return response[tag_index + len("</think>"):]
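For reference, a minimal sketch of how the new transform behaves on a toy DataFrame; the sample strings and column names below are illustrative, not from the repository:

import pandas as pd

from eureka_ml_insights.data_utils.arc_agi_utils import ARCAGI_CleanCOTAnswer

# Hypothetical model outputs: one with a chain-of-thought block, one without, one None.
df = pd.DataFrame(
    {
        "cot_model_output": [
            "<think>try a 3x3 grid...</think>\nThe answer is 8",
            "The answer is 8",
            None,
        ]
    }
)

transform = ARCAGI_CleanCOTAnswer("cot_model_output", "post_cot_model_output")
df = transform.transform(df)

# Everything up to and including </think> is removed; None becomes "".
print(df["post_cot_model_output"].tolist())
# ['\nThe answer is 8', 'The answer is 8', '']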

eureka_ml_insights/user_configs/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -8,6 +8,8 @@
 from .arc_agi import (
     ARC_AGI_v1_PIPELINE,
     ARC_AGI_v1_PIPELINE_5Run,
+    Phi_ARC_AGI_v1_PIPELINE,
+    Phi_ARC_AGI_v1_PIPELINE_5Run,
 )
 from .ba_calendar import (
     BA_Calendar_Parallel_PIPELINE,

eureka_ml_insights/user_configs/arc_agi.py

Lines changed: 52 additions & 3 deletions

@@ -6,6 +6,7 @@
 from eureka_ml_insights.core.eval_reporting import EvalReporting
 from eureka_ml_insights.data_utils.arc_agi_utils import (
     ARCAGI_ExtractAnswer,
+    ARCAGI_CleanCOTAnswer,
 )
 from eureka_ml_insights.data_utils.data import (
     DataLoader,
@@ -91,13 +92,29 @@ def configure_pipeline(self, model_config=None, resume_from=None, resume_logdir=
         if resume_logdir:
             self.log_dir = resume_from.split("/")[0:len(resume_from.split("/")) - 1]
 
+        # Configure the data post processing component.
+        self.data_post_processing = DataProcessingConfig(
+            component_type=DataProcessing,
+            data_reader_config=DataSetConfig(
+                DataReader,
+                {
+                    "path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl"),
+                    "format": ".jsonl",
+                    "transform": SequenceTransform([]),
+                },
+            ),
+            output_dir=os.path.join(self.log_dir, "data_post_processing_output"),
+        )
+
         # Configure the evaluation and reporting component for evaluation and dataset level aggregation
         self.evalreporting_comp = EvalReportingConfig(
             component_type=EvalReporting,
             data_reader_config=DataSetConfig(
                 DataReader,
                 {
-                    "path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl"),
+                    "path": os.path.join(self.data_post_processing.output_dir, "transformed_data.jsonl"),
                     "format": ".jsonl",
                     "transform": SequenceTransform(
                         [
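In the base pipeline the new component's transform list is empty, so this step is a pass-through: inference_result.jsonl is re-emitted as transformed_data.jsonl for the evaluation stage, and subclasses can inject transforms into the empty slot. A minimal sketch of that identity behavior, assuming SequenceTransform applies its transforms in order through the same transform(df) interface the DFTransformBase classes above use (import path assumed):

import pandas as pd

from eureka_ml_insights.data_utils import SequenceTransform  # assumed import path

df = pd.DataFrame({"model_output": ["The answer is 8"]})

# With no transforms configured, the frame passes through unchanged.
assert SequenceTransform([]).transform(df).equals(df)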
@@ -126,8 +143,6 @@ def configure_pipeline(self, model_config=None, resume_from=None, resume_logdir=
                         "group_by": "split",
                     },
                 ),
-                # the next three reports take the average and std for all repeats
-                # the resulting numbers are the average and std of N pass@1 scores, where N is number of repeats
                 AggregatorConfig(
                     CountAggregator,
                     {
@@ -206,6 +221,7 @@ def configure_pipeline(self, model_config=None, resume_from=None, resume_logdir=
             [
                 self.data_processing_comp,
                 self.inference_comp,
+                self.data_post_processing,
                 self.evalreporting_comp,
                 self.posteval_data_post_processing_comp,
                 self.best_of_n_evalreporting_comp,
@@ -214,6 +230,25 @@ def configure_pipeline(self, model_config=None, resume_from=None, resume_logdir=
         )
 
 
+class Phi_ARC_AGI_v1_PIPELINE(ARC_AGI_v1_PIPELINE):
+    def configure_pipeline(self, model_config=None, resume_from=None, **kwargs):
+        config = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
+        # Post processing for Phi models: strip the chain-of-thought text from
+        # the model output before it reaches evaluation.
+        self.data_post_processing.data_reader_config.init_args["transform"] = SequenceTransform(
+            [
+                ColumnRename(
+                    name_mapping={
+                        "model_output": "cot_model_output",
+                    }
+                ),
+                AddColumn("post_cot_model_output"),
+                ARCAGI_CleanCOTAnswer("cot_model_output", "post_cot_model_output"),
+                CopyColumn("post_cot_model_output", "model_output"),
+            ]
+        )
+        return config
+
+
 class ARC_AGI_v1_PIPELINE_5Run(ARC_AGI_v1_PIPELINE):
     """This class specifies the config for running the ARC-AGI v1 benchmark with 5 repeats"""
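Read end to end, the chain preserves the raw Phi output in cot_model_output and overwrites model_output with only the post-COT text, which is what the evaluation stage scores. A minimal sketch of the same SequenceTransform applied directly to a toy frame (import paths assumed; the sample string is invented):

import pandas as pd

from eureka_ml_insights.data_utils import (  # assumed import path
    AddColumn,
    ColumnRename,
    CopyColumn,
    SequenceTransform,
)
from eureka_ml_insights.data_utils.arc_agi_utils import ARCAGI_CleanCOTAnswer

chain = SequenceTransform(
    [
        ColumnRename(name_mapping={"model_output": "cot_model_output"}),
        AddColumn("post_cot_model_output"),
        ARCAGI_CleanCOTAnswer("cot_model_output", "post_cot_model_output"),
        CopyColumn("post_cot_model_output", "model_output"),
    ]
)

df = pd.DataFrame({"model_output": ["<think>scan the grid...</think>The answer is 8"]})
df = chain.transform(df)

# The raw output survives in cot_model_output; model_output now holds only the
# text after </think>.
print(df.loc[0, "model_output"])  # "The answer is 8"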

@@ -226,3 +261,17 @@ def configure_pipeline(
             MultiplyTransform(n_repeats=5)
         )
         return pipeline
+
+
+class Phi_ARC_AGI_v1_PIPELINE_5Run(Phi_ARC_AGI_v1_PIPELINE):
+    """This class specifies the config for running the Phi ARC-AGI v1 pipeline with 5 repeats"""
+
+    def configure_pipeline(
+        self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
+    ) -> PipelineConfig:
+        pipeline = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
+        # Data preprocessing: repeat each sample 5 times.
+        self.data_processing_comp.data_reader_config.init_args["transform"].transforms.append(
+            MultiplyTransform(n_repeats=5)
+        )
+        return pipeline
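The 5-run variants only append MultiplyTransform(n_repeats=5) to the preprocessing transform list, duplicating every input row five times so each task is attempted five times downstream. A rough sketch of that behavior, assuming MultiplyTransform exposes the same transform(df) interface as the other transforms (import path assumed):

import pandas as pd

from eureka_ml_insights.data_utils import MultiplyTransform  # assumed import path

df = pd.DataFrame({"prompt": ["task-1", "task-2"]})
repeated = MultiplyTransform(n_repeats=5).transform(df)

# 2 tasks x 5 repeats = 10 rows; downstream aggregators group the repeats to
# report the average and std of the per-repeat pass@1 scores.
print(len(repeated))  # 10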
