|
29 | 29 | ExtractUsageTransform, |
30 | 30 | MajorityVoteTransform, |
31 | 31 | MultiplyTransform, |
| 32 | + ReplaceStringsTransform, |
32 | 33 | RunPythonTransform, |
33 | 34 | SamplerTransform, |
34 | 35 | SequenceTransform, |
@@ -64,6 +65,11 @@ def configure_pipeline(self, model_config=None, resume_from=None, resume_logdir= |
64 | 65 | { |
65 | 66 | "path": "pxferna/ARC-AGI-v1", |
66 | 67 | "split": "test", |
| 68 | + "transform": SequenceTransform( |
| 69 | + [ |
| 70 | + MultiplyTransform(n_repeats=1), |
| 71 | + ] |
| 72 | + ), |
67 | 73 | } |
68 | 74 | ), |
69 | 75 | output_dir=os.path.join(self.log_dir, "data_processing_output"), |
@@ -135,12 +141,88 @@ def configure_pipeline(self, model_config=None, resume_from=None, resume_logdir= |
135 | 141 | output_dir=os.path.join(self.log_dir, "eval_report"), |
136 | 142 | ) |
137 | 143 |
|
| 144 | + self.posteval_data_post_processing_comp = DataProcessingConfig( |
| 145 | + component_type=DataProcessing, |
| 146 | + data_reader_config=DataSetConfig( |
| 147 | + DataReader, |
| 148 | + { |
| 149 | + "path": os.path.join(self.evalreporting_comp.output_dir, "metric_results.jsonl"), |
| 150 | + "format": ".jsonl", |
| 151 | + "transform": SequenceTransform( |
| 152 | + [ |
| 153 | + CopyColumn( |
| 154 | + column_name_src="ExactMatch_result", |
| 155 | + column_name_dst="ExactMatch_result_numeric", |
| 156 | + ), |
| 157 | + ReplaceStringsTransform( |
| 158 | + columns=["ExactMatch_result_numeric"], |
| 159 | + mapping={'incorrect': '0', 'correct': '1', 'none': 'NaN'}, |
| 160 | + case=False) |
| 161 | + ] |
| 162 | + ), |
| 163 | + }, |
| 164 | + ), |
| 165 | + output_dir=os.path.join(self.log_dir, "posteval_data_post_processing_output"), |
| 166 | + ) |
| 167 | + |
| 168 | + self.best_of_n_evalreporting_comp = EvalReportingConfig( |
| 169 | + component_type=EvalReporting, |
| 170 | + data_reader_config=DataSetConfig( |
| 171 | + DataReader, |
| 172 | + { |
| 173 | + "path": os.path.join(self.posteval_data_post_processing_comp.output_dir, "transformed_data.jsonl"), |
| 174 | + "format": ".jsonl" |
| 175 | + }, |
| 176 | + ), |
| 177 | + aggregator_configs=[ |
| 178 | + AggregatorConfig( |
| 179 | + BiLevelAggregator, |
| 180 | + { |
| 181 | + "column_names": [ |
| 182 | + "ExactMatch_result_numeric", |
| 183 | + ], |
| 184 | + "first_groupby": "uid", |
| 185 | + "filename_base": "ExactMatch_Total_BestOfN", |
| 186 | + }), |
| 187 | + # this report aggregates results by uid and takes the best out of N, then groups by split |
| 188 | + AggregatorConfig( |
| 189 | + BiLevelAggregator, |
| 190 | + { |
| 191 | + "column_names": [ |
| 192 | + "ExactMatch_result_numeric" |
| 193 | + ], |
| 194 | + "first_groupby": "uid", |
| 195 | + "second_groupby": "split", |
| 196 | + "filename_base": "ExactMatch_Grouped_BestOfN", |
| 197 | + "agg_fn": "max" |
| 198 | + }, |
| 199 | + ), |
| 200 | + ], |
| 201 | + output_dir=os.path.join(self.log_dir, "bestofn_eval_report"), |
| 202 | + ) |
| 203 | + |
138 | 204 | # Configure the pipeline |
139 | 205 | return PipelineConfig( |
140 | 206 | [ |
141 | 207 | self.data_processing_comp, |
142 | 208 | self.inference_comp, |
143 | 209 | self.evalreporting_comp, |
| 210 | + self.posteval_data_post_processing_comp, |
| 211 | + self.best_of_n_evalreporting_comp, |
144 | 212 | ], |
145 | 213 | self.log_dir, |
146 | 214 | ) |
| 215 | + |
| 216 | + |
| 217 | +class ARC_AGI_v1_PIPELINE_5Run(ARC_AGI_v1_PIPELINE): |
| 218 | + """This class specifies the config for running the ARC-AGI v1 benchmark 5 repeated times""" |
| 219 | + |
| 220 | + def configure_pipeline( |
| 221 | + self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any] |
| 222 | + ) -> PipelineConfig: |
| 223 | + pipeline = super().configure_pipeline(model_config=model_config, resume_from=resume_from) |
| 224 | + # data preprocessing: append MultiplyTransform(n_repeats=5) to the parent's transform sequence |
| 225 | + self.data_processing_comp.data_reader_config.init_args["transform"].transforms.append( |
| 226 | + MultiplyTransform(n_repeats=5) |
| 227 | + ) |
| 228 | + return pipeline |
0 commit comments