|
2 | 2 | import json |
3 | 3 | import os |
4 | 4 | import re |
| 5 | +import shutil |
5 | 6 | import sys |
6 | 7 | from collections import defaultdict |
7 | 8 | from typing import Any, Dict, List, Tuple |
8 | 9 |
|
9 | 10 | from src.model import Model |
10 | | -from src.utils.capability_utils import parse_python_class_str, read_score_inspect_json |
11 | | -from src.utils.constants import ( |
12 | | - NO_ANSWER_STR, |
13 | | - NON_SEED_CAPABILITIES_SCORE_DIR, |
14 | | - SEED_CAPABILITIES_SCORE_DIR, |
15 | | - TAB_W_SPACES, |
| 11 | +from src.utils import constants |
| 12 | +from src.utils.capability_utils import ( |
| 13 | + parse_python_class_str, |
| 14 | + read_score_inspect_json, |
| 15 | + run_inspect_evals, |
| 16 | +) |
| 17 | +from src.utils.data_utils import ( |
| 18 | + list_dir, |
| 19 | + load_data, |
| 20 | + path_exists, |
| 21 | + transfer_inspect_log_to_gcp, |
16 | 22 | ) |
17 | | -from src.utils.data_utils import load_data |
18 | 23 | from src.utils.prompts import TASK_SOLVER_SYSTEM_PROMPT |
| 24 | +from src.utils.templates import ( |
| 25 | + INSPECT_EVALS_INIT_FILE_TEMPLATE, |
| 26 | + INSPECT_EVALS_README_FILE_TEMPLATE, |
| 27 | + INSPECT_EVALS_SCRIPT_FILE_TEMPLATE, |
| 28 | +) |
19 | 29 |
|
20 | 30 |
|
21 | 31 | class CapabilitySeedDataset: |
@@ -100,9 +110,9 @@ def __init__(self, capability_dir: str) -> None: |
100 | 110 | self._load_capability_repr_class() |
101 | 111 |
|
102 | 112 | self.score_dir = ( |
103 | | - SEED_CAPABILITIES_SCORE_DIR |
| 113 | + constants.SEED_CAPABILITIES_SCORE_DIR |
104 | 114 | if self.is_seed |
105 | | - else NON_SEED_CAPABILITIES_SCORE_DIR |
| 115 | + else constants.NON_SEED_CAPABILITIES_SCORE_DIR |
106 | 116 | ) |
107 | 117 |
|
108 | 118 | @classmethod |
@@ -208,11 +218,11 @@ def load_scores(self, scores_dir: str | None = None) -> Dict[str, float]: |
208 | 218 | """ |
209 | 219 | scores_dir = scores_dir if scores_dir else self.score_dir |
210 | 220 | scores_dict = defaultdict(float) |
211 | | - for model in os.listdir(scores_dir): |
| 221 | + for model in list_dir(scores_dir): |
212 | 222 | scores_file = os.path.join( |
213 | 223 | scores_dir, model, self.domain, f"{self.name}.json" |
214 | 224 | ) |
215 | | - if os.path.isfile(scores_file): |
| 225 | + if path_exists(scores_file): |
216 | 226 | scores_dict[model] = read_score_inspect_json(scores_file) |
217 | 227 | return scores_dict |
218 | 228 |
|
@@ -286,8 +296,8 @@ def add_and_update_tasks(self, tasks: List[Dict[str, Any]]) -> None: |
286 | 296 | # Update the capability class python file |
287 | 297 | # Extract str which contains the repr_tasks dictionary |
288 | 298 | # TODO: Since these are hardcoded, update when the format changes |
289 | | - prefix_str = f"def repr_tasks() -> dict[str, dict]:\n{TAB_W_SPACES}{TAB_W_SPACES}return " |
290 | | - suffix_str = f"\n\n{TAB_W_SPACES}@staticmethod\n{TAB_W_SPACES}def get_instructions(t: dict) -> str:" |
| 299 | + prefix_str = f"def repr_tasks() -> dict[str, dict]:\n{constants.TAB_W_SPACES}{constants.TAB_W_SPACES}return " |
| 300 | + suffix_str = f"\n\n{constants.TAB_W_SPACES}@staticmethod\n{constants.TAB_W_SPACES}def get_instructions(t: dict) -> str:" |
291 | 301 | prev_repr_tasks_str = self.capability_repr_class_str.split(prefix_str)[ |
292 | 302 | 1 |
293 | 303 | ].split(suffix_str)[0] |
@@ -412,7 +422,7 @@ def _solve_task( |
412 | 422 | # and the answer is incomplete? |
413 | 423 | answer_pattern = r"(?i)ANSWER\s*:\s*([^\n]+)" |
414 | 424 | match = re.search(answer_pattern, response) |
415 | | - answer = match.group(1) if match else NO_ANSWER_STR |
| 425 | + answer = match.group(1) if match else constants.NO_ANSWER_STR |
416 | 426 | metadata = { |
417 | 427 | "raw_response": response, |
418 | 428 | "api_metadata": metadata, |
@@ -466,37 +476,152 @@ def get_tasks(self) -> List[Dict[str, Any]]: |
466 | 476 | """ |
467 | 477 | return self._data |
468 | 478 |
|
469 | | - def _create_inspect_file(self) -> None: |
| 479 | + def _create_inspect_file(self, path: str) -> None: |
470 | 480 | """ |
471 | 481 | Implement pipeline to evaluate the capability using the inspect framework. |
472 | 482 |
|
473 | 483 | This involves converting the METR format to inspect solvers and scorers. |
474 | 484 | """ |
475 | | - raise NotImplementedError |
| 485 | + # Create JSONL dataset and store it under the inspect path |
| 486 | + dataset = self.get_tasks() |
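| | +        # Fields other than "id", "problem", and "answer" are surfaced to the |
| | +        # generated eval script as per-task metadata keys. |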
| 487 | + dataset_metadata_keys = [ |
| 488 | + k for k in list(dataset[0].keys()) if k not in ["id", "problem", "answer"] |
| 489 | + ] |
| 490 | + # Write data to a dataset JSONL file |
| 491 | + with open(os.path.join(path, "dataset.jsonl"), "w") as f: |
| 492 | + for elm in dataset: |
| 493 | + f.write(json.dumps(elm) + "\n") |
| 494 | + |
| 495 | + # Create __init__.py and README files |
| 496 | + # TODO: Add more details to the README file |
| 497 | + init_file_content = INSPECT_EVALS_INIT_FILE_TEMPLATE.format( |
| 498 | + capability_name=self.name, |
| 499 | + ).strip("\n") |
| 500 | + with open(os.path.join(path, "__init__.py"), "w") as f: |
| 501 | + f.write(init_file_content) |
| 502 | + readme_file_content = INSPECT_EVALS_README_FILE_TEMPLATE.format( |
| 503 | + capability_name=self.name, |
| 504 | + capability_description=self.description, |
| 505 | + ).strip("\n") |
| 506 | + with open(os.path.join(path, "README.md"), "w") as f: |
| 507 | + f.write(readme_file_content) |
| 508 | + |
| 509 | + # Create inspect evals script file |
| 510 | + # TODO: How to handle more involved score functions? |
| 511 | + # TODO: Do we need system prompt? |
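| | +        # Passing the literal "{prompt}" as the problem text yields an instruction |
| | +        # string with a {prompt} placeholder for the generated eval script to fill. |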
| 512 | + instruction_template = self.capability_repr_class.get_instructions( |
| 513 | + {"problem": "{prompt}"} |
| 514 | + ) |
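| | +        # Convert the class's static `score` method into a standalone async `_score` |
| | +        # function and dedent it by one level for the generated script. |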
| 515 | + score_func_prefix = f"@staticmethod\n{constants.TAB_W_SPACES}def score" |
| 516 | + score_func_prefix_new = ( |
| 517 | + f"async {score_func_prefix.split(constants.TAB_W_SPACES)[1]}".replace( |
| 518 | + "score", "_score" |
| 519 | + ) |
| 520 | + ) |
| 521 | + score_func_str = f"{score_func_prefix_new}{self.capability_repr_class_str.split(score_func_prefix)[1].replace((constants.TAB_W_SPACES + constants.TAB_W_SPACES), constants.TAB_W_SPACES)}".strip( |
| 522 | + "`" |
| 523 | + ).strip("\n") |
| 524 | + script_file_content = INSPECT_EVALS_SCRIPT_FILE_TEMPLATE.format( |
| 525 | + capability_name=self.name, |
| 526 | + dataset_metadata_keys=json.dumps(dataset_metadata_keys), |
| 527 | + prompt_template=instruction_template, |
| 528 | + score_func_t_dict_str='{"answer": target.text}', |
| 529 | + score_func_str=score_func_str, |
| 530 | + ) |
| 531 | + script_file_path = os.path.join(path, f"{self.name}.py") |
| 532 | + with open(script_file_path, "w") as f: |
| 533 | + f.write(script_file_content) |
| 534 | + # TODO: Validate formatting of script file |
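| | +        # Importing the generated module acts as a basic sanity check: it raises |
| | +        # if the emitted script is not valid Python. |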
| 535 | + _ = _import_from_path( |
| 536 | + module_name=f"{self.name}_inspect_eval_script", file_path=script_file_path |
| 537 | + ) |
476 | 538 |
|
477 | | - def _evaluate_using_inspect(self, subject_llm: Model) -> None: # noqa: D102 |
| 539 | + def _evaluate_using_inspect(self, subject_llm: Model, **kwargs: Any) -> None: |
478 | 540 | """ |
479 | | - Evaluate subject LLM on the capability using the inspect framework. |
| 541 | + Evaluate the subject LLM on the capability using the Inspect framework. |
480 | 542 |
|
481 | | - Args |
482 | | - ---- |
483 | | - subject_llm : Model |
484 | | - The LLM to use for evaluation. |
| 543 | + This method uses the Inspect evaluation framework to assess the performance of |
| 544 | + the provided language model (LLM) on a specific capability. It ensures that the |
| 545 | + required evaluation files exist, temporarily stores logs locally, and transfers |
| 546 | + them to a GCP bucket after the evaluation is complete. |
| 547 | +
|
| 548 | + Args: |
| 549 | +            subject_llm (Model): The LLM to evaluate. |
| 550 | +            **kwargs (Any): Additional arguments passed to the Inspect evaluation run. |
| 551 | +
|
| 552 | +        Raises: |
| 553 | +            FileNotFoundError: If the required Inspect evaluation path does |
| 554 | +                not exist. |
485 | 555 | """ |
486 | | - raise NotImplementedError |
| 556 | + inspect_path = os.path.join(constants.BASE_INSPECT_EVALS_DIR, self.name) |
| 557 | + if not os.path.exists(inspect_path): |
| 558 | + raise FileNotFoundError( |
| 559 | + f"Inspect evaluation path does not exist: {inspect_path}. " |
| 560 | + "Please ensure the inspect files are created before evaluation." |
| 561 | + ) |
| 562 | + # Temporarily store the logs locally and then transfer them to the GCP bucket, |
| 563 | + # since Inspect does not support GCP bucket paths for storing logs |
| 564 | + log_dir = os.path.join( |
| 565 | + self.score_dir.replace( |
| 566 | + constants.GCP_BASE_ARTIFACTS_DIR, constants.BASE_ARTIFACTS_DIR |
| 567 | + ), |
| 568 | + subject_llm.get_model_name(), |
| 569 | + self.domain, |
| 570 | + self.name, |
| 571 | + ) |
| 572 | + os.makedirs(log_dir, exist_ok=True) |
487 | 573 |
|
488 | | - def evaluate(self, subject_llms: List[Model]) -> None: |
| 574 | + run_inspect_evals( |
| 575 | + path=self.name, |
| 576 | + model=subject_llm, |
| 577 | + log_dir=log_dir, |
| 578 | + **kwargs, |
| 579 | + ) |
| 580 | + |
| 581 | + # Transfer the logs to the GCP bucket |
| 582 | + transfer_inspect_log_to_gcp( |
| 583 | + src_dir=log_dir, |
| 584 | + gcp_dir=log_dir.replace( |
| 585 | + constants.BASE_ARTIFACTS_DIR, constants.GCP_BASE_ARTIFACTS_DIR |
| 586 | + ), |
| 587 | + ) |
| 588 | + # Remove the local logs |
| 589 | + shutil.rmtree(log_dir) |
| 590 | + |
| 591 | + def evaluate( |
| 592 | + self, subject_llms: List[Model], gen_args: List[Dict[Any, Any]] |
| 593 | + ) -> None: |
489 | 594 | """ |
490 | 595 | Evaluate the provided subject LLMs on the capability. |
491 | 596 |
|
492 | 597 | Args |
493 | 598 | ---- |
494 | 599 | subject_llms : List[Model] |
495 | 600 | The list of LLMs to use for evaluation. |
| 601 | + gen_args : List[Dict[Any, Any]] |
| 602 | + The list of generation configurations corresponding to each LLM. |
496 | 603 | """ |
| 604 | + assert len(subject_llms) == len(gen_args), ( |
| 605 | + "Each subject LLM must have a corresponding generation config." |
| 606 | + ) |
| 607 | + # Create inspect script if evaluating for the first time |
| 608 | + inspect_path = os.path.join(constants.BASE_INSPECT_EVALS_DIR, self.name) |
| 609 | + if not os.path.exists(inspect_path): |
| 610 | + os.makedirs(inspect_path) |
| 611 | + self._create_inspect_file(path=inspect_path) |
| 612 | + |
| 613 | +        # Change dir to where the inspect eval scripts are stored, |
| 614 | +        # because inspect evals does not support absolute paths |
| 615 | + cwd = os.getcwd() |
| 616 | + os.chdir(constants.BASE_INSPECT_EVALS_DIR) |
497 | 617 |         # TODO: Run asynchronously |
498 | | - for model in subject_llms: |
499 | | - self._evaluate_using_inspect(model) |
| 618 | + for model_idx, model in enumerate(subject_llms): |
| 619 | + self._evaluate_using_inspect( |
| 620 | + subject_llm=model, |
| 621 | + **gen_args[model_idx], |
| 622 | + ) |
| 623 | + # Revert to original working dir after evaluation |
| 624 | + os.chdir(cwd) |
500 | 625 |
|
501 | 626 |
|
502 | 627 | def _import_from_path(module_name: str, file_path: str) -> Any: |
|