
Commit 619639a

Implement task evaluation pipeline
Implement the task evaluation pipeline with vanilla score function.
2 parents 0324e19 + dfac257, commit 619639a

14 files changed (+1422, -91 lines)

README.md

Lines changed: 10 additions & 0 deletions
@@ -18,6 +18,14 @@ run:
 python3 -m poetry install --with test
 ```

+### [Optional] Google Cloud Authentication
+
+The capability evaluation logs (evaluated using [Inspect](https://inspect.aisi.org.uk/)) are stored in a GCP bucket. Use the following command to log in using your GCP account:
+
+```bash
+gcloud auth application-default login
+```
+
 ### Run pipeline with default config

 Note: Please set the following env vars before running the command.
@@ -30,6 +38,8 @@ Note: Please set the following env vars before running the command.
 - LANGSMITH_ENDPOINT="https://api.smith.langchain.com"
 - LANGSMITH_API_KEY=<langsmith_api_key>
 - LANGSMITH_PROJECT="automated_capability_evaluation"
+- GCP env vars:
+  - GOOGLE_CLOUD_PROJECT=<project_id>

 ```bash
 python3 src/run.py
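The new authentication step and the GOOGLE_CLOUD_PROJECT variable exist so that the google-cloud-storage client added in pyproject.toml can pick up Application Default Credentials when evaluation logs are written to the GCP bucket. The repository's own upload helper is not shown in this commit; the sketch below only illustrates how ADC and the project env var are typically consumed, and the `upload_inspect_log` name and its arguments are hypothetical.

```python
import os

# Illustrative only: how Application Default Credentials (set up via
# `gcloud auth application-default login`) are typically consumed.
# The helper name and arguments are hypothetical, not the repo's API.
from google.cloud import storage


def upload_inspect_log(local_path: str, blob_name: str) -> None:
    # storage.Client() discovers ADC automatically; the project comes from
    # the GOOGLE_CLOUD_PROJECT env var documented in the README.
    client = storage.Client(project=os.environ["GOOGLE_CLOUD_PROJECT"])
    bucket = client.bucket("ace-artifacts")  # matches results_dir: gs://ace-artifacts
    bucket.blob(blob_name).upload_from_filename(local_path)
```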

poetry.lock

Lines changed: 870 additions & 21 deletions
Some generated files are not rendered by default; the poetry.lock diff is omitted.

pyproject.toml

Lines changed: 3 additions & 1 deletion
@@ -11,11 +11,13 @@ authors = [
 dynamic = ["version"]
 dependencies = [
     "datasets>=3.2.0",
+    "google-cloud-storage>=3.0.0",
     "hydra-core>=1.3.2",
+    "inspect-ai>=0.3.80",
     "langchain_openai>=0.3.6",
     "langchain>=0.3.19",
     "omegaconf>=2.3.0",
-    "openai>=1.61.1",
+    "openai>=1.68.0",
     "ratelimit>=2.2.1",
     "torchvision (>=0.21.0,<0.22.0)",
     "torchaudio (>=2.6.0,<3.0.0)",

src/capability.py

Lines changed: 151 additions & 26 deletions
@@ -2,20 +2,30 @@
 import json
 import os
 import re
+import shutil
 import sys
 from collections import defaultdict
 from typing import Any, Dict, List, Tuple

 from src.model import Model
-from src.utils.capability_utils import parse_python_class_str, read_score_inspect_json
-from src.utils.constants import (
-    NO_ANSWER_STR,
-    NON_SEED_CAPABILITIES_SCORE_DIR,
-    SEED_CAPABILITIES_SCORE_DIR,
-    TAB_W_SPACES,
+from src.utils import constants
+from src.utils.capability_utils import (
+    parse_python_class_str,
+    read_score_inspect_json,
+    run_inspect_evals,
+)
+from src.utils.data_utils import (
+    list_dir,
+    load_data,
+    path_exists,
+    transfer_inspect_log_to_gcp,
 )
-from src.utils.data_utils import load_data
 from src.utils.prompts import TASK_SOLVER_SYSTEM_PROMPT
+from src.utils.templates import (
+    INSPECT_EVALS_INIT_FILE_TEMPLATE,
+    INSPECT_EVALS_README_FILE_TEMPLATE,
+    INSPECT_EVALS_SCRIPT_FILE_TEMPLATE,
+)


 class CapabilitySeedDataset:
@@ -100,9 +110,9 @@ def __init__(self, capability_dir: str) -> None:
         self._load_capability_repr_class()

         self.score_dir = (
-            SEED_CAPABILITIES_SCORE_DIR
+            constants.SEED_CAPABILITIES_SCORE_DIR
             if self.is_seed
-            else NON_SEED_CAPABILITIES_SCORE_DIR
+            else constants.NON_SEED_CAPABILITIES_SCORE_DIR
         )

     @classmethod
@@ -208,11 +218,11 @@ def load_scores(self, scores_dir: str | None = None) -> Dict[str, float]:
         """
         scores_dir = scores_dir if scores_dir else self.score_dir
         scores_dict = defaultdict(float)
-        for model in os.listdir(scores_dir):
+        for model in list_dir(scores_dir):
             scores_file = os.path.join(
                 scores_dir, model, self.domain, f"{self.name}.json"
             )
-            if os.path.isfile(scores_file):
+            if path_exists(scores_file):
                 scores_dict[model] = read_score_inspect_json(scores_file)
         return scores_dict

@@ -286,8 +296,8 @@ def add_and_update_tasks(self, tasks: List[Dict[str, Any]]) -> None:
         # Update the capability class python file
         # Extract str which contains the repr_tasks dictionary
         # TODO: Since these are hardcoded, update when the format changes
-        prefix_str = f"def repr_tasks() -> dict[str, dict]:\n{TAB_W_SPACES}{TAB_W_SPACES}return "
-        suffix_str = f"\n\n{TAB_W_SPACES}@staticmethod\n{TAB_W_SPACES}def get_instructions(t: dict) -> str:"
+        prefix_str = f"def repr_tasks() -> dict[str, dict]:\n{constants.TAB_W_SPACES}{constants.TAB_W_SPACES}return "
+        suffix_str = f"\n\n{constants.TAB_W_SPACES}@staticmethod\n{constants.TAB_W_SPACES}def get_instructions(t: dict) -> str:"
         prev_repr_tasks_str = self.capability_repr_class_str.split(prefix_str)[
             1
         ].split(suffix_str)[0]
@@ -412,7 +422,7 @@ def _solve_task(
         # and the answer is incomplete?
         answer_pattern = r"(?i)ANSWER\s*:\s*([^\n]+)"
         match = re.search(answer_pattern, response)
-        answer = match.group(1) if match else NO_ANSWER_STR
+        answer = match.group(1) if match else constants.NO_ANSWER_STR
         metadata = {
             "raw_response": response,
             "api_metadata": metadata,
@@ -466,37 +476,152 @@ def get_tasks(self) -> List[Dict[str, Any]]:
         """
         return self._data

-    def _create_inspect_file(self) -> None:
+    def _create_inspect_file(self, path: str) -> None:
         """
         Implement pipeline to evaluate the capability using the inspect framework.

         This involves converting the METR format to inspect solvers and scorers.
         """
-        raise NotImplementedError
+        # Create JSONL dataset and store it under the inspect path
+        dataset = self.get_tasks()
+        dataset_metadata_keys = [
+            k for k in list(dataset[0].keys()) if k not in ["id", "problem", "answer"]
+        ]
+        # Write data to a dataset JSONL file
+        with open(os.path.join(path, "dataset.jsonl"), "w") as f:
+            for elm in dataset:
+                f.write(json.dumps(elm) + "\n")
+
+        # Create __init__.py and README files
+        # TODO: Add more details to the README file
+        init_file_content = INSPECT_EVALS_INIT_FILE_TEMPLATE.format(
+            capability_name=self.name,
+        ).strip("\n")
+        with open(os.path.join(path, "__init__.py"), "w") as f:
+            f.write(init_file_content)
+        readme_file_content = INSPECT_EVALS_README_FILE_TEMPLATE.format(
+            capability_name=self.name,
+            capability_description=self.description,
+        ).strip("\n")
+        with open(os.path.join(path, "README.md"), "w") as f:
+            f.write(readme_file_content)
+
+        # Create inspect evals script file
+        # TODO: How to handle more involved score functions?
+        # TODO: Do we need system prompt?
+        instruction_template = self.capability_repr_class.get_instructions(
+            {"problem": "{prompt}"}
+        )
+        score_func_prefix = f"@staticmethod\n{constants.TAB_W_SPACES}def score"
+        score_func_prefix_new = (
+            f"async {score_func_prefix.split(constants.TAB_W_SPACES)[1]}".replace(
+                "score", "_score"
+            )
+        )
+        score_func_str = f"{score_func_prefix_new}{self.capability_repr_class_str.split(score_func_prefix)[1].replace((constants.TAB_W_SPACES + constants.TAB_W_SPACES), constants.TAB_W_SPACES)}".strip(
+            "`"
+        ).strip("\n")
+        script_file_content = INSPECT_EVALS_SCRIPT_FILE_TEMPLATE.format(
+            capability_name=self.name,
+            dataset_metadata_keys=json.dumps(dataset_metadata_keys),
+            prompt_template=instruction_template,
+            score_func_t_dict_str='{"answer": target.text}',
+            score_func_str=score_func_str,
+        )
+        script_file_path = os.path.join(path, f"{self.name}.py")
+        with open(script_file_path, "w") as f:
+            f.write(script_file_content)
+        # TODO: Validate formatting of script file
+        _ = _import_from_path(
+            module_name=f"{self.name}_inspect_eval_script", file_path=script_file_path
+        )

-    def _evaluate_using_inspect(self, subject_llm: Model) -> None:  # noqa: D102
+    def _evaluate_using_inspect(self, subject_llm: Model, **kwargs: Any) -> None:
         """
-        Evaluate subject LLM on the capability using the inspect framework.
+        Evaluate the subject LLM on the capability using the Inspect framework.

-        Args
-        ----
-        subject_llm : Model
-            The LLM to use for evaluation.
+        This method uses the Inspect evaluation framework to assess the performance of
+        the provided language model (LLM) on a specific capability. It ensures that the
+        required evaluation files exist, temporarily stores logs locally, and transfers
+        them to a GCP bucket after the evaluation is complete.
+
+        Args:
+            subject_llm (Model): The LLM model to evaluate.
+            **kwargs (Any): Additional args for running the evals.
+
+        Raises
+        ------
+        FileNotFoundError: If the required Inspect evaluation path does not exist.
         """
-        raise NotImplementedError
+        inspect_path = os.path.join(constants.BASE_INSPECT_EVALS_DIR, self.name)
+        if not os.path.exists(inspect_path):
+            raise FileNotFoundError(
+                f"Inspect evaluation path does not exist: {inspect_path}. "
+                "Please ensure the inspect files are created before evaluation."
+            )
+        # Temporarily store the logs locally and then transfer them to the GCP bucket,
+        # since Inspect does not support GCP bucket paths for storing logs
+        log_dir = os.path.join(
+            self.score_dir.replace(
+                constants.GCP_BASE_ARTIFACTS_DIR, constants.BASE_ARTIFACTS_DIR
+            ),
+            subject_llm.get_model_name(),
+            self.domain,
+            self.name,
+        )
+        os.makedirs(log_dir, exist_ok=True)

-    def evaluate(self, subject_llms: List[Model]) -> None:
+        run_inspect_evals(
+            path=self.name,
+            model=subject_llm,
+            log_dir=log_dir,
+            **kwargs,
+        )
+
+        # Transfer the logs to the GCP bucket
+        transfer_inspect_log_to_gcp(
+            src_dir=log_dir,
+            gcp_dir=log_dir.replace(
+                constants.BASE_ARTIFACTS_DIR, constants.GCP_BASE_ARTIFACTS_DIR
+            ),
+        )
+        # Remove the local logs
+        shutil.rmtree(log_dir)
+
+    def evaluate(
+        self, subject_llms: List[Model], gen_args: List[Dict[Any, Any]]
+    ) -> None:
         """
         Evaluate the provided subject LLMs on the capability.

         Args
         ----
         subject_llms : List[Model]
             The list of LLMs to use for evaluation.
+        gen_args : List[Dict[Any, Any]]
+            The list of generation configurations corresponding to each LLM.
         """
+        assert len(subject_llms) == len(gen_args), (
+            "Each subject LLM must have a corresponding generation config."
+        )
+        # Create inspect script if evaluating for the first time
+        inspect_path = os.path.join(constants.BASE_INSPECT_EVALS_DIR, self.name)
+        if not os.path.exists(inspect_path):
+            os.makedirs(inspect_path)
+            self._create_inspect_file(path=inspect_path)
+
+        # Change dir to where inspect eval scripts are stored
+        # because inspect evals does not support non-relative paths
+        cwd = os.getcwd()
+        os.chdir(constants.BASE_INSPECT_EVALS_DIR)
         # TODO: Run asynchronously
-        for model in subject_llms:
-            self._evaluate_using_inspect(model)
+        for model_idx, model in enumerate(subject_llms):
+            self._evaluate_using_inspect(
                subject_llm=model,
                **gen_args[model_idx],
            )
+        # Revert to original working dir after evaluation
+        os.chdir(cwd)


 def _import_from_path(module_name: str, file_path: str) -> Any:
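`_create_inspect_file` renders `INSPECT_EVALS_SCRIPT_FILE_TEMPLATE` into `<capability_name>.py`, but the template itself is not part of this diff, so the exact generated script is not visible here. The sketch below shows one plausible shape for the generated script, assuming the standard `inspect_ai` Task/solver/scorer API; the `example_capability` task name, the prompt text, the empty metadata list, and the transplanted `_score` body are placeholders for what the template would fill in from the capability class and `dataset_metadata_keys`.

```python
"""Hypothetical rendering of INSPECT_EVALS_SCRIPT_FILE_TEMPLATE for one capability."""

from inspect_ai import Task, task
from inspect_ai.dataset import FieldSpec, json_dataset
from inspect_ai.scorer import CORRECT, INCORRECT, Score, Target, accuracy, scorer, stderr
from inspect_ai.solver import TaskState, generate, prompt_template

# Placeholder: filled from capability_repr_class.get_instructions({"problem": "{prompt}"}).
PROMPT_TEMPLATE = "Solve the following problem. End with 'ANSWER: <answer>'.\n\n{prompt}"


async def _score(t: dict, submission: str) -> float:
    # Placeholder: body transplanted from the capability class's static `score`
    # method by _create_inspect_file (the "vanilla" score function).
    return 1.0 if submission.strip() == t["answer"].strip() else 0.0


@scorer(metrics=[accuracy(), stderr()])
def capability_scorer():
    async def score(state: TaskState, target: Target) -> Score:
        # Mirrors score_func_t_dict_str='{"answer": target.text}' in the template call.
        value = await _score({"answer": target.text}, state.output.completion)
        return Score(
            value=CORRECT if value == 1.0 else INCORRECT,
            answer=state.output.completion,
        )

    return score


@task
def example_capability() -> Task:
    return Task(
        dataset=json_dataset(
            "dataset.jsonl",
            FieldSpec(input="problem", target="answer", id="id", metadata=[]),
        ),
        solver=[prompt_template(PROMPT_TEMPLATE), generate()],
        scorer=capability_scorer(),
    )
```

If `run_inspect_evals` is a thin wrapper around `inspect_ai.eval()`, the `temperature` and `max_tokens` entries forwarded from `gen_args` would presumably map onto Inspect's generation-config options.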

src/cfg/run_cfg.yaml

Lines changed: 9 additions & 2 deletions
@@ -12,14 +12,18 @@ scientist_llm:
   max_tokens: 64

 subject_llm:
-  name: Meta-Llama-3.1-70B-Instruct
+  name: gpt-4o-mini # Meta-Llama-3.1-70B-Instruct
+  generation_cfg:
+    temperature: 0.7
+    max_tokens: 8

 prompt_cfg:
   sys_msg: Complete the given task to the best of your ability.

 capabilities_cfg:
   capabilities_dir: /fs01/projects/aieng/public/ace/artifacts
-  results_dir: /fs01/projects/aieng/public/ace/artifacts
+  results_dir: gs://ace-artifacts
+  inspect_evals_dir: /fs01/projects/aieng/public/ace/inspect_evals/src/ace_evals
   domain: math
   # Number of seed capabilities to use for initial capability generation
   # Set to -1 to use all seed capabilities
@@ -33,6 +37,9 @@ capabilities_cfg:
   # Set this flag to true to use representative tasks
   # as few shot examples for task generation
   task_gen_few_shot: true
+  # Number of tasks to evaluate for each capability
+  # Set to -1 to evaluate all tasks
+  num_eval_tasks_per_capability: 1

 lbo_cfg:
   # Number of capabilities to generate using LBO
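The new `generation_cfg` block and the `gen_args` parameter added to `Capability.evaluate` (see src/capability.py above) fit together. A minimal sketch of how `src/run.py` might wire them up is shown below; the function name and loop are hypothetical, since run.py is not part of this diff.

```python
from omegaconf import DictConfig, OmegaConf


def evaluate_capabilities(cfg: DictConfig, capabilities, subject_llm) -> None:
    # One generation config per subject LLM, mirroring the new
    # Capability.evaluate(subject_llms=..., gen_args=...) signature.
    gen_args = [OmegaConf.to_container(cfg.subject_llm.generation_cfg, resolve=True)]
    for capability in capabilities:
        capability.evaluate(subject_llms=[subject_llm], gen_args=gen_args)
```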

src/create_seed_capabilities.py

Lines changed: 3 additions & 3 deletions
@@ -8,7 +8,7 @@
 from omegaconf import DictConfig

 from capability import CapabilitySeedDataset
-from utils.constants import GSM8K_SCORE_FUNC
+from utils import constants
 from utils.templates import CAPABILITY_CLASS_TEMPLATE


@@ -229,7 +229,7 @@ def main(cfg: DictConfig) -> None:
         capability_data=math_tasks["tasks"],
         capability_repr_tasks=capability_repr_tasks,
         capability_instructions=capability_instructions,
-        capability_score_func=GSM8K_SCORE_FUNC.strip(
+        capability_score_func=constants.GSM8K_SCORE_FUNC.strip(
             "\n"
         ),  # TODO: Change this to MATHEMATICS_SCORE_FUNC after figuring out how to implement complex score functions
         source_dataset=dataset.name,
@@ -266,7 +266,7 @@ def main(cfg: DictConfig) -> None:
         capability_data=gsm_tasks,
         capability_repr_tasks=capability_repr_tasks,
         capability_instructions=capability_instructions,
-        capability_score_func=GSM8K_SCORE_FUNC.strip("\n"),
+        capability_score_func=constants.GSM8K_SCORE_FUNC.strip("\n"),
         source_dataset=dataset.name,
     )
     print(f"Created capability {capability_name} with {len(gsm_tasks)} tasks.")

src/generate_capabilities.py

Lines changed: 8 additions & 4 deletions
@@ -7,8 +7,8 @@

 from src.capability import Capability
 from src.model import Model
+from src.utils import constants
 from src.utils.capability_utils import extract_and_parse_response
-from src.utils.constants import BASE_ARTIFACTS_DIR
 from src.utils.prompts import (
     CAPABILITY_GENERATION_SYSTEM_PROMPT,
     CAPABILITY_GENERATION_USER_PROMPT,
@@ -159,7 +159,9 @@ def generate_capabilities_using_llm(
         and metadata about the generation process.
     """
     # Select seed capabilities
-    seed_capability_dir = os.path.join(BASE_ARTIFACTS_DIR, "seed_capabilities", domain)
+    seed_capability_dir = os.path.join(
+        constants.BASE_ARTIFACTS_DIR, "seed_capabilities", domain
+    )
     seed_capabilities = _sample_seed_capabilities(
         seed_capability_dir=seed_capability_dir,
         num_seed_capabilities=num_seed_capabilities,
@@ -263,11 +265,13 @@ def generate_capabilities(
     # Set the base capability directory
     if "trial_run" in kwargs:
         base_capability_dir = os.path.join(
-            BASE_ARTIFACTS_DIR, f"capabilities_{kwargs['run_id']}", domain
+            constants.BASE_ARTIFACTS_DIR, f"capabilities_{kwargs['run_id']}", domain
         )
         os.makedirs(base_capability_dir, exist_ok=True)
     else:
-        base_capability_dir = os.path.join(BASE_ARTIFACTS_DIR, "capabilities", domain)
+        base_capability_dir = os.path.join(
+            constants.BASE_ARTIFACTS_DIR, "capabilities", domain
+        )

     # Fetch previously generated capabilities, if any
     prev_capabilities = _get_previous_capabilities(capability_dir=base_capability_dir)
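Both this file and src/create_seed_capabilities.py switch from importing individual names to importing the `constants` module itself. One practical benefit, assuming the project's pytest-based test setup, is that constants can be monkeypatched at the module level without touching call sites; the test below is only an illustration of that pattern, not a test from the repository.

```python
from src.utils import constants


def test_artifacts_dir_can_be_redirected(tmp_path, monkeypatch):
    # Because call sites now read constants.BASE_ARTIFACTS_DIR at call time,
    # patching the module attribute redirects every path built from it.
    monkeypatch.setattr(constants, "BASE_ARTIFACTS_DIR", str(tmp_path))
    assert constants.BASE_ARTIFACTS_DIR == str(tmp_path)
```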
