
Commit 160ca1a

Review doc-strings and update instructions
2 parents: 04bad1f + 518eb9c

16 files changed: +210, -216 lines

README.md

Lines changed: 16 additions & 8 deletions

````diff
@@ -1,6 +1,8 @@
-## 🧑🏿‍💻 Developing
+# ACE
 
-### Installing dependencies
+ACE (Active learning for Capability Evaluation) is a novel framework that uses active learning and powerful language models to automate fine-grained evaluation of foundation models. It enables scalable, adaptive testing that uncovers strengths and weaknesses beyond static benchmarks.
+
+## Installing dependencies
 
 The development environment can be set up using
 [poetry](https://python-poetry.org/docs/#installation). Hence, make sure it is
@@ -18,17 +20,17 @@ run:
 python3 -m poetry install --with test
 ```
 
-### [Optional] Google Cloud Authentication
+#### [Optional] Google Cloud Authentication
 
 The capability evaluation logs (evaluated using [Inspect](https://inspect.aisi.org.uk/)) are stored in a GCP bucket. Use the following command to log in using your GCP account:
 
 ```bash
 gcloud auth application-default login
 ```
 
-### Run pipeline
+## Run pipeline
 
-#### Configuration
+### Configuration
 
 1. Set environment variables:
 
@@ -48,19 +50,25 @@ gcloud auth application-default login
 
 2. Modify `src/cfg/run_cfg.yaml`, if required.
 
-#### Capability Generation using the scientist LLM
+### Capability Generation using the scientist LLM
+
+Generates capability names and descriptions in the first step. In the second step, for each capability, it generates tasks, solves them, and verifies the solutions.
 
 ```bash
 python3 src/run_capability_generation.py
 ```
 
-#### Evaluation of subject LLM on generated capabilities
+### Evaluation of subject LLM on generated capabilities
+
+Evaluates the subject LLM on the generated capabilities and calculates a score for each.
 
 ```bash
 python3 src/run_evaluation.py
 ```
 
-#### Run active learning pipeline
+### Capability selection/generation using active learning
+
+Utilize the capability and the corresponding subject LLM score to select or generate a new capability.
 
 ```bash
 python3 src/run_lbo.py
````


example_scripts/train_test_embedding_visualization.py

Lines changed: 4 additions & 2 deletions

```diff
@@ -1,5 +1,7 @@
-import logging # noqa: D100
-import os # noqa: D100
+"""Train and test capability embedding visualization script."""
+
+import logging
+import os
 
 import hydra
 from omegaconf import DictConfig
```


src/capability.py

Lines changed: 63 additions & 29 deletions

```diff
@@ -1,4 +1,6 @@
-import asyncio # noqa: D100
+"""Capability class and related utilities."""
+
+import asyncio
 import importlib
 import json
 import logging
@@ -130,12 +132,41 @@ class Capability:
         Loads the capability configuration from a JSON file.
     _load_capability_repr_class() -> None
         Loads the capability representation class from a Python file.
+    set_state() -> None
+        Sets the state of the capability.
+    get_state() -> CapabilityState
+        Gets the current state of the capability.
+    load_scores() -> None
+        Loads scores from JSON files in the specified directory.
+    get_repr_tasks() -> List[Dict[str, Any]]
+        Gets the representative tasks for the capability.
+    add_and_update_tasks() -> None
+        Adds and/or updates tasks for the capability.
     to_dict() -> Dict[str, Any]
         Converts the capability attributes to a dictionary.
+    get_attribute() -> Any
+        Gets the value of a specific attribute of the capability.
     to_json_str() -> str
         Converts the capability to a JSON string.
     __str__() -> str
         Returns a JSON string representation of the capability.
+    __repr__() -> str
+        Returns the name of the capability.
+    set_embedding() -> None
+        Sets the embedding of the capability based on embedding_name.
+    get_embedding() -> torch.Tensor
+        Gets the embedding for the capability.
+    solve_tasks() -> Tuple[Tuple[List[Dict[str, Any]],
+                           List[Dict[str, Any]]], Dict[str, Any]]
+        Solves the tasks using the given LLM.
+    get_tasks() -> List[Dict[str, Any]]
+        Gets the existing tasks for the capability.
+    _create_inspect_file() -> None
+        Creates the inspect file for the capability.
+    _evaluate_using_inspect() -> None
+        Evaluates the capability using the inspect framework.
+    evaluate() -> None
+        Evaluates the capability using the inspect framework.
     """
 
     def __init__(
```

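
A quick orientation for the expanded Methods list above, as a hypothetical usage sketch: the dictionary keys mirror the `capability_*` keys that `_load_capability_json()` reads later in this diff, but the exact `from_dict` schema, paths, and printed values are assumptions, not taken from the repository.

```python
# Hypothetical usage sketch of the Capability interface documented above.
# Dictionary keys and paths are illustrative assumptions.
from capability import Capability

cap = Capability.from_dict(
    {
        "capability_name": "word_problems",
        "capability_domain": "mathematics",
        "capability_instructions": "Solve the problem step by step.",
        "capability_data": [],
    },
    base_dir="capabilities/",
    score_dir_suffix=None,
)

print(cap.get_state())    # current CapabilityState
print(cap.to_json_str())  # JSON string of the default attributes
tasks = cap.get_tasks()   # existing tasks for this capability
```
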

```diff
@@ -186,6 +217,7 @@ def from_dict(
                 the capability attributes.
             base_dir (str): The base directory where the capability
                 directory will be created
+            score_dir_suffix (str | None): Optional suffix for the score directory.
 
         Returns
         -------
@@ -255,7 +287,6 @@ def _load_capability_json(self) -> None:
         self.domain = _cfg["capability_domain"]
         self.instructions = _cfg["capability_instructions"]
         self.area = _cfg.get("capability_area", None)
-        # TODO: Store data is stored in json or elsewhere?
         self._data: List[Dict[str, Any]] = _cfg["capability_data"]
         self._failed_data: List[Dict[str, Any]] = _cfg.get("capability_failed_data", [])
         # Check if the capability is a seed capability, use source_dataset as indicator
@@ -342,11 +373,6 @@ def load_scores(
                 Defaults to -1 (all tasks).
             seed (int): The random seed for reproducibility.
                 Defaults to the constant DEFAULT_RANDOM_SEED.
-
-        Returns
-        -------
-        Dict[str, Any]: A dictionary where the keys are model names and
-            the values are dictionaries containing the scores and metadata.
         """
         scores_dir = scores_dir if scores_dir else self.score_dir
         scores_dict: defaultdict[str, dict[str, Any]] = defaultdict(dict)
@@ -414,6 +440,7 @@ def add_and_update_tasks(
             failed_tasks (List[Dict[str, Any]]): A list of dictionaries
                 containing the tasks that failed to be solved.
                 Each task dict consists of id, problem, and answer keys.
+            seed (int): The random seed for reproducibility.
         """
         random.seed(seed)
 
@@ -549,7 +576,8 @@ def to_dict(self, attribute_names: List[str] | None = None) -> Dict[str, Any]:
         """
         Return a dictionary of the capability attributes.
 
-        Args:
+        Args
+        ----
             attribute_names (List[str] | None, optional): the list of attribute
                 names requested. If none, return a set of default attributes.
                 Defaults to None.
@@ -590,6 +618,12 @@ def to_json_str(self, attribute_names: List[str] | None = None) -> str:
         """
         Convert the capability to a JSON string.
 
+        Args
+        ----
+            attribute_names (List[str] | None, optional): the list of attribute
+                names requested. If none, return a set of default attributes.
+                Defaults to None.
+
         Returns
         -------
         str
@@ -639,10 +673,6 @@ def set_embedding(
         ----
             embedding_name (str): The name of the embedding model/algorithm.
             embedding_vector (torch.Tensor): The embedding vector to set.
-
-        Returns
-        -------
-        None
         """
         self.embedding_dict[embedding_name] = embedding_tensor
 
@@ -859,6 +889,14 @@ def _create_inspect_file(
         Implement pipeline to evaluate the capability using the inspect framework.
 
         This involves converting the METR format to inspect solvers and scorers.
+
+        Args
+        ----
+            path (str): The path to the directory where the inspect files
+                will be created.
+            judge_llm_name (str | None): The name of the judge LLM to use.
+            judge_llm_gen_args (Dict[str, Any] | None): Additional generation arguments
+                for the judge LLM.
         """
         # Create JSONL dataset and store it under the inspect path
         dataset = self.get_tasks()
@@ -894,8 +932,7 @@
             utils_file_contents = f.read()
         # Update judge LLM if provided
         # NOTE: Judge LLM does not support local models (hosted using vector inference)
-        # TODO: Add support for local models? Not required,
-        # since we will rarely use open source LLMs as judge LLMs
+        # TODO: Add support for local models?
         if judge_llm_name is not None:
             utils_file_contents = utils_file_contents.replace(
                 'INSPECT_JUDGE_LLM = "openai/gpt-4o-mini"',
@@ -911,7 +948,6 @@
             f.write(utils_file_contents)
 
         # 2. Construct inspect evals script file
-        # TODO: Do we need system prompt?
         instruction_template = self.capability_repr_class.get_instructions(
             {"problem": "{prompt}"}
         )
@@ -969,7 +1005,8 @@ def _evaluate_using_inspect(self, subject_llm: Model, **kwargs: Any) -> None:
         required evaluation files exist, temporarily stores logs locally, and transfers
         them to a GCP bucket after the evaluation is complete.
 
-        Args:
+        Args
+        ----
             subject_llm (Model): The LLM model to evaluate.
             **kwargs (Any): Additional args for running the evals.
 
```


```diff
@@ -1032,16 +1069,14 @@ def evaluate(
 
         Args
         ----
-        subject_llms : List[Model]
-            The list of LLMs to use for evaluation.
-        gen_args : List[Dict[Any, Any]]
-            The list of generation configurations corresponding to each LLM.
-        judge_llm : Model | None
-            The judge LLM to use for evaluation. If None, no judge LLM is used.
-        judge_llm_gen_args : Dict[str, Any] | None
-            The generation configuration for the judge LLM. If None, defaults are used.
-        **kwargs : Any
-            Additional arguments for the evaluation.
+            subject_llms (List[Model]): The list of LLMs to use for evaluation.
+            gen_args (List[Dict[Any, Any]]): The list of generation configurations
+                corresponding to each LLM.
+            judge_llm (Model | None): The judge LLM to use for evaluation. If None,
+                no judge LLM is used.
+            judge_llm_gen_args (Dict[str, Any] | None): The generation configuration
+                for the judge LLM. If None, defaults are used.
+            **kwargs (Any): Additional arguments for the evaluation.
         """
         assert len(subject_llms) == len(gen_args), (
             "Each subject LLM must have a corresponding generation config."
```

10511086
inspect_path = os.path.join(constants.BASE_INSPECT_EVALS_DIR, self.name)
10521087
if os.path.exists(inspect_path):
10531088
# Recreating the inspect file to avoid an unknown path error
1054-
# TODO: Resolve the unknown path error?
10551089
# Remove existing inspect path to avoid conflicts
10561090
shutil.rmtree(inspect_path)
10571091
os.makedirs(inspect_path)
@@ -1074,7 +1108,6 @@ def evaluate(
10741108
cwd = os.getcwd()
10751109
os.chdir(constants.BASE_INSPECT_EVALS_DIR)
10761110
sys.path.append(constants.BASE_INSPECT_EVALS_DIR)
1077-
# TODO: Run asynchronosly
10781111
for model_idx, model in enumerate(subject_llms):
10791112
try:
10801113
self._evaluate_using_inspect(
@@ -1112,7 +1145,8 @@ def _import_from_path(module_name: str, file_path: str) -> Any:
11121145
11131146
This is a helper function for loading the capability.py file as a module.
11141147
1115-
Args:
1148+
Args
1149+
----
11161150
module_name (str): The name to assign to the imported module.
11171151
file_path (str): The file path to the module to be imported.
11181152

src/create_seed_capabilities.py

Lines changed: 13 additions & 10 deletions

```diff
@@ -1,12 +1,14 @@
-import json # noqa: D100
+"""Create seed capabilities for the mathematics and GSM8K datasets."""
+
+import json
 import logging
 import os
 import random
 import shutil
 from collections import defaultdict
 from typing import Any, Dict, List
 
-import hydra # noqa: D100
+import hydra
 from omegaconf import DictConfig
 
 from capability import CapabilitySeedDataset
@@ -33,7 +35,8 @@ def populate_seed_capability_dir(
 
     Create a JSON configuration and a Python script.
 
-    Args:
+    Args
+    ----
         base_dir (str): The base directory where the capability directory
             will be created.
         capability_name (str): The name of the capability.
@@ -45,10 +48,7 @@ def populate_seed_capability_dir(
         capability_instructions (str): Instructions for the capability.
         capability_score_func (str): The scoring function for the capability.
         source_dataset (str): The name of the source dataset.
-
-    Returns
-    -------
-    None
+        capability_subject (str | None): The subject of the capability.
     """
     # Create capability dir
     capability_dir = os.path.join(base_dir, capability_name)
@@ -114,7 +114,8 @@ def remove_boxed(s: str) -> str:
     2. If the string starts with "\\boxed{" and ends with "}", it removes these
        enclosing characters.
 
-    Args:
+    Args
+    ----
         s (str): The input string containing the LaTeX boxed notation.
 
     Returns
@@ -149,7 +150,8 @@ def last_boxed_only_string(string: str) -> str | None:
        the last occurrence of these box commands.
     If no such boxed substring is found, it returns None.
 
-    Args:
+    Args
+    ----
         string (str): The input string to search for boxed substrings.
 
     Returns
```

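
The two helpers above follow the widely used MATH-dataset conventions for `\boxed{}` answers. A sketch consistent with the documented behavior (not necessarily the repository's exact code):

```python
# Sketch of the boxed-answer helpers described in the docstrings above;
# details may differ from the repository's implementation.
def remove_boxed(s: str) -> str:
    """Strip an enclosing \\boxed{...} (or "\\boxed ") from a string."""
    if s.startswith("\\boxed "):
        return s[len("\\boxed "):]
    left = "\\boxed{"
    assert s.startswith(left) and s.endswith("}"), f"unexpected format: {s}"
    return s[len(left):-1]


def last_boxed_only_string(string: str) -> str | None:
    """Return the last \\boxed{...} or \\fbox{...} substring, else None."""
    idx = string.rfind("\\boxed")
    if idx < 0:
        idx = string.rfind("\\fbox")
        if idx < 0:
            return None
    # Scan forward to the brace that closes the box command.
    depth, right_idx = 0, None
    for i in range(idx, len(string)):
        if string[i] == "{":
            depth += 1
        elif string[i] == "}":
            depth -= 1
            if depth == 0:
                right_idx = i
                break
    return None if right_idx is None else string[idx : right_idx + 1]
```
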

```diff
@@ -185,7 +187,8 @@ def main(cfg: DictConfig) -> None:
     """
     Create seed capabilities based on the provided configuration.
 
-    Args:
+    Args
+    ----
        cfg (DictConfig): Configuration object containing capability settings.
 
     The function processes capabilities from the configuration and
```

