
Commit 160ca1a

Review doc-strings and update instructions
2 parents: 04bad1f + 518eb9c

16 files changed: +210, -216 lines

README.md

Lines changed: 16 additions & 8 deletions

````diff
@@ -1,6 +1,8 @@
-## 🧑🏿‍💻 Developing
+# ACE
 
-### Installing dependencies
+ACE (Active learning for Capability Evaluation) is a novel framework that uses active learning and powerful language models to automate fine-grained evaluation of foundation models. It enables scalable, adaptive testing that uncovers strengths and weaknesses beyond static benchmarks.
+
+## Installing dependencies
 
 The development environment can be set up using
 [poetry](https://python-poetry.org/docs/#installation). Hence, make sure it is
@@ -18,17 +20,17 @@ run:
 python3 -m poetry install --with test
 ```
 
-### [Optional] Google Cloud Authentication
+#### [Optional] Google Cloud Authentication
 
 The capability evaluation logs (evaluated using [Inspect](https://inspect.aisi.org.uk/)) are stored in a GCP bucket. Use the following command to log in using your GCP account:
 
 ```bash
 gcloud auth application-default login
 ```
 
-### Run pipeline
+## Run pipeline
 
-#### Configuration
+### Configuration
 
 1. Set environment variables:
 
@@ -48,19 +50,25 @@ gcloud auth application-default login
 
 2. Modify `src/cfg/run_cfg.yaml`, if required.
 
-#### Capability Generation using the scientist LLM
+### Capability Generation using the scientist LLM
+
+Generates capability names and descriptions in the first step. In the second step, for each capability, it generates tasks, solves them, and verifies the solutions.
 
 ```bash
 python3 src/run_capability_generation.py
 ```
 
-#### Evaluation of subject LLM on generated capabilities
+### Evaluation of subject LLM on generated capabilities
+
+Evaluates the subject LLM on the generated capabilities and calculates a score for each.
 
 ```bash
 python3 src/run_evaluation.py
 ```
 
-#### Run active learning pipeline
+### Capability selection/generation using active learning
+
+Utilize the capability and the corresponding subject LLM score to select or generate a new capability.
 
 ```bash
 python3 src/run_lbo.py
````


example_scripts/train_test_embedding_visualization.py

Lines changed: 4 additions & 2 deletions

```diff
@@ -1,5 +1,7 @@
-import logging # noqa: D100
-import os # noqa: D100
+"""Train and test capability embedding visualization script."""
+
+import logging
+import os
 
 import hydra
 from omegaconf import DictConfig
```


src/capability.py

Lines changed: 63 additions & 29 deletions

```diff
@@ -1,4 +1,6 @@
-import asyncio # noqa: D100
+"""Capability class and related utilities."""
+
+import asyncio
 import importlib
 import json
 import logging
@@ -130,12 +132,41 @@ class Capability:
         Loads the capability configuration from a JSON file.
     _load_capability_repr_class() -> None
         Loads the capability representation class from a Python file.
+    set_state() -> None
+        Sets the state of the capability.
+    get_state() -> CapabilityState
+        Gets the current state of the capability.
+    load_scores() -> None
+        Loads scores from JSON files in the specified directory.
+    get_repr_tasks() -> List[Dict[str, Any]]
+        Gets the representative tasks for the capability.
+    add_and_update_tasks() -> None
+        Adds and/or updates tasks for the capability.
     to_dict() -> Dict[str, Any]
         Converts the capability attributes to a dictionary.
+    get_attribute() -> Any
+        Gets the value of a specific attribute of the capability.
     to_json_str() -> str
         Converts the capability to a JSON string.
     __str__() -> str
         Returns a JSON string representation of the capability.
+    __repr__() -> str
+        Returns the name of the capability.
+    set_embedding() -> None
+        Sets the embedding of the capability based on embedding_name.
+    get_embedding() -> torch.Tensor
+        Gets the embedding for the capability.
+    solve_tasks() -> Tuple[Tuple[List[Dict[str, Any]],
+                           List[Dict[str, Any]]], Dict[str, Any]]
+        Solves the tasks using the given LLM.
+    get_tasks() -> List[Dict[str, Any]]
+        Gets the existing tasks for the capability.
+    _create_inspect_file() -> None
+        Creates the inspect file for the capability.
+    _evaluate_using_inspect() -> None
+        Evaluates the capability using the inspect framework.
+    evaluate() -> None
+        Evaluates the capability using the inspect framework.
     """
 
     def __init__(
```

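
A quick orientation for the expanded Methods list above, as a hypothetical usage sketch: the dictionary keys mirror the `capability_*` keys that `_load_capability_json()` reads later in this diff, but the exact `from_dict` schema, paths, and printed values are assumptions, not taken from the repository.

```python
# Hypothetical usage sketch of the Capability interface documented above.
# Dictionary keys and paths are illustrative assumptions.
from capability import Capability

cap = Capability.from_dict(
    {
        "capability_name": "word_problems",
        "capability_domain": "mathematics",
        "capability_instructions": "Solve the problem step by step.",
        "capability_data": [],
    },
    base_dir="capabilities/",
    score_dir_suffix=None,
)

print(cap.get_state())    # current CapabilityState
print(cap.to_json_str())  # JSON string of the default attributes
tasks = cap.get_tasks()   # existing tasks for this capability
```
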

```diff
@@ -186,6 +217,7 @@ def from_dict(
                 the capability attributes.
             base_dir (str): The base directory where the capability
                 directory will be created
+            score_dir_suffix (str | None): Optional suffix for the score directory.
 
         Returns
         -------
@@ -255,7 +287,6 @@ def _load_capability_json(self) -> None:
         self.domain = _cfg["capability_domain"]
         self.instructions = _cfg["capability_instructions"]
         self.area = _cfg.get("capability_area", None)
-        # TODO: Store data is stored in json or elsewhere?
         self._data: List[Dict[str, Any]] = _cfg["capability_data"]
         self._failed_data: List[Dict[str, Any]] = _cfg.get("capability_failed_data", [])
         # Check if the capability is a seed capability, use source_dataset as indicator
@@ -342,11 +373,6 @@ def load_scores(
                 Defaults to -1 (all tasks).
             seed (int): The random seed for reproducibility.
                 Defaults to the constant DEFAULT_RANDOM_SEED.
-
-        Returns
-        -------
-        Dict[str, Any]: A dictionary where the keys are model names and
-            the values are dictionaries containing the scores and metadata.
         """
         scores_dir = scores_dir if scores_dir else self.score_dir
         scores_dict: defaultdict[str, dict[str, Any]] = defaultdict(dict)
@@ -414,6 +440,7 @@ def add_and_update_tasks(
             failed_tasks (List[Dict[str, Any]]): A list of dictionaries
                 containing the tasks that failed to be solved.
                 Each task dict consists of id, problem, and answer keys.
+            seed (int): The random seed for reproducibility.
         """
         random.seed(seed)
 
@@ -549,7 +576,8 @@ def to_dict(self, attribute_names: List[str] | None = None) -> Dict[str, Any]:
         """
         Return a dictionary of the capability attributes.
 
-        Args:
+        Args
+        ----
             attribute_names (List[str] | None, optional): the list of attribute
                 names requested. If none, return a set of default attributes.
                 Defaults to None.
@@ -590,6 +618,12 @@ def to_json_str(self, attribute_names: List[str] | None = None) -> str:
         """
         Convert the capability to a JSON string.
 
+        Args
+        ----
+            attribute_names (List[str] | None, optional): the list of attribute
+                names requested. If none, return a set of default attributes.
+                Defaults to None.
+
         Returns
         -------
         str
@@ -639,10 +673,6 @@ def set_embedding(
         ----
             embedding_name (str): The name of the embedding model/algorithm.
             embedding_vector (torch.Tensor): The embedding vector to set.
-
-        Returns
-        -------
-        None
         """
         self.embedding_dict[embedding_name] = embedding_tensor
 
@@ -859,6 +889,14 @@ def _create_inspect_file(
         Implement pipeline to evaluate the capability using the inspect framework.
 
         This involves converting the METR format to inspect solvers and scorers.
+
+        Args
+        ----
+            path (str): The path to the directory where the inspect files
+                will be created.
+            judge_llm_name (str | None): The name of the judge LLM to use.
+            judge_llm_gen_args (Dict[str, Any] | None): Additional generation arguments
+                for the judge LLM.
         """
         # Create JSONL dataset and store it under the inspect path
         dataset = self.get_tasks()
@@ -894,8 +932,7 @@
             utils_file_contents = f.read()
         # Update judge LLM if provided
         # NOTE: Judge LLM does not support local models (hosted using vector inference)
-        # TODO: Add support for local models? Not required,
-        # since we will rarely use open source LLMs as judge LLMs
+        # TODO: Add support for local models?
         if judge_llm_name is not None:
             utils_file_contents = utils_file_contents.replace(
                 'INSPECT_JUDGE_LLM = "openai/gpt-4o-mini"',
@@ -911,7 +948,6 @@
             f.write(utils_file_contents)
 
         # 2. Construct inspect evals script file
-        # TODO: Do we need system prompt?
         instruction_template = self.capability_repr_class.get_instructions(
             {"problem": "{prompt}"}
         )
@@ -969,7 +1005,8 @@ def _evaluate_using_inspect(self, subject_llm: Model, **kwargs: Any) -> None:
         required evaluation files exist, temporarily stores logs locally, and transfers
         them to a GCP bucket after the evaluation is complete.
 
-        Args:
+        Args
+        ----
             subject_llm (Model): The LLM model to evaluate.
             **kwargs (Any): Additional args for running the evals.
 
```


```diff
@@ -1032,16 +1069,14 @@ def evaluate(
 
         Args
         ----
-        subject_llms : List[Model]
-            The list of LLMs to use for evaluation.
-        gen_args : List[Dict[Any, Any]]
-            The list of generation configurations corresponding to each LLM.
-        judge_llm : Model | None
-            The judge LLM to use for evaluation. If None, no judge LLM is used.
-        judge_llm_gen_args : Dict[str, Any] | None
-            The generation configuration for the judge LLM. If None, defaults are used.
-        **kwargs : Any
-            Additional arguments for the evaluation.
+            subject_llms (List[Model]): The list of LLMs to use for evaluation.
+            gen_args (List[Dict[Any, Any]]): The list of generation configurations
+                corresponding to each LLM.
+            judge_llm (Model | None): The judge LLM to use for evaluation. If None,
+                no judge LLM is used.
+            judge_llm_gen_args (Dict[str, Any] | None): The generation configuration
+                for the judge LLM. If None, defaults are used.
+            **kwargs (Any): Additional arguments for the evaluation.
         """
         assert len(subject_llms) == len(gen_args), (
             "Each subject LLM must have a corresponding generation config."
```

10511086
inspect_path = os.path.join(constants.BASE_INSPECT_EVALS_DIR, self.name)
10521087
if os.path.exists(inspect_path):
10531088
# Recreating the inspect file to avoid an unknown path error
1054-
# TODO: Resolve the unknown path error?
10551089
# Remove existing inspect path to avoid conflicts
10561090
shutil.rmtree(inspect_path)
10571091
os.makedirs(inspect_path)
@@ -1074,7 +1108,6 @@ def evaluate(
10741108
cwd = os.getcwd()
10751109
os.chdir(constants.BASE_INSPECT_EVALS_DIR)
10761110
sys.path.append(constants.BASE_INSPECT_EVALS_DIR)
1077-
# TODO: Run asynchronosly
10781111
for model_idx, model in enumerate(subject_llms):
10791112
try:
10801113
self._evaluate_using_inspect(
@@ -1112,7 +1145,8 @@ def _import_from_path(module_name: str, file_path: str) -> Any:
11121145
11131146
This is a helper function for loading the capability.py file as a module.
11141147
1115-
Args:
1148+
Args
1149+
----
11161150
module_name (str): The name to assign to the imported module.
11171151
file_path (str): The file path to the module to be imported.
11181152

src/create_seed_capabilities.py

Lines changed: 13 additions & 10 deletions

```diff
@@ -1,12 +1,14 @@
-import json # noqa: D100
+"""Create seed capabilities for the mathematics and GSM8K datasets."""
+
+import json
 import logging
 import os
 import random
 import shutil
 from collections import defaultdict
 from typing import Any, Dict, List
 
-import hydra # noqa: D100
+import hydra
 from omegaconf import DictConfig
 
 from capability import CapabilitySeedDataset
@@ -33,7 +35,8 @@ def populate_seed_capability_dir(
 
     Create a JSON configuration and a Python script.
 
-    Args:
+    Args
+    ----
         base_dir (str): The base directory where the capability directory
             will be created.
         capability_name (str): The name of the capability.
@@ -45,10 +48,7 @@ def populate_seed_capability_dir(
         capability_instructions (str): Instructions for the capability.
         capability_score_func (str): The scoring function for the capability.
         source_dataset (str): The name of the source dataset.
-
-    Returns
-    -------
-    None
+        capability_subject (str | None): The subject of the capability.
     """
     # Create capability dir
     capability_dir = os.path.join(base_dir, capability_name)
@@ -114,7 +114,8 @@ def remove_boxed(s: str) -> str:
     2. If the string starts with "\\boxed{" and ends with "}", it removes these
        enclosing characters.
 
-    Args:
+    Args
+    ----
         s (str): The input string containing the LaTeX boxed notation.
 
     Returns
@@ -149,7 +150,8 @@ def last_boxed_only_string(string: str) -> str | None:
        the last occurrence of these box commands.
     If no such boxed substring is found, it returns None.
 
-    Args:
+    Args
+    ----
         string (str): The input string to search for boxed substrings.
 
     Returns
```

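
The two helpers above follow the widely used MATH-dataset conventions for `\boxed{}` answers. A sketch consistent with the documented behavior (not necessarily the repository's exact code):

```python
# Sketch of the boxed-answer helpers described in the docstrings above;
# details may differ from the repository's implementation.
def remove_boxed(s: str) -> str:
    """Strip an enclosing \\boxed{...} (or "\\boxed ") from a string."""
    if s.startswith("\\boxed "):
        return s[len("\\boxed "):]
    left = "\\boxed{"
    assert s.startswith(left) and s.endswith("}"), f"unexpected format: {s}"
    return s[len(left):-1]


def last_boxed_only_string(string: str) -> str | None:
    """Return the last \\boxed{...} or \\fbox{...} substring, else None."""
    idx = string.rfind("\\boxed")
    if idx < 0:
        idx = string.rfind("\\fbox")
        if idx < 0:
            return None
    # Scan forward to the brace that closes the box command.
    depth, right_idx = 0, None
    for i in range(idx, len(string)):
        if string[i] == "{":
            depth += 1
        elif string[i] == "}":
            depth -= 1
            if depth == 0:
                right_idx = i
                break
    return None if right_idx is None else string[idx : right_idx + 1]
```
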

```diff
@@ -185,7 +187,8 @@ def main(cfg: DictConfig) -> None:
     """
     Create seed capabilities based on the provided configuration.
 
-    Args:
+    Args
+    ----
        cfg (DictConfig): Configuration object containing capability settings.
 
     The function processes capabilities from the configuration and
```

