
Commit e13002a

Merge pull request #9 from VectorInstitute/develop
Implement code skeleton for the entire new capability generation pipeline
2 parents: 59274b6 + 8614615

File tree: 12 files changed, +581 −81 lines

pyproject.toml

Lines changed: 2 additions & 0 deletions

@@ -125,6 +125,8 @@ ignore = [
 [tool.ruff.lint.per-file-ignores]
 "__init__.py" = ["E402", "F401", "F403", "F811"]
 "tests/src/seed_capabilities/math/math_competition_algebra/capability.py" = ["D100", "D101", "D102"]
+"src/run.py" = ["ERA001"]
+"src/lbo.py" = ["ERA001"]
 
 [tool.ruff.lint.pep8-naming]
 ignore-names = ["X*", "setUp"]

src/capability.py

Lines changed: 31 additions & 3 deletions

@@ -3,7 +3,7 @@
 import os
 import sys
 from collections import defaultdict
-from typing import Any, Dict
+from typing import Any, Dict, List
 
 from src.model import Model
 from src.utils.capability_utils import parse_python_class_str, read_score_inspect_json
@@ -252,10 +252,38 @@ def encode(self, encoder_model: Any) -> None:
         self.encoding = None
         raise NotImplementedError
 
-    def evaluate_using_inspect(self, model: Model) -> None:  # noqa: D102
-        # evaluate the capability using inspect-evals
+    def _create_inspect_file(self) -> None:
+        """
+        Implement pipeline to evaluate the capability using the inspect framework.
+
+        This involves converting the METR format to inspect solvers and scorers.
+        """
+        raise NotImplementedError
+
+    def _evaluate_using_inspect(self, subject_llm: Model) -> None:  # noqa: D102
+        """
+        Evaluate subject LLM on the capability using the inspect framework.
+
+        Args
+        ----
+        subject_llm : Model
+            The LLM to use for evaluation.
+        """
         raise NotImplementedError
 
+    def evaluate(self, subject_llms: List[Model]) -> None:
+        """
+        Evaluate the provided subject LLMs on the capability.
+
+        Args
+        ----
+        subject_llms : List[Model]
+            The list of LLMs to use for evaluation.
+        """
+        # TODO: Run asynchronously
+        for model in subject_llms:
+            self._evaluate_using_inspect(model)
+
 
 def _import_from_path(module_name: str, file_path: str) -> Any:
     """

src/cfg/run_cfg.yaml

Lines changed: 12 additions & 2 deletions

@@ -1,10 +1,10 @@
-generator_model:
+scientist_llm:
   name: gpt-4o-mini
   gen_cfg:
     temperature: 0.7
     max_tokens: 64
 
-candidate_model:
+subject_llm:
   name: Meta-Llama-3.1-70B-Instruct
 
 prompt_cfg:
@@ -17,6 +17,16 @@ capabilities_cfg:
   num_seed_capabilities: -1
   num_gen_capabilities: 4
   num_gen_capabilities_per_run: 2
+  num_gen_tasks_per_capability: 2
+
+lbo_cfg:
+  # Number of capabilities to generate using LBO
+  num_lbo_runs: 1
+  # Type of LBO pipeline to use
+  pipeline_id: "nearest_neighbor" # "nearest_neighbor" or "discover_new"
+  # Train args for 'nearest_neighbor' pipeline
+  train_frac: 0.5
+  min_train_size: 10
 
 exp_cfg:
   # Set this flag to true to run test experiments during development
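
The new `lbo_cfg` block adds `train_frac` and `min_train_size` for the "nearest_neighbor" pipeline, but the diff does not show how they are consumed. A minimal sketch, assuming they drive a simple train/evaluation split over generated capabilities; the `split_capabilities` helper is hypothetical, not the repository's implementation.

```python
# Illustrative sketch only: one plausible way the nearest_neighbor pipeline
# could consume train_frac and min_train_size from lbo_cfg.
import random
from typing import Any, Dict, List, Tuple


def split_capabilities(
    capabilities: List[Any], lbo_cfg: Dict[str, Any], seed: int = 42
) -> Tuple[List[Any], List[Any]]:
    """Split capabilities into LBO train/eval pools, honouring min_train_size."""
    shuffled = capabilities[:]
    random.Random(seed).shuffle(shuffled)
    n_train = max(
        int(len(shuffled) * lbo_cfg["train_frac"]),
        min(lbo_cfg["min_train_size"], len(shuffled)),
    )
    return shuffled[:n_train], shuffled[n_train:]


# With the values from run_cfg.yaml (train_frac: 0.5, min_train_size: 10),
# 16 capabilities would yield a 10/6 split rather than 8/8.
train_pool, eval_pool = split_capabilities(
    list(range(16)), {"train_frac": 0.5, "min_train_size": 10}
)
```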

src/generate_capabilities.py

Lines changed: 29 additions & 30 deletions

@@ -18,7 +18,7 @@
 def _sample_seed_capabilities(
     seed_capability_dir: str,
     num_seed_capabilities: int = -1,
-    include_capabilities: List[str] | None = None,
+    include_capability_names: List[str] | None = None,
     random_seed: int = 42,
 ) -> List[Capability]:
     """
@@ -31,7 +31,8 @@ def _sample_seed_capabilities(
     ----
     seed_capability_dir (str): The directory containing the seed capabilities.
     num_seed_capabilities (int): The number of seed capabilities to sample.
-    include_capabilities (List[str] | None): A list of capability names to include.
+    include_capability_names (List[str] | None): A list of
+        capability names to include.
     random_seed (int): The seed for the random number generator.
 
     Returns
@@ -46,21 +47,21 @@ def _sample_seed_capabilities(
     # Select all capabilities if num_seed_capabilities is -1
     if num_seed_capabilities == -1:
         num_seed_capabilities = len(all_seed_capability_paths)
-        include_capabilities = None
+        include_capability_names = None
 
     # Force include some capabilities
-    if include_capabilities is not None:
-        assert num_seed_capabilities >= len(include_capabilities), (
+    if include_capability_names is not None:
+        assert num_seed_capabilities >= len(include_capability_names), (
             "Number of seed capabilities is less than the number of capabilities to include."
         )
-        for capability_name in include_capabilities:
+        for capability_name in include_capability_names:
             assert os.path.exists(os.path.join(seed_capability_dir, capability_name)), (
                 f"{capability_name} does not exist in {seed_capability_dir}."
             )
             capability = Capability(os.path.join(seed_capability_dir, capability_name))
             sampled_seed_capabilities.append(capability)
             all_seed_capability_paths.remove(capability_name)
-        num_seed_capabilities -= len(include_capabilities)
+        num_seed_capabilities -= len(include_capability_names)
 
     # TODO: Enhance the selection criterion
     for capability_path in random.sample(
@@ -121,10 +122,10 @@ def generate_capabilities_using_llm(
     sys_prompt: str,
     user_prompt: str,
     num_seed_capabilities: int,
-    prev_capabilities: List[str],
+    prev_capabilities: List[Capability],
     scientist_llm_gen_cfg: Dict[str, Any],
     base_capability_dir: str,
-    include_seed_capabilities: Optional[List[str]] = None,
+    include_seed_capability_names: Optional[List[str]] = None,
     **kwargs: Any,
 ) -> Dict[str, Any]:
     """
@@ -142,25 +143,27 @@ generate_capabilities_using_llm(
     sys_prompt (str): The system prompt.
     user_prompt (str): The user prompt.
     num_seed_capabilities (int): The number of seed capabilities to use.
-    prev_capabilities (List[str]): The list of previously
-        generated capability names.
+    prev_capabilities (List[Capability]): The list of previously
+        generated capabilities.
     scientist_llm_gen_cfg (Dict[str, Any]): The generation configuration
         for the scientist LLM.
     base_capability_dir (str): The base directory to store
         the generated capabilities for the specified domain.
-    include_seed_capabilities (List[str] | None): A list of seed capability
+    include_seed_capability_names (List[str] | None): A list of seed capability
         names to include in the generation process.
+    **kwargs (Any): Additional keyword arguments.
 
     Returns
     -------
-    List[str]: The generated capability names.
+    Dict[str, Any]: A dictionary containing the generated capabilities
+        and metadata about the generation process.
     """
     # Select seed capabilities
     seed_capability_dir = os.path.join(BASE_ARTIFACTS_DIR, "seed_capabilities", domain)
     seed_capabilities = _sample_seed_capabilities(
         seed_capability_dir=seed_capability_dir,
         num_seed_capabilities=num_seed_capabilities,
-        include_capabilities=include_seed_capabilities,
+        include_capability_names=include_seed_capability_names,
     )
     # Get capability JSON strings (without scores)
     seed_capabilities_repr = [
@@ -170,7 +173,7 @@ def generate_capabilities_using_llm(
     # LLM input
     user_prompt = user_prompt.format(
         seed_capabilities="\n".join(seed_capabilities_repr),
-        prev_capabilities="\n".join(prev_capabilities),
+        prev_capabilities="\n".join([elm.name for elm in prev_capabilities]),
         domain=domain,
         num_gen_capabilities=num_capabilities,
     )
@@ -193,10 +196,9 @@
         Capability.from_dict(capability_dict=capability, base_dir=base_capability_dir)
         for capability in gen_capabilities
     ]
-    gen_capabilities_names = [elm.name for elm in gen_capabilities]
 
     return {
-        "capabilities": gen_capabilities_names,
+        "capabilities": gen_capabilities,
         "metadata": {
             "model": scientist_llm.get_model_name(),
             "thought": parsed_response["thought"],
@@ -206,20 +208,20 @@
 
 
 def filter_capabilities(
-    capabilities: List[str],
-) -> List[str]:
+    capabilities: List[Capability],
+) -> List[Capability]:
     """
     Filter capabilities based on multiple criterion.
 
     Remove repeated, irrelevant, and ill-formed capabilities.
 
     Args
     ----
-    capabilities (List[str]): The list of capabilities.
+    capabilities (List[Capability]): The list of capabilities.
 
     Returns
     -------
-    List[str]: The filtered capability names.
+    List[Capability]: The list of remaining capabilities.
     """
     # TODO: Implement capability filtering
     return capabilities
@@ -232,9 +234,9 @@ def generate_capabilities(
     scientist_llm: Model,
     num_seed_capabilities: int,
     scientist_llm_gen_cfg: Dict[str, Any],
-    include_seed_capabilities: Optional[List[str]] = None,
+    include_seed_capability_names: Optional[List[str]] = None,
     **kwargs: Any,
-) -> List[str]:
+) -> List[Capability]:
     """
     Generate initial capabilities for the specified domain.
 
@@ -247,12 +249,12 @@ def generate_capabilities(
     num_seed_capabilities (int): The number of seed capabilities to use.
     scientist_llm_gen_cfg (Dict[str, Any]): The generation configuration
         for the scientist LLM.
-    include_seed_capabilities (List[str] | None): A list of seed capability
+    include_seed_capability_names (List[str] | None): A list of seed capability
         names to include in the generation process.
 
     Returns
     -------
-    List[str]: The generated capability names.
+    List[Capability]: The generated capabilities.
     """
     num_runs = int(np.ceil(num_capabilities / num_capabilities_per_run))
     gen_capabilities = []
@@ -268,10 +270,7 @@
     base_capability_dir = os.path.join(BASE_ARTIFACTS_DIR, "capabilities", domain)
 
     # Fetch previously generated capabilities, if any
-    prev_capabilities = [
-        elm.name
-        for elm in _get_previous_capabilities(capability_dir=base_capability_dir)
-    ]
+    prev_capabilities = _get_previous_capabilities(capability_dir=base_capability_dir)
 
     for run_id in range(num_runs):
         print("Run ID:", run_id)
@@ -286,7 +285,7 @@
             prev_capabilities=prev_capabilities,
             scientist_llm_gen_cfg=scientist_llm_gen_cfg,
             base_capability_dir=base_capability_dir,
-            include_seed_capabilities=include_seed_capabilities,
+            include_seed_capability_names=include_seed_capability_names,
             **kwargs,
         )
         gen_capabilities.extend(response["capabilities"])
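
`filter_capabilities` now passes `Capability` objects through unchanged and its body is still a TODO. One possible shape for the eventual filter, shown only as a hedged sketch: the name-level deduplication criterion is an assumption, since the commit leaves the real filtering logic unspecified.

```python
# Hedged sketch of the "TODO: Implement capability filtering" stub above.
# The only criterion applied here, name-level deduplication, is an assumption.
from typing import Any, List


def filter_capabilities(capabilities: List[Any]) -> List[Any]:
    """Drop capabilities whose names repeat, keeping the first occurrence."""
    seen = set()
    filtered = []
    for capability in capabilities:
        if capability.name in seen:
            continue
        seen.add(capability.name)
        filtered.append(capability)
    return filtered
```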

src/generate_tasks.py

Lines changed: 58 additions & 0 deletions

@@ -0,0 +1,58 @@
+from typing import Any, Dict  # noqa: D100
+
+from capability import Capability
+from model import Model
+
+
+def generate_tasks_using_llm(
+    capability: Capability,
+    scientist_llm: Model,
+    sys_prompt: str,
+    user_prompt: str,
+    num_tasks: int,
+    scientist_llm_gen_cfg: Dict[str, Any],
+) -> None:
+    """
+    Generate `num_tasks` tasks for the given capability.
+
+    Generate tasks for the given capability
+    using the scientist LLM model based on the following approach:
+    <Approach>
+
+    Args
+    ----
+    capability (Capability): The capability to generate tasks for.
+    scientist_llm (Model): The scientist LLM model.
+    sys_prompt (str): The system prompt for generating tasks.
+    user_prompt (str): The user prompt for generating tasks.
+    num_tasks (int): The number of tasks to generate.
+    scientist_llm_gen_cfg (Dict[str, Any]): The generation configuration
+        for the scientist LLM.
+    """
+    # TODO: Implement the function with the following components
+    # # Approach 1
+    # 1. First generate task questions. This can be done in two ways:
+    #    a. Single run to generate all `num_tasks` (Nt) questions
+    #       - input tokens: Pt
+    #       - output tokens: Nt * Qt, where Qt is the mean # tokens in a question
+    #    b. Multiple runs to generate `num_tasks` (Nt)
+    #       questions in batches of `num_tasks_per_run` (Ntr)
+    #       - input tokens: Pt * Ntr
+    #       - output tokens: Nt * Qt
+    # 2. Filter out similar/ill-formatted questions
+    # 3. Then obtain task answers by:
+    #    a. prompting the scientist LLM to solve these selected questions
+    #    b. using a group of (less capable) models to solve
+    #       these questions and then selecting the majority answer
+    #    c. using a scoring function
+    #
+    # # Approach 2
+    # 1. Generate task questions and answers together in a single run.
+    #    Again, this can be done in two ways described above.
+    # 2. Filter out similar/ill-formatted question/answer pairs
+    # 3. Verify each pair by:
+    #    a. prompting the scientist LLM to function as a judge
+    #    b. using a group of (less capable) models to judge and
+    #       then selecting the majority answer
+
+    raise NotImplementedError
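
The TODO block above sketches two alternatives; Approach 2 generates question/answer pairs in one pass, filters them, and then verifies each pair. A rough sketch of that control flow under stated assumptions: the `generate` method on the scientist model and the `_parse_qa_pairs` / `_judge_pair` helpers are illustrative stand-ins, not the repository's API.

```python
# Hedged sketch of "Approach 2" from the TODO above: generate question/answer
# pairs in one run, filter ill-formatted pairs, then verify each pair with the
# scientist LLM acting as a judge. The Model.generate(...) call and the helper
# functions here are assumptions made for illustration.
from typing import Any, Dict, List, Tuple


def _parse_qa_pairs(response: str) -> List[Tuple[str, str]]:
    """Parse 'Q: ... / A: ...' blocks from the raw response (toy parser)."""
    pairs = []
    for block in response.split("\n\n"):
        lines = [ln.strip() for ln in block.splitlines()]
        if len(lines) >= 2 and lines[0].startswith("Q:") and lines[1].startswith("A:"):
            pairs.append((lines[0][2:].strip(), lines[1][2:].strip()))
    return pairs


def _judge_pair(scientist_llm: Any, question: str, answer: str) -> bool:
    """Placeholder judge step; a real version would prompt the scientist LLM."""
    return bool(question and answer)


def generate_and_verify_tasks(
    scientist_llm: Any,
    sys_prompt: str,
    user_prompt: str,
    num_tasks: int,
    gen_cfg: Dict[str, Any],
) -> List[Tuple[str, str]]:
    # 1. Generate questions and answers together in a single run.
    response = scientist_llm.generate(
        sys_prompt=sys_prompt,
        user_prompt=user_prompt.format(num_tasks=num_tasks),
        **gen_cfg,
    )
    # 2. Filter out ill-formatted question/answer pairs.
    pairs = [(q, a) for q, a in _parse_qa_pairs(response) if q and a]
    # 3. Verify each pair (here: scientist LLM as judge).
    return [(q, a) for q, a in pairs if _judge_pair(scientist_llm, q, a)]
```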

src/get_seed_capability_results.py

Lines changed: 10 additions & 10 deletions

@@ -105,7 +105,7 @@ def main(cfg: DictConfig) -> None:
     3. Reads the capability configuration from the "capability.json" file.
     4. Determines the dataset name and capability details
        from the capability configuration.
-    5. Iterates over results for all candidate models
+    5. Iterates over results for all subject models
        in the seed datasets log directory.
     6. For each log file that matches the dataset name,
        processes the log file based on the dataset type:
@@ -137,23 +137,23 @@ def main(cfg: DictConfig) -> None:
     if dataset_name == "math":
         subject = capability_json["capability_subject"]
 
-    # Iterate over results for all candidate models
-    for candidate_model_dir in os.listdir(seed_datasets_log_dir):
-        candidate_model_log_path = os.path.join(
-            seed_datasets_log_dir, candidate_model_dir
+    # Iterate over results for all subject models
+    for subject_model_dir in os.listdir(seed_datasets_log_dir):
+        subject_model_log_path = os.path.join(
+            seed_datasets_log_dir, subject_model_dir
         )
-        for log_file in os.listdir(candidate_model_log_path):
+        for log_file in os.listdir(subject_model_log_path):
             if dataset_name not in log_file:
                 continue
 
-            out_dir = os.path.join(seed_capability_result_dir, candidate_model_dir)
+            out_dir = os.path.join(seed_capability_result_dir, subject_model_dir)
             out_dir = os.path.join(out_dir, domain)
             os.makedirs(out_dir, exist_ok=True)
 
             # For math dataset, extract math capability logs
             if "math" in log_file:
                 extract_math_capability_logs(
-                    log_file=os.path.join(candidate_model_log_path, log_file),
+                    log_file=os.path.join(subject_model_log_path, log_file),
                     capability_name=capability_name,
                     subject=subject,
                     out_dir=out_dir,
@@ -163,15 +163,15 @@ def main(cfg: DictConfig) -> None:
             elif "gsm8k" in log_file:
                 # No changes to log file, just copy it to output directory
                 shutil.copyfile(
-                    src=os.path.join(candidate_model_log_path, log_file),
+                    src=os.path.join(subject_model_log_path, log_file),
                     dst=os.path.join(
                         out_dir,
                         f"{capability_name}.json",
                     ),
                 )
 
             print(
-                f"Extracted {candidate_model_dir} result for {capability_name} capability."
+                f"Extracted {subject_model_dir} result for {capability_name} capability."
             )
 
 