Skip to content

Commit b393b97

Browse files
authored
fix gsm8k bug & add personal-finance seed config (#34)
* refactor visualization. * fixed relative path. * moved functions to utilities, fixed relative paths. * removed extra print in run_capability_generation.py. updated README. * exposing functions. * fixing imports. * added default value for keys. * simplify GitHub Actions and fix Codecov issues * ci: fix workflows - remove retry wrappers and fix poetry install * ci: fix poetry for GitHub Actions. * ci: make security audit non-blocking, test: fix pytest warnings and make coverage non-blocking * Remove redundant docs workflow files * Personal finance basics config added. Seed generation code updated. * add constants needed for personal finance.
1 parent 216bcdd commit b393b97

File tree

5 files changed

+87
-65
lines changed

5 files changed

+87
-65
lines changed
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
personal_finance_basic:
2+
name: personal_finance_basic
3+
description: This capability contains synthetic multi-step personal finance problems that cover budgeting, compounding, debt strategies, insurance trade-offs, expected-value analysis, and more, requiring detailed numerical reasoning and using advanced finance concepts.
4+
domain: personal_finance
5+
data_args:
6+
source: kohankhaki/personal_finance_question_answer_complex
7+
split: train
8+
num_repr_tasks: 3
9+
instructions: >-
10+
f"""Solve the following personal finance problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the final numerical answer to the problem.\n\nProblem: {problem}\n\nShow every calculation, explain any rounding, and put the final answer on its own line as 'ANSWER: $ANSWER'."""
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
defaults:
2+
- capability_cfgs:
3+
- personal_finance_basic

src/cfg/run_cfg.yaml

Lines changed: 28 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
scientist_llm:
2-
name: o4-mini
2+
name: o3-mini
33
provider: openai
44
generation_cfg:
55
capability_generation:
@@ -59,85 +59,51 @@ prompt_cfg:
5959
sys_msg: Complete the given task to the best of your ability.
6060

6161
capabilities_cfg:
62-
capabilities_dir: /fs01/projects/aieng/public/ace/artifacts
63-
results_dir: gs://ace-artifacts
64-
inspect_evals_dir: /fs01/projects/aieng/public/ace/inspect_evals/src/ace_evals
65-
domain: math
66-
# Method used to generate capabilities
62+
capabilities_dir: /h/fkohankh/automated_capability_evaluation_logs
63+
results_dir: /h/fkohankh/automated_capability_evaluation_logs/results
64+
inspect_evals_dir: /h/fkohankh/automated_capability_evaluation_logs/inspect_evals
65+
domain: personal_finance
6766
method: "hierarchical"
68-
# Number of seed capabilities to use for initial capability generation
69-
# Set to -1 to use all seed capabilities
7067
num_seed_capabilities: 1
71-
# Number of initial capabilities to generate using the scientist LLM
72-
num_gen_capabilities: 100
73-
# Buffer for capability generation
74-
num_gen_capabilities_buffer: 0.2
75-
# Number of capability areas to generate
76-
num_capability_areas: 10
77-
# Number of initial capabilities to generate per run
78-
num_gen_capabilities_per_run: 5
79-
# Number of tasks to generate for each capability
80-
num_gen_tasks_per_capability: 100
81-
# Buffer for task generation
82-
num_gen_tasks_buffer: 0.2
83-
# Set this flag to true to use representative tasks
84-
# as few shot examples for task generation
85-
task_gen_few_shot: true
86-
# Set this flag to true to use the specific version
87-
# of task generation prompt
68+
num_gen_capabilities: 20
69+
num_gen_capabilities_buffer: 0.0
70+
num_capability_areas: 5
71+
num_gen_capabilities_per_run: 1
72+
num_gen_tasks_per_capability: 1
73+
num_gen_tasks_buffer: 0.0
74+
task_gen_few_shot: false
8875
task_gen_prompt_version: "v1"
89-
# Number of tasks to evaluate for each capability
90-
# Set to -1 to evaluate all tasks
91-
num_eval_tasks_per_capability: -1
92-
# Number of retries for each run of capability generation
93-
capabilities_gen_retry_attempts: 3
76+
num_eval_tasks_per_capability: 2
77+
capabilities_gen_retry_attempts: 5
9478
tasks_gen_retry_attempts: 3
95-
# Concurrency for task solving and verification
96-
concurrency_task_solver: 50
97-
concurrency_task_verifier: 50
98-
concurrency_task_eval: 50
99-
# Inspect evals config
79+
concurrency_task_solver: 2
80+
concurrency_task_verifier: 2
81+
concurrency_task_eval: 2
10082
inspect_eval_log_level: "info"
10183

10284
lbo_cfg:
103-
# Number of capabilities to generate/select using LBO
104-
num_lbo_runs: 20
105-
# Type of LBO pipeline to use
106-
pipeline_id: "discover_new_lbo_knn" # "no_discovery", "discover_new_llm" or "discover_new_lbo_knn"
107-
# Train args for "no_discovery" pipeline
85+
num_lbo_runs: 2
86+
pipeline_id: "no_discovery"
10887
train_frac: 0.5
109-
num_initial_train: 10
110-
# Acquisition function that guides selecting the next query point.
111-
# "variance" and "expected_variance_reduction" is supported.
112-
acquisition_function: "expected_variance_reduction"
113-
# Value of k for "discover_new_lbo_knn" pipeline
114-
select_k: 5
115-
discover_new_llm_retry_attempts: 3
116-
discover_new_lbo_knn_retry_attempts: 5
117-
discover_new_retry_attempts: 3
88+
num_initial_train: 2
89+
acquisition_function: "variance"
11890

11991
embedding_cfg:
120-
# The embedding model name used to generate capability embeddings used for filtering.
121-
embedding_model: "text-embedding-3-small" # "text-embedding-3-small" or "text-embedding-3-large"
122-
embedding_size: 512
123-
# The cosine similarity threshold for filtering capabilities based on their embeddings.
124-
filtering_similarity_threshold: 0.85
92+
embedding_model: "text-embedding-3-small"
93+
embedding_size: 256
94+
filtering_similarity_threshold: 0.7
12595

12696
dimensionality_reduction_cfg:
127-
# dimensionality reduction method generates the low dimensional encodings.
128-
reduce_dimensionality_method: "pca" # "t-sne", "cut-embedding" or "pca".
97+
reduce_dimensionality_method: "pca"
12998
reduced_dimensionality_size: 2
130-
no_discovery_reduced_dimensionality_method: "t-sne"
99+
no_discovery_reduced_dimensionality_method: "pca"
131100
no_discovery_reduced_dimensionality_size: 2
132-
discover_new_reduced_dimensionality_method: "pca"
133-
discover_new_reduced_dimensionality_size: 2
134101

135102
exp_cfg:
136-
# Set this flag to true to run test experiments during development
137103
seed: 37
138104
trial_run: false
139-
exp_id:
105+
exp_id: "personal_finance_o3-mini_r1"
140106

141107
defaults:
142108
- _self_
143-
- capabilities: math
109+
- capabilities: personal_finance

src/create_seed_capabilities.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ def main(cfg: DictConfig) -> None:
285285
# Only keep problem and answer
286286
capability_repr_tasks = [
287287
{"problem": s["problem"], "answer": s["answer"]}
288-
for s in tasks[: dataset._cfg["data_args"]["num_repr_tasks"]]
288+
for s in gsm_tasks[: dataset._cfg["data_args"]["num_repr_tasks"]]
289289
]
290290

291291
populate_seed_capability_dir(
@@ -302,6 +302,47 @@ def main(cfg: DictConfig) -> None:
302302
logger.info(
303303
f"Created capability {capability_name} with {len(gsm_tasks)} tasks."
304304
)
305+
elif dataset.name == "personal_finance_basic":
306+
capability_name = "personal_finance_basic"
307+
308+
# Reformat raw data
309+
pf_tasks = []
310+
for task_id, task in enumerate(dataset._data):
311+
task["id"] = str(task_id + 1)
312+
task["problem"] = task.pop("problem")
313+
task["answer"] = task.pop("solution")
314+
task["solution"] = task.pop("reasoning")
315+
task.pop("area")
316+
task.pop("capability")
317+
pf_tasks.append(task)
318+
319+
# Prepare instructions
320+
capability_instructions = dataset.instructions.format(
321+
problem='{t["problem"]}'
322+
)
323+
324+
# Create representative tasks
325+
capability_repr_tasks = [
326+
{"problem": t["problem"], "answer": t["answer"]}
327+
for t in pf_tasks[: dataset._cfg["data_args"]["num_repr_tasks"]]
328+
]
329+
330+
populate_seed_capability_dir(
331+
base_dir=seed_capability_dir,
332+
capability_name=capability_name,
333+
capability_description=dataset.description,
334+
capability_domain=dataset.domain,
335+
capability_data=pf_tasks,
336+
capability_repr_tasks=capability_repr_tasks,
337+
capability_instructions=capability_instructions,
338+
capability_score_func=constants.PERSONAL_FINANCE_BASIC_SCORE_FUNC.strip(
339+
"\n"
340+
),
341+
source_dataset=dataset.name,
342+
)
343+
logger.info(
344+
f"Created capability {capability_name} with {len(pf_tasks)} tasks."
345+
)
305346

306347

307348
if __name__ == "__main__":

src/utils/constants.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
from enum import Enum
55

66

7-
BASE_ARTIFACTS_DIR = "/fs01/projects/aieng/public/ace/artifacts"
7+
BASE_ARTIFACTS_DIR = "/h/fkohankh/automated_capability_evaluation_logs"
88
GCP_BASE_ARTIFACTS_DIR = "gs://ace-artifacts"
9-
BASE_INSPECT_EVALS_DIR = "/fs01/projects/aieng/public/ace/inspect_evals/src/ace_evals"
9+
BASE_INSPECT_EVALS_DIR = "/h/fkohankh/automated_capability_evaluation_logs"
1010

1111
SEED_CAPABILITIES_SCORE_DIR = os.path.join(
1212
GCP_BASE_ARTIFACTS_DIR, "seed_capabilities_results"
@@ -24,10 +24,12 @@
2424
MATHEMATICS_SCORE_FUNC = f"""def score(t: dict, submission: str) -> float | None:\n{TAB_W_SPACES}{TAB_W_SPACES}from .utils import parse_submission, evaluate_with_llm_judge\n{TAB_W_SPACES}{TAB_W_SPACES}answer = parse_submission(submission)\n{TAB_W_SPACES}{TAB_W_SPACES}correct = evaluate_with_llm_judge(answer, t["answer"])\n{TAB_W_SPACES}{TAB_W_SPACES}return 1.0 if correct else 0.0"""
2525
# Score function is based on https://github.com/UKGovernmentBEIS/inspect_evals/blob/main/src/inspect_evals/mathematics/utils.py#L57
2626
GSM8K_SCORE_FUNC = f"""def score(t: dict, submission: str) -> float | None:\n{TAB_W_SPACES}{TAB_W_SPACES}return 1.0 if submission==t["answer"] else 0.0"""
27+
PERSONAL_FINANCE_BASIC_SCORE_FUNC = f"""def score(t: dict, submission: str) -> float | None:\n{TAB_W_SPACES}{TAB_W_SPACES}from .utils import parse_submission, evaluate_with_llm_judge\n{TAB_W_SPACES}{TAB_W_SPACES}answer = parse_submission(submission)\n{TAB_W_SPACES}{TAB_W_SPACES}correct = evaluate_with_llm_judge(answer, t["answer"])\n{TAB_W_SPACES}{TAB_W_SPACES}return 1.0 if correct else 0.0"""
2728

2829
DATASET_NAME_MAP = {
2930
"mathematics": "competition_math",
3031
"gsm8k": "word_problems",
32+
"personal_finance_basic": "personal_finance_basic",
3133
}
3234

3335

0 commit comments

Comments (0)