Skip to content

Commit a224c5e

Browse files
authored
Merge pull request #31 from VectorInstitute/ft/cap_embeddings_and_scores
Capability score visualization
2 parents 160ca1a + c7f596d commit a224c5e

File tree

6 files changed

+483
-36
lines changed

6 files changed

+483
-36
lines changed
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
scientist_llm:
2+
name: o4-mini
3+
provider: openai
4+
exp_cfg:
5+
exp_id: o4-mini_C100_R5_A10_T100
6+
# The capabilities directory should contain the LLM generated capability files.
7+
capabilities_cfg:
8+
saved_capabilities_dir: /fs01/projects/aieng/public/ace/artifacts/gcp_artifacts/capabilities_o4-mini_C100_R5_A10_T100
9+
domain: math
10+
# Method used to generate capabilities
11+
method: "hierarchical"
12+
# Number of seed capabilities to use for initial capability generation
13+
# Set to -1 to use all seed capabilities
14+
num_seed_capabilities: 1
15+
# Number of initial capabilities to generate using the scientist LLM
16+
num_gen_capabilities: 100
17+
# Buffer for capability generation
18+
num_gen_capabilities_buffer: 0.2
19+
# Number of capability areas to generate
20+
num_capability_areas: 10
21+
# Number of initial capabilities to generate per run
22+
num_gen_capabilities_per_run: 5
23+
# Number of tasks to generate for each capability
24+
num_gen_tasks_per_capability: 100
25+
# Buffer for task generation
26+
num_gen_tasks_buffer: 0.2
27+
28+
score_cfg:
29+
subject_llm_names:
30+
- claude-3-7-sonnet-20250219
31+
- o3-mini
32+
- gemini-2.0-flash
33+
- o1-mini
34+
- Meta-Llama-3.1-70B-Instruct
35+
36+
# The scores directory should contain the LLM capability scores.
37+
read_score_dir: /fs01/projects/aieng/public/ace/artifacts/gcp_artifacts/scores
38+
# The directory to save the capability scores plots.
39+
plot_capabilities_score_dir: /fs01/projects/aieng/public/ace/artifacts/gcp_artifacts/plots
40+
41+
defaults:
42+
- _self_

example_scripts/example_cfg/train_test_embedding_visualization_cfg.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
# The capabilities directory should contain the LLM generated capability files.
22
capabilities_cfg:
3-
saved_capabilities_dir: /fs01/projects/aieng/public/ace/artifacts/capabilities_o3-mini_C20_R3_A5_T5
3+
saved_capabilities_dir: /fs01/projects/aieng/public/ace/capabilities_o4-mini_C100_R5_A10_T100
44
domain: math
55

66
embedding_cfg:
77
# The embedding model name used to generate capability embeddings used for filtering.
88
embedding_model: "text-embedding-3-small" # "text-embedding-3-small" or "text-embedding-3-large"
99
embedding_size: 512
1010
# The cosine similarity threshold for filtering capabilities based on their embeddings.
11-
filtering_similarity_threshold: 0.9
11+
filtering_similarity_threshold: 0.90
1212

1313
dimensionality_reduction_cfg:
1414
# Dimensionality reduction method generates the low dimensional encodings.
@@ -19,12 +19,12 @@ dimensionality_reduction_cfg:
1919
normalize_output: False
2020

2121
embedding_visualization_cfg:
22-
save_dir: /fs01/projects/aieng/public/ace/artifacts/visualizations
23-
plot_name: "Non-normalized PCA Embeddings"
24-
show_point_ids: true # Set to true when plotting a small number of capabilities.
22+
save_dir: /fs01/projects/aieng/public/ace/capabilities_o4-mini_C100_R5_A10_T100/visualizations
23+
plot_name: "PCA Embeddings"
24+
show_point_ids: False # Set to true when plotting a small number of capabilities.
2525

2626
heatmap_cfg:
27-
save_dir: /fs01/projects/aieng/public/ace/artifacts/visualizations
27+
save_dir: /fs01/projects/aieng/public/ace/capabilities_o4-mini_C100_R5_A10_T100/visualizations
2828
plot_name: "embedding_heatmap"
2929
add_squares: true
3030

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import logging # noqa: D100
2+
import os # noqa: D100
3+
4+
import hydra
5+
from omegaconf import DictConfig
6+
from tqdm import tqdm
7+
8+
from src.generate_capabilities import (
9+
get_previous_capabilities,
10+
plot_capability_scores_spider_and_bar_chart,
11+
select_complete_capabilities,
12+
)
13+
from src.utils.data_utils import get_run_id
14+
15+
16+
logger = logging.getLogger(__name__)
17+
18+
19+
@hydra.main(
    version_base=None,
    config_path="example_cfg",
    config_name="capability_score_visualization",
)
def main(cfg: DictConfig) -> None:
    """Load scored capabilities and plot their per-area scores.

    Reads previously generated capabilities from
    ``<saved_capabilities_dir>/<domain>``, keeps the ones returned by
    ``select_complete_capabilities``, loads the scores of every configured
    subject LLM and passes the result to
    ``plot_capability_scores_spider_and_bar_chart``.

    Args
    ----
        cfg (DictConfig): Hydra configuration. The ``capabilities_cfg`` and
            ``score_cfg`` sections are read directly here; the whole config
            is also passed to ``get_run_id``.
    """
    run_id = get_run_id(cfg)

    # Capabilities are stored under <saved_capabilities_dir>/<domain>.
    capability_dir = os.path.join(
        cfg.capabilities_cfg.saved_capabilities_dir,
        cfg.capabilities_cfg.domain,
    )

    def _by_name(capability):
        """Sort key: the capability's name."""
        return capability.name

    # Read the previously generated capabilities from the base directory.
    loaded = get_previous_capabilities(
        capability_dir=capability_dir,
        score_dir_suffix=run_id,
    )
    all_capabilities = sorted(loaded, key=_by_name)
    logger.info(f"All capability names:\n{all_capabilities}")

    # Minimum number of tasks a capability must have: the configured task
    # count reduced by the generation buffer fraction, truncated to an int.
    min_tasks = int(
        cfg.capabilities_cfg.num_gen_tasks_per_capability
        * (1 - cfg.capabilities_cfg.num_gen_tasks_buffer)
    )
    complete = select_complete_capabilities(
        capabilities=all_capabilities,
        strict=False,
        num_tasks_lower_bound=min_tasks,
    )
    # Re-sort by name after the selection step.
    complete = sorted(complete, key=_by_name)

    # Pre-load the scores of every subject LLM on each selected capability.
    subject_llm_names = cfg.score_cfg.subject_llm_names
    for subject_llm_name in subject_llm_names:
        for capability in tqdm(complete, desc="Loading capability scores"):
            capability.load_scores(subject_llm_name=subject_llm_name)

    # Plot the scores grouped by area as spider and grouped bar charts.
    plot_capability_scores_spider_and_bar_chart(
        complete,
        subject_llm_names,
        cfg.score_cfg.plot_capabilities_score_dir,
        plot_name="llm_scores",
        plot_spider_chart=True,
        plot_grouped_bars=True,
    )
68+
69+
70+
if __name__ == "__main__":
71+
main()

example_scripts/train_test_embedding_visualization.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ def main(cfg: DictConfig) -> None:
3737
train_capability_dir = os.path.join(
3838
cfg.capabilities_cfg.saved_capabilities_dir,
3939
cfg.capabilities_cfg.domain,
40-
"train",
4140
)
4241

4342
# Fetch previously generated capabilities
@@ -65,8 +64,10 @@ def main(cfg: DictConfig) -> None:
6564
dim_reduction_method_name=cfg.dimensionality_reduction_cfg.reduce_dimensionality_method,
6665
output_dimension_size=cfg.dimensionality_reduction_cfg.reduced_dimensionality_size,
6766
embedding_model_name=cfg.embedding_cfg.embedding_model,
67+
tsne_perplexity=cfg.dimensionality_reduction_cfg.tsne_perplexity,
6868
normalize_output=cfg.dimensionality_reduction_cfg.normalize_output,
6969
)
70+
7071
# Visualize the reduced embeddings
7172
logger.info(
7273
f"Visualizing {len(filtered_capabilities)} train capabilities at {cfg.embedding_visualization_cfg.save_dir}"
@@ -76,12 +77,13 @@ def main(cfg: DictConfig) -> None:
7677
capabilities=filtered_capabilities,
7778
dim_reduction_method=cfg.dimensionality_reduction_cfg.reduce_dimensionality_method,
7879
save_dir=cfg.embedding_visualization_cfg.save_dir,
79-
plot_name=cfg.embedding_visualization_cfg.plot_name + " Train",
80+
plot_name=cfg.embedding_visualization_cfg.plot_name,
8081
show_point_ids=cfg.embedding_visualization_cfg.show_point_ids,
8182
)
8283
# Create and save the heatmap
8384
logger.info(
84-
f"Generating heatmap for {len(filtered_capabilities)} train capabilities at {cfg.heatmap_cfg.save_dir}"
85+
f"Generating heatmap for {len(filtered_capabilities)} train capabilities\
86+
at {cfg.heatmap_cfg.save_dir}"
8587
)
8688
generate_capability_heatmap(
8789
capabilities=filtered_capabilities,
@@ -90,6 +92,22 @@ def main(cfg: DictConfig) -> None:
9092
plot_name=cfg.heatmap_cfg.plot_name,
9193
add_squares=cfg.heatmap_cfg.add_squares,
9294
)
95+
96+
_ = apply_dimensionality_reduction(
97+
filtered_capabilities,
98+
dim_reduction_method_name="t-sne",
99+
output_dimension_size=cfg.dimensionality_reduction_cfg.reduced_dimensionality_size,
100+
embedding_model_name=cfg.embedding_cfg.embedding_model,
101+
tsne_perplexity=cfg.dimensionality_reduction_cfg.tsne_perplexity,
102+
normalize_output=cfg.dimensionality_reduction_cfg.normalize_output,
103+
)
104+
plot_hierarchical_capability_2d_embeddings(
105+
capabilities=filtered_capabilities,
106+
dim_reduction_method="t-sne",
107+
save_dir=cfg.embedding_visualization_cfg.save_dir,
108+
plot_name="t-SNE Embedding",
109+
show_point_ids=cfg.embedding_visualization_cfg.show_point_ids,
110+
)
93111
# Test capabilities
94112
# Only PCA can be used for test capabilities.
95113
if cfg.dimensionality_reduction_cfg.reduce_dimensionality_method == "pca":

src/generate_capabilities.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
filter_embeddings,
2222
hierarchical_2d_visualization,
2323
save_embedding_heatmap,
24+
visualize_llm_scores_area_grouped_bar_chart,
25+
visualize_llm_scores_spider_chart,
2426
)
2527
from src.model import Model
2628
from src.utils import constants, prompts
@@ -354,12 +356,64 @@ def generate_capabilities_using_llm(
354356
}
355357

356358

359+
def plot_capability_scores_spider_and_bar_chart(
    capabilities: List[Capability],
    subject_llm_names: List[str],
    save_dir: str,
    plot_name: str,
    plot_spider_chart: bool = True,
    plot_grouped_bars: bool = True,
) -> None:
    """Plot per-area capability scores as a spider chart and/or grouped bars.

    The ``"mean"`` score of every capability is collected per area and per
    subject LLM, then reduced to a ``(mean, std)`` pair for each
    (area, LLM) combination. The aggregated mapping is passed to the
    requested visualization helpers.

    Args
    ----
        capabilities (List[Capability]): The list of capabilities. Each one
            must expose ``area`` and ``scores[llm_name]["mean"]`` for every
            name in ``subject_llm_names``.
        subject_llm_names (List[str]): The names of the subject LLMs.
        save_dir (str): The directory to save the plots.
        plot_name (str): Base name of the plots; ``"_spider_chart"`` and
            ``"_bar_chart"`` are appended for the respective charts.
        plot_spider_chart (bool): Whether to plot a spider chart.
        plot_grouped_bars (bool): Whether to plot grouped bars.

    """
    # Collect the raw mean scores, keyed by area and then by LLM name.
    # Example: {"area1": {"llm1": [score1, score2], "llm2": [score3, score4]}}
    llm_scores_by_area: Dict[str, Dict[str, List[float]]] = {}
    for capability in capabilities:
        area_scores = llm_scores_by_area.setdefault(capability.area, {})
        for llm_name in subject_llm_names:
            area_scores.setdefault(llm_name, []).append(
                capability.scores[llm_name]["mean"]
            )

    # Reduce every score list to its mean and standard deviation.
    # Example: {"area1": {"llm1": (mean1, std1), "llm2": (mean2, std2)}}
    avg_llm_scores_by_area: Dict[str, Dict[str, Any]] = {
        area: {
            llm_name: (np.mean(scores), np.std(scores))
            for llm_name, scores in per_llm_scores.items()
        }
        for area, per_llm_scores in llm_scores_by_area.items()
    }

    if plot_spider_chart:
        visualize_llm_scores_spider_chart(
            avg_llm_scores_by_area, save_dir, f"{plot_name}_spider_chart"
        )
    if plot_grouped_bars:
        visualize_llm_scores_area_grouped_bar_chart(
            avg_llm_scores_by_area, save_dir, f"{plot_name}_bar_chart"
        )
408+
409+
357410
def plot_hierarchical_capability_2d_embeddings(
358411
capabilities: List[Capability],
359412
dim_reduction_method: str,
360413
plot_name: str,
361414
save_dir: str,
362415
show_point_ids: bool,
416+
save_area_legend: bool = True,
363417
) -> None:
364418
"""Visualize the hierarchical capability embeddings.
365419
@@ -374,6 +428,11 @@ def plot_hierarchical_capability_2d_embeddings(
374428
save_dir (str): The directory to save the plot.
375429
show_point_ids (bool): Whether to show point IDs in the plot. Set this to
376430
False for large datasets to avoid cluttering the plot.
431+
save_area_legend (bool): Whether to save the area legend as a separate plot.
432+
433+
Returns
434+
-------
435+
None
377436
"""
378437
# Get the reduced embeddings.
379438
reduced_embeddings = [
@@ -397,6 +456,7 @@ def plot_hierarchical_capability_2d_embeddings(
397456
save_dir=save_dir,
398457
plot_name=plot_name,
399458
points_area_name_ids=points_area_name_ids if show_point_ids else None,
459+
save_area_legend=save_area_legend,
400460
)
401461

402462

@@ -568,7 +628,8 @@ def generate_and_set_capabilities_embeddings(
568628
texts = []
569629
for capability in capabilities:
570630
capability_dict = capability.to_dict(attribute_names=["name", "description"])
571-
texts.append(f"{capability_dict['name']}: {capability_dict['description']}")
631+
rep_string = f"{capability_dict['name']} - {capability.area}: {capability_dict['description']}"
632+
texts.append(rep_string)
572633
embeddings = embedding_generator.generate_embeddings(texts)
573634
# Set embeddings for capabilities.
574635
for i, capability in enumerate(capabilities):

0 commit comments

Comments
 (0)