Skip to content

Commit 6f7288c

Browse files
authored
Merge pull request #17 from VectorInstitute/ft/test-capability-embeddings
t-SNE visualization scripts for embedding validation
2 parents 9480971 + 953b119 commit 6f7288c

14 files changed

+826
-70
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@ __pycache__/
44
*$py.class
55
.vscode/
66

7+
# macOS system files
8+
.DS_Store
9+
710
# C extensions
811
*.so
912

example_scripts/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
"""
Package initializer for the example scripts.

Marks ``example_scripts`` as a Python package so that the example
modules it contains can be imported.
"""
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# The capabilities directory should contain the LLM generated capability files.
capabilities_cfg:
  saved_capabilities_dir: /fs01/projects/aieng/public/ace/artifacts/capabilities_gpt-4o-mini_T20_R4_A5
  # Capabilities are read from <saved_capabilities_dir>/<domain>.
  domain: math

embedding_cfg:
  # The embedding model name used to generate capability embeddings used for filtering.
  embedding_model: "text-embedding-3-small" # "text-embedding-3-small" or "text-embedding-3-large"
  embedding_size: 512
  # The cosine similarity threshold for filtering capabilities based on their embeddings.
  filtering_similarity_threshold: 0.85

dimensionality_reduction_cfg:
  # Dimensionality reduction method generates the low dimensional encodings.
  reduce_dimensionality_method: "t-sne" # "t-sne" or "cut-embedding".
  reduced_dimensionality_size: 2
  tsne_perplexity: 8 # Choose this hyperparameter based on the number of capabilities you have.

embedding_visualization_cfg:
  save_dir: /fs01/projects/aieng/public/ace/artifacts/visualizations
  plot_name: "tsne_plot"
  show_point_ids: true # Set to true when plotting a small number of capabilities.

heatmap_cfg:
  save_dir: /fs01/projects/aieng/public/ace/artifacts/visualizations
  plot_name: "heatmap"
  add_squares: true

# Hydra defaults list; _self_ keeps this file's values as the base config.
defaults:
  - _self_
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
"""The visualize_embeddings.py script is used to visualize LLM generated capabilities.

It generates a 2D plot of the capabilities' embeddings using the specified
dimensionality reduction method. It also generates a heatmap of the capabilities
similarity matrix. Similarity is computed using the original embeddings
generated with the embedding model defined in the config file.
"""

import os

import hydra
from omegaconf import DictConfig

from src.generate_capabilities import (
    apply_dimensionality_reduction,
    filter_capabilities,
    generate_and_set_capabilities_embeddings,
    generate_capability_heatmap,
    get_previous_capabilities,
    plot_hierarchical_capability_2d_embeddings,
)


@hydra.main(
    version_base=None, config_path="example_cfg", config_name="visualization_cfg"
)
def main(cfg: DictConfig) -> None:
    """
    Run the visualization script with the specified configuration.

    Args:
        cfg (DictConfig): Configuration for the script.

    Raises:
        FileNotFoundError: If no capability files exist in the configured
            capability directory.
    """
    # Capabilities live under <saved_capabilities_dir>/<domain>.
    base_capability_dir = os.path.join(
        cfg.capabilities_cfg.saved_capabilities_dir,
        cfg.capabilities_cfg.domain,
    )
    os.makedirs(base_capability_dir, exist_ok=True)

    # Fetch previously generated capabilities.
    capabilities = get_previous_capabilities(capability_dir=base_capability_dir)
    print(f"Loaded {len(capabilities)} capabilities from {base_capability_dir}")
    # NOTE: was an `assert`, which is silently stripped under `python -O`;
    # raise explicitly so the failure mode survives optimized runs.
    if not capabilities:
        raise FileNotFoundError(
            f"No capabilities found in the specified directory: {base_capability_dir}"
        )

    # Embed capabilities using the configured OpenAI embedding model.
    generate_and_set_capabilities_embeddings(
        capabilities=capabilities,
        embedding_model_name=cfg.embedding_cfg.embedding_model,
        embed_dimensions=cfg.embedding_cfg.embedding_size,
    )
    # Filter out near-duplicate capabilities based on embedding similarity.
    filtered_capabilities = filter_capabilities(
        capabilities,
        embedding_model_name=cfg.embedding_cfg.embedding_model,
        similarity_threshold=cfg.embedding_cfg.filtering_similarity_threshold,
    )
    # Reduce the dimensionality of capability embeddings generated by the
    # embedding model.
    apply_dimensionality_reduction(
        filtered_capabilities,
        dim_reduction_method=cfg.dimensionality_reduction_cfg.reduce_dimensionality_method,
        output_dimension_size=cfg.dimensionality_reduction_cfg.reduced_dimensionality_size,
        embedding_model_name=cfg.embedding_cfg.embedding_model,
        tsne_perplexity=cfg.dimensionality_reduction_cfg.tsne_perplexity,
    )
    # Visualize the reduced (2D) embeddings.
    print(
        f"Visualizing {len(filtered_capabilities)} capabilities at {cfg.embedding_visualization_cfg.save_dir}"
    )
    plot_hierarchical_capability_2d_embeddings(
        capabilities=filtered_capabilities,
        dim_reduction_method=cfg.dimensionality_reduction_cfg.reduce_dimensionality_method,
        save_dir=cfg.embedding_visualization_cfg.save_dir,
        plot_name=cfg.embedding_visualization_cfg.plot_name,
        show_point_ids=cfg.embedding_visualization_cfg.show_point_ids,
    )
    # Create and save the similarity heatmap.
    print(
        f"Generating heatmap for {len(filtered_capabilities)} capabilities at {cfg.heatmap_cfg.save_dir}"
    )
    generate_capability_heatmap(
        capabilities=filtered_capabilities,
        embedding_model_name=cfg.embedding_cfg.embedding_model,  # Using the original embeddings, not the reduced version.
        save_dir=cfg.heatmap_cfg.save_dir,
        plot_name=cfg.heatmap_cfg.plot_name,
        add_squares=cfg.heatmap_cfg.add_squares,
    )


if __name__ == "__main__":
    main()

src/capability.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ class Capability:
127127
Loads the capability configuration from a JSON file.
128128
_load_capability_repr_class() -> None
129129
Loads the capability representation class from a Python file.
130-
_to_dict() -> Dict[str, Any]
130+
to_dict() -> Dict[str, Any]
131131
Converts the capability attributes to a dictionary.
132132
to_json_str() -> str
133133
Converts the capability to a JSON string.
@@ -509,7 +509,20 @@ def add_and_update_tasks(
509509
self._load_capability_json()
510510
self._load_capability_repr_class()
511511

512-
def _to_dict(self, attribute_names: List[str] | None = None) -> Dict[str, Any]:
512+
def to_dict(self, attribute_names: List[str] | None = None) -> Dict[str, Any]:
513+
"""
514+
Return a dictionary of the capability attributes.
515+
516+
Args:
517+
attribute_names (List[str] | None, optional): the list of attribute
518+
names requested. If none, return a set of default attributes.
519+
Defaults to None.
520+
521+
Returns
522+
-------
523+
Dict[str, Any]: a dictionary representation of the capability
524+
based on the requested attribute names or a default set of attributes.
525+
"""
513526
if attribute_names is None:
514527
return {
515528
"name": self.name,
@@ -521,6 +534,22 @@ def _to_dict(self, attribute_names: List[str] | None = None) -> Dict[str, Any]:
521534
attr: getattr(self, attr) for attr in attribute_names if hasattr(self, attr)
522535
}
523536

537+
def get_attribute(self, attribute_name: str) -> Any:
    """
    Return the value of the named capability attribute.

    Args
    ----
        attribute_name (str): The name of the attribute to retrieve.

    Returns
    -------
        Any: The value of the specified attribute.

    Raises
    ------
        AttributeError: If the capability has no such attribute.
    """
    # EAFP: attempt the lookup and translate a miss into the
    # capability-specific error message.
    try:
        return getattr(self, attribute_name)
    except AttributeError:
        raise AttributeError(
            f"Attribute {attribute_name} not found in capability."
        ) from None
552+
524553
def to_json_str(self, attribute_names: List[str] | None = None) -> str:
525554
"""
526555
Convert the capability to a JSON string.
@@ -538,7 +567,7 @@ def to_json_str(self, attribute_names: List[str] | None = None) -> str:
538567
# If only the name is requested, return the name directly
539568
repr_str = self.name
540569
else:
541-
repr_str = json.dumps(self._to_dict(attribute_names), indent=4)
570+
repr_str = json.dumps(self.to_dict(attribute_names), indent=4)
542571
return str(repr_str)
543572

544573
def __str__(self) -> str:

src/generate_capabilities.py

Lines changed: 106 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from typing import Any, Dict, List, Optional
77

88
import numpy as np
9+
import torch
910
from langsmith import tracing_context
1011
from tenacity import Retrying, stop_after_attempt
1112

@@ -15,7 +16,9 @@
1516
EmbeddingGenerator,
1617
EmbeddingModelName,
1718
filter_embeddings,
19+
hierarchical_2d_visualization,
1820
reduce_embeddings_dimensions,
21+
save_embedding_heatmap,
1922
)
2023
from src.model import Model
2124
from src.utils import constants, prompts
@@ -145,7 +148,7 @@ def get_capability_repr_with_score(capability: Capability, model_name: str) -> s
145148
str: A JSON string containing the capability JSON string and score.
146149
"""
147150
model_score = capability.load_scores()[model_name]
148-
capability_dict = capability._to_dict()
151+
capability_dict = capability.to_dict()
149152
capability_dict["score"] = model_score
150153
return json.dumps(capability_dict, indent=4)
151154

@@ -340,12 +343,106 @@ def generate_capabilities_using_llm(
340343
}
341344

342345

346+
def plot_hierarchical_capability_2d_embeddings(
    capabilities: List[Capability],
    dim_reduction_method: str,
    plot_name: str,
    save_dir: str,
    show_point_ids: bool,
) -> None:
    """Visualize the hierarchical capability embeddings.

    Embeddings are retrieved based on the defined dim_reduction_method,
    and they should be 2D.

    Args
    ----
        capabilities (List[Capability]): The list of capabilities.
        dim_reduction_method (str): The dimensionality reduction method to use.
        plot_name (str): The name of the plot to save.
        save_dir (str): The directory to save the plot.
        show_point_ids (bool): Whether to show point IDs in the plot. Set this to
            False for large datasets to avoid cluttering the plot.

    Returns
    -------
        None
    """
    # Group the reduced embeddings by capability area in a single pass;
    # each capability's point id is its index in the input list.
    embeddings_by_area: dict[str, List[torch.Tensor]] = {}
    points_area_name_ids: dict[str, dict[str, int]] = {}
    for point_id, capability in enumerate(capabilities):
        area = capability.get_attribute("area")
        embeddings_by_area.setdefault(area, []).append(
            capability.get_embedding(dim_reduction_method)
        )
        points_area_name_ids.setdefault(area, {})[capability.name] = point_id

    hierarchical_2d_visualization(
        embeddings_by_area=embeddings_by_area,
        save_dir=save_dir,
        plot_name=plot_name,
        points_area_name_ids=points_area_name_ids if show_point_ids else None,
    )
394+
395+
396+
def generate_capability_heatmap(
    capabilities: List[Capability],
    embedding_model_name: str,
    plot_name: str,
    save_dir: str,
    add_squares: bool,
) -> None:
    """
    Generate and save a heatmap of the capabilities based on their embeddings.

    Args:
        capabilities (List[Capability]): the list of capabilities.
        embedding_model_name (str): name of the embedding model used
            to generate the embeddings.
        plot_name (str): name of the plot file to save.
        save_dir (str): directory to save the plot.
        add_squares (bool): whether to add squares to the heatmap.
    """
    # Get the embeddings based on the specified embedding model name.
    embeddings = [
        capability.get_embedding(embedding_model_name) for capability in capabilities
    ]
    # Use get_attribute (consistent with
    # plot_hierarchical_capability_2d_embeddings) so a missing "area"
    # raises a clear capability-specific error instead of a bare AttributeError.
    area_names = [capability.get_attribute("area") for capability in capabilities]
    # Process capabilities to populate embeddings_by_area and
    # capability_names_by_area.
    embeddings_by_area: dict[str, List[torch.Tensor]] = {}
    capability_names_by_area: dict[str, List[str]] = {}
    for idx in range(len(capabilities)):
        embedding_group = area_names[idx]
        if embedding_group not in embeddings_by_area:
            embeddings_by_area[embedding_group] = []
            capability_names_by_area[embedding_group] = []
        embeddings_by_area[embedding_group].append(embeddings[idx])
        capability_names_by_area[embedding_group].append(capabilities[idx].name)

    save_embedding_heatmap(
        embeddings_by_area=embeddings_by_area,
        capability_names_by_area=capability_names_by_area,
        save_dir=save_dir,
        plot_name=plot_name,
        add_squares=add_squares,
    )
438+
439+
343440
def apply_dimensionality_reduction(
344441
capabilities: List[Capability],
345442
dim_reduction_method: str,
346443
output_dimension_size: int,
347444
embedding_model_name: str,
348-
seed: int = 42,
445+
tsne_perplexity: int,
349446
) -> None: # noqa: D205
350447
"""Apply dimensionality reduction to the capabilities.
351448
@@ -391,7 +488,7 @@ def apply_dimensionality_reduction(
391488
embeddings,
392489
output_dimensions=output_dimension_size,
393490
dim_reduction_technique=DimensionalityReductionTechnique(dim_reduction_method),
394-
seed=seed,
491+
perplexity=tsne_perplexity,
395492
)
396493
# Set the reduced embeddings for each capability.
397494
for capability, reduced_embedding in zip(capabilities, reduced_embeddings):
@@ -425,12 +522,12 @@ def generate_and_set_capabilities_embeddings(
425522
embed_dimensions=embed_dimensions,
426523
)
427524
# Generate embeddings for the capabilities, all at the same time.
428-
embeddings = embedding_generator.generate_embeddings(
429-
texts=[
430-
capability.to_json_str(attribute_names=["name", "description", "domain"])
431-
for capability in capabilities
432-
]
433-
)
525+
# Embeddings are generated based on the capability name and description.
526+
texts = []
527+
for capability in capabilities:
528+
capability_dict = capability.to_dict(attribute_names=["name", "description"])
529+
texts.append(f"{capability_dict['name']}: {capability_dict['description']}")
530+
embeddings = embedding_generator.generate_embeddings(texts)
434531
# Set embeddings for capabilities.
435532
for i, capability in enumerate(capabilities):
436533
capability.set_embedding(

0 commit comments

Comments
 (0)