|
@@ -6,6 +6,7 @@
 from typing import Any, Dict, List, Optional
 
 import numpy as np
+import torch
 from langsmith import tracing_context
 from tenacity import Retrying, stop_after_attempt
 
@@ -15,7 +16,9 @@
     EmbeddingGenerator,
     EmbeddingModelName,
     filter_embeddings,
+    hierarchical_2d_visualization,
     reduce_embeddings_dimensions,
+    save_embedding_heatmap,
 )
 from src.model import Model
 from src.utils import constants, prompts
@@ -145,7 +148,7 @@ def get_capability_repr_with_score(capability: Capability, model_name: str) -> str:
         str: A JSON string containing the capability JSON string and score.
     """
     model_score = capability.load_scores()[model_name]
-    capability_dict = capability._to_dict()
+    capability_dict = capability.to_dict()
     capability_dict["score"] = model_score
     return json.dumps(capability_dict, indent=4)
 
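A quick sketch of what the serialized output looks like after this change; the field values below are invented for illustration and are not taken from the repository:

```python
import json

# Hypothetical capability dict after the model score is attached.
capability_dict = {
    "name": "arithmetic_reasoning",
    "description": "Solve multi-step arithmetic word problems.",
    "score": 0.73,
}
print(json.dumps(capability_dict, indent=4))
```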
|
@@ -340,12 +343,106 @@ def generate_capabilities_using_llm(
         }
 
 
+def plot_hierarchical_capability_2d_embeddings(
+    capabilities: List[Capability],
+    dim_reduction_method: str,
+    plot_name: str,
+    save_dir: str,
+    show_point_ids: bool,
+) -> None:
+    """Visualize the hierarchical capability embeddings.
+
+    Embeddings are retrieved based on the defined dim_reduction_method,
+    and they should be 2D.
+
+    Args
+    ----
+    capabilities (List[Capability]): The list of capabilities.
+    dim_reduction_method (str): The dimensionality reduction method to use.
+    plot_name (str): The name of the plot to save.
+    save_dir (str): The directory to save the plot.
+    show_point_ids (bool): Whether to show point IDs in the plot. Set this to
+        False for large datasets to avoid cluttering the plot.
+
+    Returns
+    -------
+    None
+    """
+    # Get the reduced embeddings.
+    reduced_embeddings = [
+        capability.get_embedding(dim_reduction_method) for capability in capabilities
+    ]
+    area_names = [capability.get_attribute("area") for capability in capabilities]
+
+    # Populate embeddings_by_area and points_area_name_ids.
+    embeddings_by_area: dict[str, List[torch.Tensor]] = {}
+    points_area_name_ids: dict[str, dict[str, int]] = {}
+    for idx in range(len(reduced_embeddings)):
+        area_name = area_names[idx]
+        if area_name not in embeddings_by_area:
+            embeddings_by_area[area_name] = []
+            points_area_name_ids[area_name] = {}
+        embeddings_by_area[area_name].append(reduced_embeddings[idx])
+        points_area_name_ids[area_name][capabilities[idx].name] = idx
+
+    hierarchical_2d_visualization(
+        embeddings_by_area=embeddings_by_area,
+        save_dir=save_dir,
+        plot_name=plot_name,
+        points_area_name_ids=points_area_name_ids if show_point_ids else None,
+    )
+
+
+def generate_capability_heatmap(
+    capabilities: List[Capability],
+    embedding_model_name: str,
+    plot_name: str,
+    save_dir: str,
+    add_squares: bool,
+) -> None:
+    """
+    Generate and save a heatmap of the capabilities based on their embeddings.
+
+    Args:
+        capabilities (List[Capability]): the list of capabilities.
+        embedding_model_name (str): name of the embedding model used
+            to generate the embeddings.
+        plot_name (str): name of the plot file to save.
+        save_dir (str): directory to save the plot.
+        add_squares (bool): whether to add squares to the heatmap.
+    """
+    # Get the embeddings based on the specified embedding model name.
+    embeddings = [
+        capability.get_embedding(embedding_model_name) for capability in capabilities
+    ]
+    # Process capabilities to populate embeddings_by_area and
+    # capability_names_by_area.
+    area_names = [capability.area for capability in capabilities]
+    embeddings_by_area: dict[str, List[torch.Tensor]] = {}
+    capability_names_by_area: dict[str, List[str]] = {}
+    for idx in range(len(capabilities)):
+        embedding_group = area_names[idx]
+        if embedding_group not in embeddings_by_area:
+            embeddings_by_area[embedding_group] = []
+            capability_names_by_area[embedding_group] = []
+        embeddings_by_area[embedding_group].append(embeddings[idx])
+        capability_names_by_area[embedding_group].append(capabilities[idx].name)
+
+    save_embedding_heatmap(
+        embeddings_by_area=embeddings_by_area,
+        capability_names_by_area=capability_names_by_area,
+        save_dir=save_dir,
+        plot_name=plot_name,
+        add_squares=add_squares,
+    )
+
+
 def apply_dimensionality_reduction(
     capabilities: List[Capability],
     dim_reduction_method: str,
     output_dimension_size: int,
     embedding_model_name: str,
-    seed: int = 42,
+    tsne_perplexity: int,
 ) -> None:  # noqa: D205
     """Apply dimensionality reduction to the capabilities.
 
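A minimal usage sketch for the two new plotting helpers, assuming `capabilities` is a `List[Capability]` whose embeddings have already been generated; the method name `"tsne"`, the embedding model name, and the plot arguments below are illustrative assumptions, not values taken from this diff:

```python
# Assumed workflow: reduce embeddings to 2D, then render both plots.
apply_dimensionality_reduction(
    capabilities=capabilities,
    dim_reduction_method="tsne",  # assumed method name
    output_dimension_size=2,
    embedding_model_name="text-embedding-3-small",  # assumed model name
    tsne_perplexity=30,
)
plot_hierarchical_capability_2d_embeddings(
    capabilities=capabilities,
    dim_reduction_method="tsne",
    plot_name="capabilities_2d",
    save_dir="plots",
    show_point_ids=False,  # avoid clutter on large datasets
)
generate_capability_heatmap(
    capabilities=capabilities,
    embedding_model_name="text-embedding-3-small",
    plot_name="capabilities_heatmap",
    save_dir="plots",
    add_squares=True,
)
```

Note that both new helpers repeat the same group-by-area loop; a shared grouping helper could remove that duplication in a follow-up.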
|
@@ -391,7 +488,7 @@ def apply_dimensionality_reduction(
         embeddings,
         output_dimensions=output_dimension_size,
         dim_reduction_technique=DimensionalityReductionTechnique(dim_reduction_method),
-        seed=seed,
+        perplexity=tsne_perplexity,
     )
     # Set the reduced embeddings for each capability.
     for capability, reduced_embedding in zip(capabilities, reduced_embeddings):
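One caveat on the new `perplexity` argument: common t-SNE implementations (scikit-learn among them) require perplexity to be strictly less than the number of samples, so callers with few capabilities may want a guard. A hypothetical clamp, not part of this diff:

```python
def clamp_tsne_perplexity(perplexity: int, n_samples: int) -> int:
    # t-SNE requires perplexity < n_samples; clamp into the valid range.
    return max(1, min(perplexity, n_samples - 1))
```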
@@ -425,12 +522,12 @@ def generate_and_set_capabilities_embeddings(
         embed_dimensions=embed_dimensions,
     )
     # Generate embeddings for the capabilities, all at the same time.
-    embeddings = embedding_generator.generate_embeddings(
-        texts=[
-            capability.to_json_str(attribute_names=["name", "description", "domain"])
-            for capability in capabilities
-        ]
-    )
+    # Embeddings are generated based on the capability name and description.
+    texts = []
+    for capability in capabilities:
+        capability_dict = capability.to_dict(attribute_names=["name", "description"])
+        texts.append(f"{capability_dict['name']}: {capability_dict['description']}")
+    embeddings = embedding_generator.generate_embeddings(texts)
     # Set embeddings for capabilities.
     for i, capability in enumerate(capabilities):
         capability.set_embedding(
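The net effect of this hunk is that embedding inputs change from a JSON string over name, description, and domain to a plain "name: description" string. An illustrative example with invented values:

```python
capability_dict = {
    "name": "arithmetic_reasoning",
    "description": "Solve multi-step arithmetic word problems.",
}
text = f"{capability_dict['name']}: {capability_dict['description']}"
# text == "arithmetic_reasoning: Solve multi-step arithmetic word problems."
```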
|