@@ -466,12 +466,25 @@ def find_outliers(self, n_neighbors: int = 5) -> dict[str, float]:
466466 logger .error ("Error detecting outliers: %s" , err )
467467 return {}
468468
469- def _get_cluster_sizes (self ) -> dict [str , int ]:
469+ def calculate_cluster_size_distribution (self ) -> dict [str , int ]:
470470 """
471- Get the size distribution of clusters.
471+ Calculate the distribution of cluster sizes across all clusters.
472+
473+ This method counts the number of texts assigned to each cluster and returns
474+ a mapping of cluster IDs to their respective sizes. The distribution is useful
475+ for:
476+
477+ - Analyzing the balance of cluster assignments
478+ - Identifying dominant vs. minor clusters
479+ - Providing input for power-law distribution analysis
480+ - Visualizing the cluster size distribution
481+
482+ The cluster IDs are converted to strings in the returned dictionary to ensure
483+ compatibility with JSON serialization.
472484
473485 Returns:
474- dict[str, int]: Dictionary mapping cluster IDs to their sizes
486+ dict[str, int]: Dictionary mapping cluster IDs (as strings) to their sizes,
487+ where size represents the number of texts in each cluster
475488 """
476489 cluster_sizes = {}
477490 for cluster_id in self .unique_clusters :
@@ -490,6 +503,7 @@ def generate_report(self) -> "dict[str, Any]":
490503 similarity_metrics = self .calculate_similarity_metrics ()
491504 powerlaw_metrics = self .detect_powerlaw_distribution ()
492505 outliers = self .find_outliers ()
506+ cluster_sizes = self .calculate_cluster_size_distribution ()
493507
494508 # Compile the report
495509 report = {
@@ -503,7 +517,7 @@ def generate_report(self) -> "dict[str, Any]":
503517 "cluster_stats" : {
504518 "num_clusters" : len (self .unique_clusters ),
505519 "num_texts" : len (self .texts ),
506- "cluster_sizes" : self . _get_cluster_sizes () ,
520+ "cluster_sizes" : cluster_sizes ,
507521 },
508522 "metrics" : {
509523 "silhouette_score" : silhouette ,
0 commit comments