Skip to content
This repository was archived by the owner on Jan 8, 2026. It is now read-only.

Commit be5bbad

Browse files
committed
Rename _get_cluster_sizes to calculate_cluster_size_distribution
1 parent efd6a14 commit be5bbad

File tree

1 file changed

+18 -4 lines changed

1 file changed

+18
-4
lines changed

clusx/evaluation.py

Lines changed: 18 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -466,12 +466,25 @@ def find_outliers(self, n_neighbors: int = 5) -> dict[str, float]:
466466
logger.error("Error detecting outliers: %s", err)
467467
return {}
468468

469-
def _get_cluster_sizes(self) -> dict[str, int]:
469+
def calculate_cluster_size_distribution(self) -> dict[str, int]:
470470
"""
471-
Get the size distribution of clusters.
471+
Calculate the distribution of cluster sizes across all clusters.
472+
473+
This method counts the number of texts assigned to each cluster and returns
474+
a mapping of cluster IDs to their respective sizes. The distribution is useful
475+
for:
476+
477+
- Analyzing the balance of cluster assignments
478+
- Identifying dominant vs. minor clusters
479+
- Providing input for power-law distribution analysis
480+
- Visualizing the cluster size distribution
481+
482+
The cluster IDs are converted to strings in the returned dictionary to ensure
483+
compatibility with JSON serialization.
472484
473485
Returns:
474-
dict[str, int]: Dictionary mapping cluster IDs to their sizes
486+
dict[str, int]: Dictionary mapping cluster IDs (as strings) to their sizes,
487+
where size represents the number of texts in each cluster
475488
"""
476489
cluster_sizes = {}
477490
for cluster_id in self.unique_clusters:
@@ -490,6 +503,7 @@ def generate_report(self) -> "dict[str, Any]":
490503
similarity_metrics = self.calculate_similarity_metrics()
491504
powerlaw_metrics = self.detect_powerlaw_distribution()
492505
outliers = self.find_outliers()
506+
cluster_sizes = self.calculate_cluster_size_distribution()
493507

494508
# Compile the report
495509
report = {
@@ -503,7 +517,7 @@ def generate_report(self) -> "dict[str, Any]":
503517
"cluster_stats": {
504518
"num_clusters": len(self.unique_clusters),
505519
"num_texts": len(self.texts),
506-
"cluster_sizes": self._get_cluster_sizes(),
520+
"cluster_sizes": cluster_sizes,
507521
},
508522
"metrics": {
509523
"silhouette_score": silhouette,

0 commit comments

Comments (0)