Add type annotations and make minor tweaks

sergeyklay · sergeyklay · commit ceb383fa1b7d · 2025-03-16T19:28:34.000+01:00
diff --git a/clusterium.code-workspace b/clusterium.code-workspace
@@ -6,16 +6,13 @@
 	],
 	"settings": {
 		"git.enableSmartCommit": true,
-
 		"makefile.configureOnOpen": false,
-
 		"python.analysis.typeCheckingMode": "basic",
 		"python.testing.pytestArgs": [
 			"tests"
 		],
 		"python.testing.unittestEnabled": false,
 		"python.testing.pytestEnabled": true,
-
 		"[python]": {
 			"editor.defaultFormatter": "ms-python.black-formatter",
 			"editor.formatOnSave": true,
@@ -31,7 +28,6 @@
 			"--python-version",
 			"auto"
 		],
-
 		"black-formatter.args": [
 			"--extend-exclude",
 			".poetry",
@@ -45,8 +41,11 @@
 		},
 		"isort.check": true,
 		"isort.importStrategy": "fromEnvironment",
-
-		"flake8.path": ["${interpreter}", "-m", "flake8"],
+		"flake8.path": [
+			"${interpreter}",
+			"-m",
+			"flake8"
+		],
 		"flake8.args": [
 			"--max-line-length",
 			"88",
diff --git a/clusx/clustering/models.py b/clusx/clustering/models.py
@@ -32,17 +32,17 @@
 from scipy.special import logsumexp
 from sentence_transformers import SentenceTransformer
 
+from clusx.logging import get_logger
+from clusx.utils import to_numpy
+
 if TYPE_CHECKING:
-    from typing import Any, Optional, Union
+    from typing import Optional, Union
 
     import torch
     from numpy.typing import NDArray
 
     EmbeddingTensor = Union[torch.Tensor, NDArray[np.float32]]
 
-from clusx.logging import get_logger
-from clusx.utils import to_numpy
-
 logger = get_logger(__name__)
 
 
@@ -122,8 +122,8 @@ def __init__(
         # For reproducibility
         self.random_state = np.random.default_rng(seed=random_state)
 
-        self.clusters = []
-        self.cluster_params = {}
+        self.clusters: list[int] = []
+        self.cluster_params: dict[int, dict[str, EmbeddingTensor | int]] = {}
         self.global_mean: Optional[EmbeddingTensor] = None
         self.next_id = 0
         self.embeddings_: Optional[EmbeddingTensor] = None
@@ -374,7 +374,7 @@ def _calculate_cluster_probabilities(
 
         # Convert log scores to probabilities
         scores = np.array(scores)
-        scores -= logsumexp(scores)
+        scores -= logsumexp(scores)  # type: ignore
         probabilities = np.exp(scores)  # type: np.ndarray
 
         # Add placeholder for new cluster ID
@@ -478,7 +478,7 @@ def assign_cluster(self, embedding: EmbeddingTensor) -> tuple[int, np.ndarray]:
 
         return cluster_id, probs
 
-    def fit(self, documents, _y: Union[Any, None] = None):
+    def fit(self, documents, _y=None):
         """
         Train the clustering model on the given text data.
 
@@ -490,7 +490,7 @@ def fit(self, documents, _y: Union[Any, None] = None):
         ----------
         documents : Union[list[str], list[EmbeddingTensor]]
             The text documents or embeddings to cluster.
-        _y : Union[Any, None]
+        _y
             Ignored. Added for compatibility with scikit-learn API.
 
         Returns
@@ -580,15 +580,15 @@ def predict(self, documents):
 
         return np.array(predictions)
 
-    def fit_predict(self, documents, _y: Union[Any, None] = None):
+    def fit_predict(self, documents, _y=None):
         """
         Fit the model and predict cluster labels for documents.
 
         Parameters
         ----------
         documents : Union[list[str], list[EmbeddingTensor]]
             The text documents or embeddings to cluster.
-        _y : Union[Any, None]
+        _y
             This parameter exists only for compatibility with scikit-learn API.
 
         Returns
@@ -837,7 +837,7 @@ def _calculate_cluster_probabilities(
 
         # Convert log scores to probabilities
         scores = np.array(scores)
-        scores -= logsumexp(scores)
+        scores -= logsumexp(scores)  # type: ignore
         probabilities = np.exp(scores)
 
         # Add placeholder for new cluster ID
diff --git a/clusx/clustering/utils.py b/clusx/clustering/utils.py
@@ -161,7 +161,7 @@ def save_clusters_to_json(
     """
     # Group texts by cluster
     cluster_groups = defaultdict(list)
-    for text, cluster_id in zip(texts or [], clusters or []):
+    for text, cluster_id in zip(texts, clusters):
         cluster_groups[cluster_id].append(text)
 
     clusters_json = {
diff --git a/clusx/evaluation.py b/clusx/evaluation.py
@@ -32,16 +32,17 @@
 from typing import TYPE_CHECKING
 
 import numpy as np
-from sklearn.metrics import silhouette_score
-from sklearn.metrics.pairwise import cosine_similarity
-from sklearn.neighbors import NearestNeighbors
+from sklearn.metrics import silhouette_score  # type: ignore
+from sklearn.metrics.pairwise import cosine_similarity  # type: ignore
+from sklearn.neighbors import NearestNeighbors  # type: ignore
+
+from clusx.errors import EvaluationError
+from clusx.logging import get_logger
 
 if TYPE_CHECKING:
-    import numpy  # pylint: disable=reimported
     from typing import Any, Union
 
-from .errors import EvaluationError
-from .logging import get_logger
+    import numpy  # pylint: disable=reimported
 
 logger = get_logger(__name__)
 
@@ -152,9 +153,10 @@ def __init__(
 
         # Validate inputs
         if len(texts) != len(embeddings) or len(texts) != len(cluster_assignments):
-            raise ValueError(
+            raise EvaluationError(
                 "Length mismatch: texts, embeddings, and cluster_assignments "
-                "must have the same length"
+                f"must have the same length, got {len(texts)}, {len(embeddings)}, "
+                f"and {len(cluster_assignments)} respectively",
             )
 
         logger.info(
@@ -191,7 +193,7 @@ def calculate_silhouette_score(self) -> float:
             is not possible
         """
         # Count samples per cluster
-        cluster_counts = {}
+        cluster_counts: dict[int, int] = {}
         for cluster_id in self.cluster_assignments:
             cluster_counts[cluster_id] = cluster_counts.get(cluster_id, 0) + 1
 
@@ -294,6 +296,7 @@ def calculate_similarity_metrics(
 
             # Calculate intra-cluster similarities
             intra_sims = []
+            # TODO: The process dies at this point on an input file of 170000 lines.
             for cluster_indices in valid_clusters.values():
                 cluster_embeddings = self.embeddings[cluster_indices]
                 sim_matrix = cosine_similarity(cluster_embeddings)
@@ -379,7 +382,7 @@ def detect_powerlaw_distribution(self) -> dict[str, Any]:
         }
 
         try:
-            import powerlaw
+            import powerlaw  # type: ignore
 
             # 1. Get cluster sizes
             cluster_sizes = []
diff --git a/clusx/visualization.py b/clusx/visualization.py
@@ -15,12 +15,14 @@
 import numpy as np
 from matplotlib import colormaps
 
+from clusx.errors import VisualizationError
+from clusx.logging import get_logger
+
 if TYPE_CHECKING:
     from typing import Any
+
     from matplotlib.axes import Axes
 
-from .errors import VisualizationError
-from .logging import get_logger
 
 logger = get_logger(__name__)
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -31,7 +31,7 @@
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ["_templates"]
-exclude_patterns = []
+exclude_patterns: list[str] = []
 
 # -- Options for nitpick -----------------------------------------------------
 
diff --git a/tests/load/profile-4.txt b/tests/load/profile-4.txt
@@ -0,0 +1,5 @@
+--dp-alpha 10 --dp-kappa 5 --pyp-alpha 5 --pyp-kappa 10 --pyp-sigma 0.5
+--dp-alpha 15 --dp-kappa 10 --pyp-alpha 12 --pyp-kappa 10 --pyp-sigma 0.7
+--dp-alpha 25 --dp-kappa 8 --pyp-alpha 20 --pyp-kappa 8 --pyp-sigma 0.8
+--dp-alpha 18 --dp-kappa 12 --pyp-alpha 15 --pyp-kappa 12 --pyp-sigma 0.7
+--dp-alpha 30 --dp-kappa 6 --pyp-alpha 20 --pyp-kappa 6 --pyp-sigma 0.9