Correct type annotations in clustering models; abbreviate "Dirichlet Process" as DP in docs

sergeyklay · sergeyklay · commit a520288c5f5d · 2025-03-17T09:05:11.000+01:00
diff --git a/clusx/clustering/models.py b/clusx/clustering/models.py
@@ -374,9 +374,9 @@ def _calculate_cluster_probabilities(
         scores.append(prior_new + new_cluster_likelihood)
 
         # Convert log scores to probabilities
-        scores = np.array(scores)
-        scores -= logsumexp(scores)  # type: ignore
-        probabilities = np.exp(scores)  # type: np.ndarray
+        scores_array = np.array(scores)
+        scores_array -= logsumexp(scores_array)  # type: ignore
+        probabilities = np.exp(scores_array)
 
         # Add placeholder for new cluster ID
         extended_cluster_ids = cluster_ids + [None]  # None represents new cluster
@@ -424,15 +424,14 @@ def _create_or_update_cluster(
 
         # Update existing cluster
         assert existing_cluster_id is not None
-        cid = existing_cluster_id
-        params = self.cluster_params[cid]
+        params = self.cluster_params[existing_cluster_id]
         params["count"] += 1
-        params["mean"] = self._normalize(
-            params["mean"] * (params["count"] - 1) + embedding
-        )
-        self.clusters.append(cid)
 
-        return cid
+        result = (params["mean"] * (params["count"] - 1) + embedding).astype(np.float32)
+        params["mean"] = self._normalize(result)
+        self.clusters.append(existing_cluster_id)
+
+        return existing_cluster_id
 
     def assign_cluster(self, embedding: NDArray[np.float32]) -> tuple[int, np.ndarray]:
         """
@@ -777,7 +776,7 @@ def log_pyp_prior(self, cluster_id: Optional[int] = None) -> float:
 
         # Prior for an existing cluster: (n_k - sigma) / (n + alpha)
         assert "count" in self.cluster_params[cluster_id]
-        count = self.cluster_params[cluster_id]["count"]
+        count = int(self.cluster_params[cluster_id]["count"])
         numerator = count - self.sigma
 
         # If numerator is negative or zero, use a small positive value
@@ -837,9 +836,9 @@ def _calculate_cluster_probabilities(
         scores.append(prior_new + new_cluster_likelihood)
 
         # Convert log scores to probabilities
-        scores = np.array(scores)
-        scores -= logsumexp(scores)  # type: ignore
-        probabilities = np.exp(scores)
+        scores_array = np.array(scores)
+        scores_array -= logsumexp(scores_array)  # type: ignore
+        probabilities = np.exp(scores_array)
 
         # Add placeholder for new cluster ID
         extended_cluster_ids = cluster_ids + [None]  # None represents new cluster
diff --git a/docs/source/methodological_framework.rst b/docs/source/methodological_framework.rst
@@ -12,11 +12,11 @@ This section documents the design and implementation of the nonparametric Bayesi
 Dirichlet Process Clustering
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Clusterium implements text clustering using the Dirichlet Process (DP), a fundamental nonparametric Bayesian model that allows for a flexible, potentially infinite number of clusters. Unlike traditional clustering algorithms that require pre-specifying the number of clusters (e.g., K-means), the Dirichlet Process automatically determines the appropriate number of clusters based on the data. The theoretical foundations for this approach were established by Ferguson [1]_.
+Clusterium implements text clustering using the Dirichlet Process (DP), a fundamental nonparametric Bayesian model that allows for a flexible, potentially infinite number of clusters. Unlike traditional clustering algorithms that require pre-specifying the number of clusters (e.g., K-means), the DP automatically determines the appropriate number of clusters based on the data. The theoretical foundations for this approach were established by Ferguson [1]_.
 
 **Mathematical Foundation:**
 
-In Clusterium's implementation, the Dirichlet Process is realized through the Chinese Restaurant Process (CRP) formulation. The prior probability of a document joining an existing cluster or creating a new one follows:
+In Clusterium's implementation, the DP is realized through the Chinese Restaurant Process (CRP) formulation. The prior probability of a document joining an existing cluster or creating a new one follows:
 
 .. math::
 
@@ -61,7 +61,7 @@ These properties make vMF particularly suitable for clustering in high-dimension
 
 **Algorithm Overview:**
 
-The Dirichlet Process clustering algorithm in Clusterium follows these key steps:
+The DP clustering algorithm in Clusterium follows these key steps:
 
 1. **Embedding Generation**: Transform documents into normalized vector representations using a pretrained language model.
 
@@ -91,7 +91,7 @@ Clusterium's implementation includes several important design decisions that aff
 
 **Stochastic Properties and Document Order Sensitivity:**
 
-A critical aspect of the Dirichlet Process implementation is its sequential, stochastic nature. Since documents are processed one at a time following the Chinese Restaurant Process, several important properties emerge:
+A critical aspect of the DP implementation is its sequential, stochastic nature. Since documents are processed one at a time following the Chinese Restaurant Process, several important properties emerge:
 
 1. **Order Dependency**: The final clustering outcome is sensitive to the order in which documents are processed. This sensitivity arises because:
 
@@ -116,7 +116,7 @@ To mitigate order dependency in production applications, randomly shuffling docu
 
 **Parameter Tuning:**
 
-The Dirichlet Process clustering model is governed by two key parameters that significantly influence clustering behavior from an academic perspective:
+From a theoretical perspective, the DP clustering model is governed by two key parameters that significantly influence clustering behavior:
 
 1. **Alpha (α)**: The concentration parameter that controls cluster proliferation.
 
@@ -135,6 +135,8 @@ The interaction between these parameters creates distinct clustering profiles. F
 Pitman-Yor Process Clustering
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+Clustering with the Pitman-Yor Process (PYP) is generally better suited to text data than DP clustering, because it can model the power-law distributions common in natural language.
+
 .. note::
 
    This section is currently under development and will be added in a future update.