Skip to content

Commit eff0ffc

Browse files
Merge branch 'Foysal440-Clusterability'
2 parents b7c8bf0 + 56ade52 commit eff0ffc

1 file changed

Lines changed: 130 additions & 0 deletions

File tree

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
import logging
from typing import Dict, Optional

import numpy as np
from sklearn.neighbors import NearestNeighbors

from pyeyesweb.data_models.sliding_window import SlidingWindow
6+
7+
8+
class Clusterability:
    """
    Compute a clusterability metric based on the Hopkins statistic.

    Clusterability measures how strongly a dataset tends to form clusters
    rather than being randomly distributed.

    Parameters
    ----------
    n_neighbors : int
        Number of nearest neighbors requested from the neighbor search used
        in the Hopkins statistic computation. Only the nearest-neighbor
        distance enters the statistic, so the value is clamped to the range
        ``[2, n_samples]`` at computation time.
    random_state : int, optional
        Seed for the uniform reference sample. When None (default), samples
        are drawn from NumPy's global random state.

    Notes
    -----
    The Hopkins statistic is a commonly used measure of clusterability.
    It compares the distances of points in the dataset to their nearest
    neighbors with distances from uniformly distributed random points to
    their nearest neighbors in the dataset.

    If points are aggregated, Clusterability approaches 1, whereas a value
    close to 0.5 suggests randomness.

    Read more in the [User Guide](/PyEyesWeb/user_guide/theoretical_framework/analysis_primitives/clusterability/)

    References
    ----------
    Lawson, R. G., & Jurs, P. C. (1990). New index for clustering tendency
    and its application to chemical problems. Journal of chemical
    information and computer sciences, 30(1), 36-41.
    """

    def __init__(self, n_neighbors: int, random_state: Optional[int] = None) -> None:
        """
        Initialize the Clusterability object.

        Parameters
        ----------
        n_neighbors : int
            Number of nearest neighbors to use in the Hopkins statistic
            computation.
        random_state : int, optional
            Random seed for reproducibility. Default is None, in which case
            NumPy's global random state is used.
        """
        self.n_neighbors = n_neighbors
        self.random_state = random_state
        # A seeded Generator makes the sequence of reference samples
        # reproducible; without a seed, keep drawing from the global state
        # so callers relying on ``np.random.seed`` see no change.
        if random_state is None:
            self._rng = np.random
        else:
            self._rng = np.random.default_rng(random_state)

    def compute_hopkins_statistic(self, data: np.ndarray) -> float:
        """
        Compute the Hopkins statistic for a given dataset.

        Parameters
        ----------
        data : np.ndarray
            Input data of shape (n_samples, n_features).

        Returns
        -------
        float
            Hopkins statistic value ``w / (u + w)``, where ``u`` sums the
            distance from each data point to its nearest other data point
            and ``w`` sums the distance from each uniform reference point to
            its nearest data point. Returns NaN if data is insufficient
            (fewer than 5 samples) or invalid (not convertible to a float
            array, not 2-D, no features, or containing NaN/inf).
        """
        try:
            points = np.asarray(data, dtype=float)
        except (TypeError, ValueError):
            return np.nan

        if points.ndim != 2 or points.shape[0] < 5 or points.shape[1] == 0:
            return np.nan
        if not np.all(np.isfinite(points)):
            return np.nan

        # Uniform reference sample drawn inside the bounding box of the data,
        # with as many points as the dataset itself.
        lower = points.min(axis=0)
        upper = points.max(axis=0)
        uniform_sample = self._rng.uniform(lower, upper, size=points.shape)

        # At least two neighbors are required: when querying the fitted
        # points themselves, the first neighbor is the point itself.
        k = min(points.shape[0], max(2, self.n_neighbors))
        index = NearestNeighbors(n_neighbors=k).fit(points)

        # Distances from data points to their nearest neighbors.
        data_distances, _ = index.kneighbors(points)
        u = np.sum(data_distances[:, 1])  # exclude self-distance (0)

        # Distances from uniform sample points to their nearest data point.
        uniform_distances, _ = index.kneighbors(uniform_sample)
        w = np.sum(uniform_distances[:, 0])

        # The small epsilon guards against division by zero when every
        # distance is zero (e.g. all points identical).
        return float(w / (u + w + 1e-10))

    def compute_clusterability(self, signals: "SlidingWindow") -> Dict[str, float]:
        """
        Compute the clusterability of a sliding window of signals using the
        Hopkins statistic.

        Parameters
        ----------
        signals : SlidingWindow
            A sliding window object containing signal data.

        Returns
        -------
        dict
            Dictionary containing:
            - 'hopkins_statistic' (float): Computed Hopkins statistic.
              NaN if the window is not full or computation fails.
        """
        if not signals.is_full():
            return {"hopkins_statistic": np.nan}

        try:
            data, _ = signals.to_array()
            hopkins_value = self.compute_hopkins_statistic(data)
        except Exception:
            # Best-effort metric: report the failure instead of hiding it,
            # but keep returning NaN so the caller's loop does not break.
            logging.getLogger(__name__).warning(
                "Hopkins statistic computation failed; returning NaN",
                exc_info=True,
            )
            hopkins_value = np.nan

        return {"hopkins_statistic": hopkins_value}

    def __call__(self, sliding_window: "SlidingWindow") -> Dict[str, float]:
        """
        Callable interface to compute clusterability directly on a
        SlidingWindow instance.

        Parameters
        ----------
        sliding_window : SlidingWindow
            The sliding window object containing the data.

        Returns
        -------
        dict
            Output of `compute_clusterability`.
        """
        return self.compute_clusterability(sliding_window)

0 commit comments

Comments
 (0)