|
import logging
from typing import Dict, Optional

import numpy as np
from sklearn.neighbors import NearestNeighbors

from pyeyesweb.data_models.sliding_window import SlidingWindow
| 6 | + |
| 7 | + |
class Clusterability:
    """
    Compute a clusterability metric based on the Hopkins statistic.

    Clusterability measures how strongly a dataset tends to form clusters
    rather than being randomly distributed.

    Parameters
    ----------
    n_neighbors : int
        Number of nearest neighbors requested from the neighbor search used in
        the Hopkins statistic computation. Values below 2 are raised to 2,
        because each data point's own zero distance occupies the first slot.
    random_state : int, optional
        Seed for the uniform reference sample. When None (default), NumPy's
        global random state is used.

    Notes
    -----
    The Hopkins statistic is a commonly used measure of clusterability.
    It compares the distances of points in the dataset to their nearest
    neighbors with distances from uniformly distributed random points to their
    nearest neighbors in the dataset. Only the single nearest-neighbor
    distance of each point enters the statistic.

    If points are aggregated, Clusterability approaches 1, whereas a value
    close to 0.5 suggests randomness.

    Read more in the [User Guide](/PyEyesWeb/user_guide/theoretical_framework/analysis_primitives/clusterability/)

    References
    ----------
    Lawson, R. G., & Jurs, P. C. (1990). New index for clustering tendency and
    its application to chemical problems. Journal of chemical information and
    computer sciences, 30(1), 36-41.
    """

    def __init__(self, n_neighbors: int, random_state: Optional[int] = None) -> None:
        """
        Initialize the Clusterability object.

        Parameters
        ----------
        n_neighbors : int
            Number of nearest neighbors to use in the Hopkins statistic
            computation.
        random_state : int, optional
            Random seed for reproducibility. Default is None.
        """
        self.n_neighbors = n_neighbors
        self.random_state = random_state
        # A dedicated generator is created only when a seed is supplied;
        # otherwise sampling keeps drawing from NumPy's global random state.
        self._rng = None if random_state is None else np.random.default_rng(random_state)

    def compute_hopkins_statistic(self, data: np.ndarray) -> float:
        """
        Compute the Hopkins statistic for a given dataset.

        Parameters
        ----------
        data : np.ndarray
            Input data of shape (n_samples, n_features). A 1-D array is
            treated as n_samples observations of a single feature.

        Returns
        -------
        float
            Hopkins statistic value. Returns NaN if fewer than 5 samples are
            given, if the array is not 2-D (after 1-D promotion), if it has no
            features, or if it contains NaN or infinite values.
        """
        data = np.asarray(data, dtype=float)
        if data.ndim == 1:
            data = data.reshape(-1, 1)
        if data.ndim != 2 or data.shape[0] < 5 or data.shape[1] == 0:
            return np.nan
        # The neighbor search rejects non-finite input; report it as invalid
        # data instead of raising.
        if not np.isfinite(data).all():
            return np.nan

        # Uniform reference sample drawn inside the per-feature bounding box,
        # with the same shape as the data.
        mins = data.min(axis=0)
        maxs = data.max(axis=0)
        draw_uniform = np.random.uniform if self._rng is None else self._rng.uniform
        uniform_sample = draw_uniform(mins, maxs, size=data.shape)

        # At least two neighbors are needed so that column 1 (the nearest
        # *other* point) exists; never request more neighbors than samples.
        n_neighbors = min(data.shape[0], max(2, self.n_neighbors))
        neighbors = NearestNeighbors(n_neighbors=n_neighbors).fit(data)

        # Distances from data points to their nearest neighbors.
        data_distances, _ = neighbors.kneighbors(data)
        u = np.sum(data_distances[:, 1])  # column 0 is the self-distance (0)

        # Distances from uniform sample points to their nearest data point.
        uniform_distances, _ = neighbors.kneighbors(uniform_sample)
        w = np.sum(uniform_distances[:, 0])

        # The small epsilon avoids 0/0 when all points coincide.
        return float(w / (u + w + 1e-10))

    def compute_clusterability(self, signals: SlidingWindow) -> Dict[str, float]:
        """
        Compute the clusterability of a sliding window of signals using the Hopkins statistic.

        Parameters
        ----------
        signals : SlidingWindow
            A sliding window object containing signal data.

        Returns
        -------
        dict
            Dictionary containing:
            - 'hopkins_statistic' (float): Computed Hopkins statistic.
              NaN if the window is not full or the computation fails.
        """
        if not signals.is_full():
            return {"hopkins_statistic": np.nan}

        try:
            # to_array() yields a pair; only its first element is used here.
            data, _ = signals.to_array()
            hopkins_value = self.compute_hopkins_statistic(data)
        except Exception:
            # Best-effort contract: never propagate, but keep the failure
            # traceable for anyone who enables debug logging.
            logging.getLogger(__name__).debug(
                "Hopkins statistic computation failed; returning NaN.",
                exc_info=True,
            )
            hopkins_value = np.nan

        return {"hopkins_statistic": hopkins_value}

    def __call__(self, sliding_window: SlidingWindow) -> Dict[str, float]:
        """
        Callable interface to compute clusterability directly on a SlidingWindow instance.

        Parameters
        ----------
        sliding_window : SlidingWindow
            The sliding window object containing the data.

        Returns
        -------
        dict
            Output of `compute_clusterability`.
        """
        return self.compute_clusterability(sliding_window)
0 commit comments