Skip to content

Commit 577e351

Browse files
authored
Merge pull request #362 from BiAPoL/handle-NaNs-correctly
Handle NaNs correctly
2 parents aa9e251 + 468b329 commit 577e351

File tree

2 files changed

+94
-17
lines changed

2 files changed

+94
-17
lines changed

src/napari_clusters_plotter/_algorithms.py

Lines changed: 91 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,23 @@ def _reduce_pca(
1616
from sklearn.decomposition import PCA
1717
from sklearn.preprocessing import StandardScaler
1818

19+
# Remove NaN rows
20+
non_nan_data = data.dropna()
21+
1922
if scale:
20-
data = StandardScaler().fit_transform(data.values)
23+
preprocessed = StandardScaler().fit_transform(non_nan_data.values)
2124
else:
22-
data = data.values
25+
preprocessed = non_nan_data.values
2326

2427
pca = PCA(n_components=n_components)
25-
pca.fit(data)
26-
return pca.transform(data)
28+
pca.fit(preprocessed)
29+
reduced_data = pca.transform(preprocessed)
30+
31+
# Add NaN rows back
32+
result = pd.DataFrame(index=data.index, columns=range(n_components))
33+
result.loc[non_nan_data.index] = reduced_data
34+
35+
return result
2736

2837
return _reduce_pca(data, n_components, scale)
2938

@@ -45,11 +54,21 @@ def _reduce_tsne(
4554
from sklearn.manifold import TSNE
4655
from sklearn.preprocessing import StandardScaler
4756

48-
print("working on tsne")
57+
# Remove NaN rows
58+
non_nan_data = data.dropna()
59+
4960
if scale:
50-
data = StandardScaler().fit_transform(data)
61+
preprocessed = StandardScaler().fit_transform(non_nan_data)
62+
else:
63+
preprocessed = non_nan_data.values
5164
tsne = TSNE(n_components=n_components, perplexity=perplexity)
52-
return tsne.fit_transform(data)
65+
reduced_data = tsne.fit_transform(preprocessed)
66+
67+
# Add NaN rows back
68+
result = pd.DataFrame(index=data.index, columns=range(n_components))
69+
result.loc[non_nan_data.index] = reduced_data
70+
71+
return result
5372

5473
return _reduce_tsne(data, n_components, perplexity, scale)
5574

@@ -71,11 +90,22 @@ def _reduce_umap(
7190
import umap
7291
from sklearn.preprocessing import StandardScaler
7392

93+
# Remove NaN rows
94+
non_nan_data = data.dropna()
95+
7496
if scale:
75-
data = StandardScaler().fit_transform(data)
97+
preprocessed = StandardScaler().fit_transform(non_nan_data)
98+
else:
99+
preprocessed = non_nan_data.values
76100

77101
reducer = umap.UMAP(n_components=n_components, n_neighbors=n_neighbors)
78-
return reducer.fit_transform(data)
102+
reduced_data = reducer.fit_transform(preprocessed)
103+
104+
# Add NaN rows back
105+
result = pd.DataFrame(index=data.index, columns=range(n_components))
106+
result.loc[non_nan_data.index] = reduced_data
107+
108+
return result
79109

80110
return _reduce_umap(data, n_components, n_neighbors, scale)
81111

@@ -94,11 +124,22 @@ def _cluster_kmeans(
94124
from sklearn.cluster import KMeans
95125
from sklearn.preprocessing import StandardScaler
96126

127+
# Remove NaN rows
128+
non_nan_data = data.dropna()
129+
97130
if scale:
98-
data = StandardScaler().fit_transform(data)
131+
preprocessed = StandardScaler().fit_transform(non_nan_data)
132+
else:
133+
preprocessed = non_nan_data.values
99134

100135
kmeans = KMeans(n_clusters=n_clusters)
101-
return kmeans.fit_predict(data)
136+
clusters = kmeans.fit_predict(preprocessed)
137+
138+
# Add NaN rows back
139+
result = pd.Series(index=data.index, dtype=int)
140+
result.loc[non_nan_data.index] = clusters
141+
142+
return result
102143

103144
return _cluster_kmeans(data, n_clusters, scale)
104145

@@ -123,13 +164,24 @@ def _cluster_hdbscan(
123164
from sklearn.cluster import HDBSCAN
124165
from sklearn.preprocessing import StandardScaler
125166

167+
# Remove NaN rows
168+
non_nan_data = data.dropna()
169+
126170
if scale:
127-
data = StandardScaler().fit_transform(data)
171+
preprocessed = StandardScaler().fit_transform(non_nan_data)
172+
else:
173+
preprocessed = non_nan_data.values
128174

129175
clusterer = HDBSCAN(
130176
min_cluster_size=min_cluster_size, min_samples=min_samples
131177
)
132-
return clusterer.fit_predict(data)
178+
clusters = clusterer.fit_predict(preprocessed)
179+
180+
# Add NaN rows back
181+
result = pd.Series(index=data.index, dtype=int)
182+
result.loc[non_nan_data.index] = clusters
183+
184+
return result
133185

134186
return _cluster_hdbscan(data, min_cluster_size, min_samples, scale)
135187

@@ -148,11 +200,22 @@ def _cluster_gaussian_mixture(
148200
from sklearn.mixture import GaussianMixture
149201
from sklearn.preprocessing import StandardScaler
150202

203+
# Remove NaN rows
204+
non_nan_data = data.dropna()
205+
151206
if scale:
152-
data = StandardScaler().fit_transform(data)
207+
preprocessed = StandardScaler().fit_transform(non_nan_data)
208+
else:
209+
preprocessed = non_nan_data.values
153210

154211
gmm = GaussianMixture(n_components=n_components)
155-
return gmm.fit_predict(data)
212+
clusters = gmm.fit_predict(preprocessed)
213+
214+
# Add NaN rows back
215+
result = pd.Series(index=data.index, dtype=int)
216+
result.loc[non_nan_data.index] = clusters
217+
218+
return result
156219

157220
return _cluster_gaussian_mixture(data, n_components, scale)
158221

@@ -171,10 +234,21 @@ def _cluster_spectral(
171234
from sklearn.cluster import SpectralClustering
172235
from sklearn.preprocessing import StandardScaler
173236

237+
# Remove NaN rows
238+
non_nan_data = data.dropna()
239+
174240
if scale:
175-
data = StandardScaler().fit_transform(data)
241+
preprocessed = StandardScaler().fit_transform(non_nan_data)
242+
else:
243+
preprocessed = non_nan_data.values
176244

177245
clusterer = SpectralClustering(n_clusters=n_clusters)
178-
return clusterer.fit_predict(data)
246+
clusters = clusterer.fit_predict(preprocessed)
247+
248+
# Add NaN rows back
249+
result = pd.Series(index=data.index, dtype=int)
250+
result.loc[non_nan_data.index] = clusters
251+
252+
return result
179253

180254
return _cluster_spectral(data, n_clusters, scale)

src/napari_clusters_plotter/_tests/test_dimensionality_reduction.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ def create_points(n_samples=100, loc=5):
3838
}
3939
)
4040

41+
# add some NaNs
42+
features.iloc[::10] = np.nan
43+
4144
layer = Points(points, features=features, size=0.1)
4245

4346
return layer

0 commit comments

Comments (0)