def _reduce_pca(data, n_components, scale):
    """Project ``data`` onto its first ``n_components`` principal components.

    Rows containing any NaN are excluded from the fit/transform and come
    back as all-NaN rows, so the result's index matches ``data.index``.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric feature matrix; may contain rows with NaN.
    n_components : int
        Number of principal components to keep.
    scale : bool
        If True, standardize features (zero mean, unit variance) first.

    Returns
    -------
    pd.DataFrame
        Shape ``(len(data), n_components)``, indexed like ``data``;
        dropped rows are NaN.
    """
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    # PCA cannot handle missing values: fit only on complete rows.
    non_nan_data = data.dropna()

    if scale:
        preprocessed = StandardScaler().fit_transform(non_nan_data.values)
    else:
        preprocessed = non_nan_data.values

    # fit_transform does the projection in one pass instead of the
    # redundant fit() followed by transform() over the same matrix.
    reduced_data = PCA(n_components=n_components).fit_transform(preprocessed)

    # Build the frame from the reduced array, then reindex to reinstate the
    # dropped rows as NaN. (Pre-allocating an empty frame and assigning via
    # .loc leaves the columns with object dtype; this keeps them float.)
    result = pd.DataFrame(
        reduced_data, index=non_nan_data.index, columns=range(n_components)
    )
    return result.reindex(data.index)
2736
2837 return _reduce_pca (data , n_components , scale )
2938
def _reduce_tsne(data, n_components, perplexity, scale):
    """Embed ``data`` into ``n_components`` dimensions with t-SNE.

    Rows containing any NaN are excluded from the embedding and come back
    as all-NaN rows, so the result's index matches ``data.index``.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric feature matrix; may contain rows with NaN.
    n_components : int
        Dimensionality of the embedding.
    perplexity : float
        t-SNE perplexity (roughly, the effective number of neighbors).
    scale : bool
        If True, standardize features (zero mean, unit variance) first.

    Returns
    -------
    pd.DataFrame
        Shape ``(len(data), n_components)``, indexed like ``data``;
        dropped rows are NaN. Note t-SNE is stochastic unless a seed is
        fixed by the caller's sklearn configuration.
    """
    from sklearn.manifold import TSNE
    from sklearn.preprocessing import StandardScaler

    # t-SNE cannot handle missing values: embed only complete rows.
    non_nan_data = data.dropna()

    if scale:
        # Use .values in both branches so the scaler sees a plain ndarray,
        # consistent with the unscaled path (avoids feature-name warnings).
        preprocessed = StandardScaler().fit_transform(non_nan_data.values)
    else:
        preprocessed = non_nan_data.values

    tsne = TSNE(n_components=n_components, perplexity=perplexity)
    reduced_data = tsne.fit_transform(preprocessed)

    # Construct from the array, then reindex to restore dropped rows as
    # NaN while keeping a float dtype (empty-frame + .loc would be object).
    result = pd.DataFrame(
        reduced_data, index=non_nan_data.index, columns=range(n_components)
    )
    return result.reindex(data.index)
5372
5473 return _reduce_tsne (data , n_components , perplexity , scale )
5574
def _reduce_umap(data, n_components, n_neighbors, scale):
    """Embed ``data`` into ``n_components`` dimensions with UMAP.

    Rows containing any NaN are excluded from the embedding and come back
    as all-NaN rows, so the result's index matches ``data.index``.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric feature matrix; may contain rows with NaN.
    n_components : int
        Dimensionality of the embedding.
    n_neighbors : int
        UMAP neighborhood size controlling local/global balance.
    scale : bool
        If True, standardize features (zero mean, unit variance) first.

    Returns
    -------
    pd.DataFrame
        Shape ``(len(data), n_components)``, indexed like ``data``;
        dropped rows are NaN.
    """
    import umap
    from sklearn.preprocessing import StandardScaler

    # UMAP cannot handle missing values: embed only complete rows.
    non_nan_data = data.dropna()

    if scale:
        # .values in both branches keeps the scaler input a plain ndarray,
        # matching the unscaled path.
        preprocessed = StandardScaler().fit_transform(non_nan_data.values)
    else:
        preprocessed = non_nan_data.values

    reducer = umap.UMAP(n_components=n_components, n_neighbors=n_neighbors)
    reduced_data = reducer.fit_transform(preprocessed)

    # Construct from the array, then reindex to restore dropped rows as
    # NaN while keeping a float dtype (empty-frame + .loc would be object).
    result = pd.DataFrame(
        reduced_data, index=non_nan_data.index, columns=range(n_components)
    )
    return result.reindex(data.index)
79109
80110 return _reduce_umap (data , n_components , n_neighbors , scale )
81111
def _cluster_kmeans(data, n_clusters, scale):
    """Assign each complete row of ``data`` to one of ``n_clusters`` k-means clusters.

    Rows containing any NaN are excluded from clustering and come back as
    NaN labels, so the result's index matches ``data.index``.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric feature matrix; may contain rows with NaN.
    n_clusters : int
        Number of clusters to form.
    scale : bool
        If True, standardize features (zero mean, unit variance) first.

    Returns
    -------
    pd.Series
        Cluster labels indexed like ``data``; rows dropped for NaN are NaN
        (so the dtype is float, since int cannot represent missing values).
    """
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler

    # k-means cannot handle missing values: cluster only complete rows.
    non_nan_data = data.dropna()

    if scale:
        preprocessed = StandardScaler().fit_transform(non_nan_data.values)
    else:
        preprocessed = non_nan_data.values

    kmeans = KMeans(n_clusters=n_clusters)
    clusters = kmeans.fit_predict(preprocessed)

    # pd.Series(index=..., dtype=int) cannot hold the NaN placeholder rows
    # (int has no missing value and modern pandas refuses the NaN fill).
    # Build labels on the complete rows, then reindex to restore dropped
    # rows as NaN.
    labels = pd.Series(clusters, index=non_nan_data.index)
    return labels.reindex(data.index)
102143
103144 return _cluster_kmeans (data , n_clusters , scale )
104145
def _cluster_hdbscan(data, min_cluster_size, min_samples, scale):
    """Cluster the complete rows of ``data`` with HDBSCAN.

    Rows containing any NaN are excluded from clustering and come back as
    NaN labels, so the result's index matches ``data.index``.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric feature matrix; may contain rows with NaN.
    min_cluster_size : int
        Smallest grouping HDBSCAN will consider a cluster.
    min_samples : int
        Neighborhood density parameter; larger values are more conservative.
    scale : bool
        If True, standardize features (zero mean, unit variance) first.

    Returns
    -------
    pd.Series
        Cluster labels indexed like ``data`` (HDBSCAN marks noise as -1);
        rows dropped for NaN are NaN, so the dtype is float.
    """
    from sklearn.cluster import HDBSCAN
    from sklearn.preprocessing import StandardScaler

    # HDBSCAN cannot handle missing values: cluster only complete rows.
    non_nan_data = data.dropna()

    if scale:
        preprocessed = StandardScaler().fit_transform(non_nan_data.values)
    else:
        preprocessed = non_nan_data.values

    clusterer = HDBSCAN(
        min_cluster_size=min_cluster_size, min_samples=min_samples
    )
    clusters = clusterer.fit_predict(preprocessed)

    # An int-dtype Series cannot hold NaN for the dropped rows; build the
    # labels on complete rows and reindex so dropped rows come back as NaN.
    labels = pd.Series(clusters, index=non_nan_data.index)
    return labels.reindex(data.index)
133185
134186 return _cluster_hdbscan (data , min_cluster_size , min_samples , scale )
135187
def _cluster_gaussian_mixture(data, n_components, scale):
    """Assign each complete row of ``data`` to a Gaussian-mixture component.

    Rows containing any NaN are excluded from the fit and come back as NaN
    labels, so the result's index matches ``data.index``.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric feature matrix; may contain rows with NaN.
    n_components : int
        Number of mixture components.
    scale : bool
        If True, standardize features (zero mean, unit variance) first.

    Returns
    -------
    pd.Series
        Component labels indexed like ``data``; rows dropped for NaN are
        NaN, so the dtype is float.
    """
    from sklearn.mixture import GaussianMixture
    from sklearn.preprocessing import StandardScaler

    # GMM cannot handle missing values: fit only on complete rows.
    non_nan_data = data.dropna()

    if scale:
        preprocessed = StandardScaler().fit_transform(non_nan_data.values)
    else:
        preprocessed = non_nan_data.values

    gmm = GaussianMixture(n_components=n_components)
    clusters = gmm.fit_predict(preprocessed)

    # An int-dtype Series cannot hold NaN for the dropped rows; build the
    # labels on complete rows and reindex so dropped rows come back as NaN.
    labels = pd.Series(clusters, index=non_nan_data.index)
    return labels.reindex(data.index)
156219
157220 return _cluster_gaussian_mixture (data , n_components , scale )
158221
def _cluster_spectral(data, n_clusters, scale):
    """Cluster the complete rows of ``data`` with spectral clustering.

    Rows containing any NaN are excluded from clustering and come back as
    NaN labels, so the result's index matches ``data.index``.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric feature matrix; may contain rows with NaN.
    n_clusters : int
        Number of clusters to form.
    scale : bool
        If True, standardize features (zero mean, unit variance) first.

    Returns
    -------
    pd.Series
        Cluster labels indexed like ``data``; rows dropped for NaN are
        NaN, so the dtype is float.
    """
    from sklearn.cluster import SpectralClustering
    from sklearn.preprocessing import StandardScaler

    # Spectral clustering cannot handle missing values: use complete rows.
    non_nan_data = data.dropna()

    if scale:
        preprocessed = StandardScaler().fit_transform(non_nan_data.values)
    else:
        preprocessed = non_nan_data.values

    clusterer = SpectralClustering(n_clusters=n_clusters)
    clusters = clusterer.fit_predict(preprocessed)

    # An int-dtype Series cannot hold NaN for the dropped rows; build the
    # labels on complete rows and reindex so dropped rows come back as NaN.
    labels = pd.Series(clusters, index=non_nan_data.index)
    return labels.reindex(data.index)
179253
180254 return _cluster_spectral (data , n_clusters , scale )
0 commit comments