@@ -104,7 +104,10 @@ def compute_distances(self, x1, x2=None):
104104 return self .distance_by_cols (x1 , self .fit_params )
105105 else :
106106 return self .distance_by_rows (
107- x1 , x2 if x2 is not None else x1 , self .fit_params )
107+ x1 ,
108+ x2 if x2 is not None else x1 ,
109+ x2 is not None ,
110+ self .fit_params )
108111
109112
110113class FittedDistance (Distance ):
@@ -121,6 +124,13 @@ def fit(self, data):
121124 # pylint: disable=not-callable
122125 return self .ModelType (attributes , axis = self .axis , fit_params = fit_params )
123126
def fit_cols(self, x, n_vals):
    """Reject column-wise fitting when any column is flagged in `n_vals`.

    Subclasses visible in this file call this through ``super()`` before
    computing their own column statistics.

    Args:
        x: the data to fit; not inspected here.
        n_vals: iterable with one entry per column; a truthy entry marks
            the column as discrete (presumably its number of values --
            confirm against the caller).

    Raises:
        ValueError: if at least one entry of `n_vals` is truthy.
    """
    for n_values in n_vals:
        if n_values:
            raise ValueError(
                "columns with discrete values are incommensurable")
130+
def fit_rows(self, x, n_vals):
    """Do nothing; row-wise fitting has no precondition to check.

    This exists so that subclasses can uniformly call
    ``super().fit_rows(x, n_vals)`` before their own fitting code.

    Args:
        x: the data to fit; ignored.
        n_vals: per-column descriptors; ignored.

    Returns:
        None.
    """
    return None
133+
124134
125135class EuclideanModel (FittedDistanceModel ):
126136 name = "Euclidean"
@@ -137,6 +147,7 @@ def __new__(cls, *args, **kwargs):
137147 return super ().__new__ (cls , * args , ** kwargs )
138148
139149 def fit_rows (self , x , n_vals ):
150+ super ().fit_rows (x , n_vals )
140151 n_cols = len (n_vals )
141152 n_bins = max (n_vals )
142153 means = np .zeros (n_cols , dtype = float )
@@ -171,9 +182,7 @@ def fit_rows(self, x, n_vals):
171182 normalize = int (self .normalize ))
172183
173184 def fit_cols (self , x , n_vals ):
174- if any (n_vals ):
175- raise ValueError (
176- "columns with discrete values are not commensurate" )
185+ super ().fit_cols (x , n_vals )
177186 means = np .nanmean (x , axis = 0 )
178187 vars = np .nanvar (x , axis = 0 )
179188 if np .isnan (vars ).any () or not vars .all ():
@@ -196,6 +205,7 @@ def __new__(cls, *args, **kwargs):
196205 return super ().__new__ (cls , * args , ** kwargs )
197206
198207 def fit_rows (self , x , n_vals ):
208+ super ().fit_rows (x , n_vals )
199209 n_cols = len (n_vals )
200210 n_bins = max (n_vals )
201211
@@ -227,9 +237,7 @@ def fit_rows(self, x, n_vals):
227237 normalize = int (self .normalize ))
228238
229239 def fit_cols (self , x , n_vals ):
230- if any (n_vals ):
231- raise ValueError (
232- "columns with discrete values are not commensurate" )
240+ super ().fit_cols (x , n_vals )
233241 medians = np .nanmedian (x , axis = 0 )
234242 mads = np .nanmedian (np .abs (x - medians ), axis = 0 )
235243 if np .isnan (mads ).any () or not mads .all ():
@@ -239,6 +247,49 @@ def fit_cols(self, x, n_vals):
239247 return dict (medians = medians , mads = mads , normalize = int (self .normalize ))
240248
241249
class CosineModel(FittedDistanceModel):
    """Fitted model that delegates cosine distances to ``_distance``.

    The class only binds the two low-level routines; presumably the
    inherited ``compute_distances`` picks one of them depending on the
    axis -- confirm against ``FittedDistanceModel``.
    """

    # Low-level implementations for the row-wise and column-wise cases.
    distance_by_cols = _distance.cosine_cols
    distance_by_rows = _distance.cosine_rows

    # Flag value as declared here; sparse input is not supported.
    supports_sparse = False
254+
255+
class Cosine(FittedDistance):
    """Cosine distance; fitting collects per-column statistics.

    ``fit_rows`` (also bound as ``fit_cols``) builds the parameter
    dictionary; ``CosineModel`` is the associated model type.
    """

    ModelType = CosineModel

    def __new__(cls, *args, **kwargs):
        # Construct with normalize=False unless the caller passes a value.
        kwargs.setdefault("normalize", False)
        return super().__new__(cls, *args, **kwargs)

    def fit_rows(self, x, n_vals):
        """Compute the per-column statistics stored as fit parameters.

        Args:
            x: 2-d numpy array; statistics are computed for each column.
            n_vals: sequence indexable by column; a truthy entry marks the
                column as discrete (presumably its number of values --
                confirm against the caller).

        Returns:
            dict with three float arrays of length ``x.shape[1]``:

            * ``means`` -- for discrete columns, the fraction of non-NaN
              entries that are nonzero; for other columns, the NaN-ignoring
              mean; 0 where nothing could be computed.
            * ``vars`` -- -1 for discrete columns, -2 for columns that are
              all-NaN or have zero variance, otherwise the NaN-ignoring
              variance.
            * ``dist_missing2`` -- the mean for discrete columns, the
              squared mean for other columns, 0 if that value is NaN.

            How the -1 / -2 markers are interpreted is decided by the
            ``_distance`` routines, which are not visible here.
        """
        super().fit_rows(x, n_vals)
        n_rows, n_cols = x.shape
        means = np.zeros(n_cols, dtype=float)
        # Named `variances` locally so the builtin `vars` is not shadowed;
        # the returned dictionary still uses the key "vars".
        variances = np.empty(n_cols, dtype=float)
        dist_missing2 = np.zeros(n_cols, dtype=float)

        for col in range(n_cols):
            column = x[:, col]
            if n_vals[col]:
                variances[col] = -1
                n_known = n_rows - np.sum(np.isnan(column))
                # Without any known value the ratio below would be 0 / 0,
                # giving a RuntimeWarning and a NaN mean; keep the zeros
                # the arrays were initialized with instead.
                if n_known:
                    means[col] = 1 - np.sum(column == 0) / n_known
                    dist_missing2[col] = means[col]
            elif np.isnan(column).all():
                # Skip nanmean/nanvar here: they warn on all-NaN input.
                variances[col] = -2
            else:
                means[col] = util.nanmean(column)
                variances[col] = util.nanvar(column)
                if variances[col] == 0:
                    variances[col] = -2
                dist_missing2[col] = means[col] ** 2
            if np.isnan(dist_missing2[col]):
                dist_missing2[col] = 0

        return dict(means=means, vars=variances, dist_missing2=dist_missing2)

    # The same statistics are used when distances are computed by columns.
    fit_cols = fit_rows
291+
292+
242293class JaccardModel (FittedDistanceModel ):
243294 supports_sparse = False
244295 distance_by_cols = _distance .jaccard_cols
@@ -248,17 +299,14 @@ class JaccardModel(FittedDistanceModel):
248299class Jaccard (FittedDistance ):
249300 ModelType = JaccardModel
250301 name = "Jaccard"
251- fit_rows = fit_cols = _distance .fit_jaccard
252-
253-
254- class CosineModel (EuclideanModel ):
255- def compute_distances (self , x1 , x2 = None ):
256- return 1 - np .cos (1 - super ().compute_distances (x1 , x2 ))
257302
def fit_rows(self, x, n_vals):
    """Collect one ``_distance.p_nonzero`` result per column of `x`.

    Args:
        x: 2-d array; column ``col`` is passed as ``x[:, col]``.
        n_vals: per-column sequence; only its length is used, as the
            number of columns to process.

    Returns:
        dict with the single key ``"ps"`` holding a double-precision array
        of length ``len(n_vals)``.
    """
    n_cols = len(n_vals)
    ps = np.empty(n_cols, dtype=np.double)
    for col in range(n_cols):
        ps[col] = _distance.p_nonzero(x[:, col])
    return {"ps": ps}
258308
259- class Cosine (Euclidean ):
260- ModelType = CosineModel
261- name = "Cosine"
309+ fit_cols = fit_rows
262310
263311
264312class SpearmanDistance (Distance ):
0 commit comments