1- import numba
21import numpy as np
32from scipy .sparse import csr_array
43from sklearn .utils ._param_validation import validate_params
@@ -23,9 +22,11 @@ def rand_binary_similarity(
2322
2423 .. math::
2524
26- sim(a, b) = \frac{|a \cap b |}{n}
25+ sim(x, y) = \frac{a + d}{n}
2726
28- where `n` is the length of vector `a`.
27+ - :math:`a` - both are 1 (:math:`|x \cap y|`, common "on" bits)
28+ - :math:`d` - both are 0 (:math:`|\bar{x} \cap \bar{y}|`, common "off" bits)
29+ - :math:`n` - length of passed vectors
2930
3031 The calculated similarity falls within the range :math:`[0, 1]`.
3132 Passing all-zero vectors to this function results in a similarity of 1.
@@ -63,14 +64,14 @@ def rand_binary_similarity(
6364 >>> from skfp.distances import rand_binary_similarity
6465 >>> import numpy as np
6566 >>> vec_a = np.array([1, 0, 1])
66- >>> vec_b = np.array([1, 0, 1 ])
67+ >>> vec_b = np.array([1, 0, 0 ])
6768 >>> sim = rand_binary_similarity(vec_a, vec_b)
6869 >>> sim
6970 0.6666666666666666
7071
7172 >>> from scipy.sparse import csr_array
7273 >>> vec_a = csr_array([[1, 0, 1]])
73- >>> vec_b = csr_array([[1, 0, 1 ]])
74+ >>> vec_b = csr_array([[1, 0, 0 ]])
7475 >>> sim = rand_binary_similarity(vec_a, vec_b)
7576 >>> sim
7677 0.6666666666666666
@@ -81,16 +82,22 @@ def rand_binary_similarity(
8182 f"got { type (vec_a )} and { type (vec_b )} "
8283 )
8384
84- if isinstance (vec_a , (np .ndarray , list )):
85- num_common = np .sum (np .logical_and (vec_a , vec_b ))
85+ if isinstance (vec_a , list ):
86+ vec_a = np .array (vec_a )
87+ vec_b = np .array (vec_b )
88+
89+ if isinstance (vec_a , np .ndarray ):
90+ a = np .sum (np .logical_and (vec_a , vec_b ))
91+ d = np .sum (np .logical_and (1 - vec_a , 1 - vec_b )) # type: ignore
8692 length = len (vec_a )
8793 else :
88- vec_a_idxs = set (vec_a .indices )
89- vec_b_idxs = set (vec_b .indices )
90- num_common = len (vec_a_idxs & vec_b_idxs )
9194 length = vec_a .shape [1 ]
95+ vec_a_idxs = set (vec_a .indices )
96+ vec_b_idxs = set (vec_b .indices ) # type: ignore
97+ a = len (vec_a_idxs & vec_b_idxs )
98+ d = length - (vec_a .nnz + vec_b .nnz - a ) # type: ignore
9299
93- rand_sim = num_common / length
100+ rand_sim = ( a + d ) / length
94101 return float (rand_sim )
95102
96103
@@ -152,14 +159,14 @@ def rand_binary_distance(
152159 >>> from skfp.distances import rand_binary_distance
153160 >>> import numpy as np
154161 >>> vec_a = np.array([1, 0, 1])
155- >>> vec_b = np.array([1, 0, 1 ])
162+ >>> vec_b = np.array([1, 0, 0 ])
156163 >>> dist = rand_binary_distance(vec_a, vec_b)
157164 >>> dist
158165 0.33333333333333337
159166
160167 >>> from scipy.sparse import csr_array
161168 >>> vec_a = csr_array([[1, 0, 1]])
162- >>> vec_b = csr_array([[1, 0, 1 ]])
169+ >>> vec_b = csr_array([[1, 0, 0 ]])
163170 >>> dist = rand_binary_distance(vec_a, vec_b)
164171 >>> dist
165172 0.33333333333333337
@@ -168,52 +175,40 @@ def rand_binary_distance(
168175
169176
170177@validate_params (
171- {"X" : ["array-like" ], "Y" : ["array-like" , None ]},
178+ {
179+ "X" : ["array-like" , csr_array ],
180+ "Y" : ["array-like" , csr_array , None ],
181+ },
172182 prefer_skip_nested_validation = True ,
173183)
174184def bulk_rand_binary_similarity (
175- X : np .ndarray , Y : np .ndarray | None = None
185+ X : list | np .ndarray | csr_array , Y : list | np .ndarray | csr_array | None = None
176186) -> np .ndarray :
177187 r"""
178188 Bulk Rand similarity for binary matrices.
179189
180- Computes the pairwise Rand [1]_ [2]_ ( known as All-Bit [3]_ or Sokal-Michener)
181- similarity between binary matrices. If one array is passed, similarities are
182- computed between its rows. For two arrays, similarities are between their respective
183- rows, with `i`-th row and `j`-th column in output corresponding to `i`-th row from
184- first array and `j`-th row from second array.
190+ Computes the pairwise Rand (also known as All-Bit or Sokal-Michener) similarity
191+ between binary matrices. If one array is passed, similarities are computed between
192+ its rows. For two arrays, similarities are between their respective rows, with
193+ `i`-th row and `j`-th column in output corresponding to `i`-th row from first array
194+ and `j`-th row from second array.
185195
186196 See also :py:func:`rand_binary_similarity`.
187197
188198 Parameters
189199 ----------
190- X : ndarray
191- First binary input array, of shape :math:`m \times m `
200+ X : ndarray or CSR sparse array
201+ First binary input array, of shape :math:`m \times d `
192202
193- Y : ndarray, default=None
194- Second binary input array, of shape :math:`n \times n `. If not passed, similarities
195- are computed between rows of X.
203+ Y : ndarray or CSR sparse array , default=None
204+ Second binary input array, of shape :math:`n \times d `. If not passed,
205+ similarities are computed between rows of X.
196206
197207 Returns
198208 -------
199209 similarities : ndarray
200- Array with pairwise Rand similarity values. Shape is :math:`m \times n` if two
201- arrays are passed, or :math:`m \times m` otherwise.
202-
203- References
204- ----------
205- .. [1] `Rand, W.M.
206- "Objective criteria for the evaluation of clustering methods."
207- J. Amer. Stat. Assoc. 1971; 66: 846–850.
208- <https://www.tandfonline.com/doi/abs/10.1080/01621459.1971.10482356>`_
209-
210- .. [2] `Deza M.M., Deza E.
211- "Encyclopedia of Distances."
212- Springer, Berlin, Heidelberg, 2009.
213- <https://doi.org/10.1007/978-3-642-00234-2_1>`_
214-
215- .. [3] `RDKit documentation
216- <https://www.rdkit.org/docs/source/rdkit.DataStructs.cDataStructs.html>`_
210+ Array with pairwise Rand similarity values. Shape is :math:`m \times n`
211+ if two arrays are passed, or :math:`m \times m` otherwise.
217212
218213 See Also
219214 --------
@@ -227,40 +222,48 @@ def bulk_rand_binary_similarity(
227222 >>> Y = np.array([[1, 0, 1], [0, 1, 1]])
228223 >>> sim = bulk_rand_binary_similarity(X, Y)
229224 >>> sim
230- array([[0.66666667 , 0.33333333],
231- [0.33333333 , 0.33333333 ]])
225+ array([[1. , 0.33333333],
226+ [0.66666667 , 0.66666667 ]])
232227 """
228+ if not isinstance (X , csr_array ):
229+ X = csr_array (X )
230+
233231 if Y is None :
234232 return _bulk_rand_binary_similarity_single (X )
235233 else :
234+ if not isinstance (Y , csr_array ):
235+ Y = csr_array (Y )
236236 return _bulk_rand_binary_similarity_two (X , Y )
237237
238238
239- @numba .njit (parallel = True )
240- def _bulk_rand_binary_similarity_single (X : np .ndarray ) -> np .ndarray :
241- m , length = X .shape
242- sims = np .empty ((m , m ))
239+ def _bulk_rand_binary_similarity_single (X : csr_array ) -> np .ndarray :
240+ n_features = X .shape [1 ]
243241
244- for i in numba .prange (m ):
245- for j in numba .prange (i , m ):
246- intersection = np .sum (np .logical_and (X [i ], X [j ]))
247- sim = intersection / length
248- sims [i , j ] = sims [j , i ] = sim
242+ a = (X @ X .T ).toarray ()
249243
244+ row_sums = np .asarray (X .sum (axis = 1 )).ravel ()
245+ sum_A = row_sums [:, None ]
246+ sum_B = row_sums [None , :]
247+
248+ d = n_features - (sum_A + sum_B - a )
249+
250+ sims = (a + d ) / n_features
250251 return sims
251252
252253
253- @numba .njit (parallel = True )
254- def _bulk_rand_binary_similarity_two (X : np .ndarray , Y : np .ndarray ) -> np .ndarray :
255- m , length = X .shape
256- n = Y .shape [0 ]
257- sims = np .empty ((m , n ))
254+ def _bulk_rand_binary_similarity_two (X : csr_array , Y : csr_array ) -> np .ndarray :
255+ n_features = X .shape [1 ]
256+
257+ a = (X @ Y .T ).toarray ()
258+
259+ row_sums_X = np .asarray (X .sum (axis = 1 )).ravel ()
260+ row_sums_Y = np .asarray (Y .sum (axis = 1 )).ravel ()
261+ sum_A = row_sums_X [:, None ]
262+ sum_B = row_sums_Y [None , :]
258263
259- for i in numba .prange (m ):
260- for j in numba .prange (n ):
261- intersection = np .sum (np .logical_and (X [i ], Y [j ]))
262- sims [i , j ] = intersection / length
264+ d = n_features - (sum_A + sum_B - a )
263265
266+ sims = (a + d ) / n_features
264267 return sims
265268
266269
@@ -271,51 +274,54 @@ def _bulk_rand_binary_similarity_two(X: np.ndarray, Y: np.ndarray) -> np.ndarray
271274 },
272275 prefer_skip_nested_validation = True ,
273276)
274- def bulk_rand_binary_distance (X : np .ndarray , Y : np .ndarray | None = None ) -> np .ndarray :
277+ def bulk_rand_binary_distance (
278+ X : list | np .ndarray | csr_array , Y : list | np .ndarray | csr_array | None = None
279+ ) -> np .ndarray :
275280 r"""
276281 Bulk Rand distance for binary matrices.
277282
278- Computes the pairwise Rand distance between binary matrices. If one array is
279- passed, distances are computed between its rows. For two arrays, distances
280- are between their respective rows, with `i`-th row and `j`-th column in output
281- corresponding to `i`-th row from first array and `j`-th row from second array.
283+ Computes the pairwise Rand distance between binary matrices. If one array
284+ is passed, distances are computed between its rows. For two arrays,
285+ distances are between their respective rows, with `i`-th row and `j`-th
286+ column in output corresponding to `i`-th row from first array and `j`-th
287+ row from second array.
282288
283289 See also :py:func:`rand_binary_distance`.
284290
285291 Parameters
286292 ----------
287- X : ndarray
288- First binary input array, of shape :math:`m \times m `
293+ X : ndarray or CSR sparse array
294+ First binary input array, of shape :math:`m \times d `
289295
290- Y : ndarray, default=None
291- Second binary input array, of shape :math:`n \times n `. If not passed, distances
292- are computed between rows of X.
296+ Y : ndarray or CSR sparse array , default=None
297+ Second binary input array, of shape :math:`n \times d `. If not passed,
298+ distances are computed between rows of X.
293299
294300 Returns
295301 -------
296302 distances : ndarray
297- Array with pairwise Rand distance values. Shape is :math:`m \times n` if two
298- arrays are passed, or :math:`m \times m` otherwise.
303+ Array with pairwise Rand distance values. Shape is :math:`m \times n` if
304+ two arrays are passed, or :math:`m \times m` otherwise.
299305
300306 See Also
301307 --------
302- :py:func:`rand_binary_distance` : Rand distance function for two vectors
308+ :py:func:`rand_binary_distance` : Rand distance function for two vectors.
303309
304310 Examples
305311 --------
306312 >>> from skfp.distances import bulk_rand_binary_distance
307313 >>> import numpy as np
308314 >>> X = np.array([[1, 0, 1], [1, 0, 1]])
309- >>> Y = np.array([[1, 0, 1 ], [1, 0, 1 ]])
315+ >>> Y = np.array([[1, 0, 0 ], [1, 0, 0 ]])
310316 >>> dist = bulk_rand_binary_distance(X, Y)
311317 >>> dist
312318 array([[0.33333333, 0.33333333],
313319 [0.33333333, 0.33333333]])
314320
315- >>> X = np.array([[1, 0, 1], [1, 0, 1 ]])
321+ >>> X = np.array([[1, 0, 1], [1, 0, 0 ]])
316322 >>> dist = bulk_rand_binary_distance(X)
317323 >>> dist
318- array([[0.33333333 , 0.33333333],
319- [0.33333333, 0.33333333 ]])
324+ array([[0. , 0.33333333],
325+ [0.33333333, 0. ]])
320326 """
321327 return 1 - bulk_rand_binary_similarity (X , Y )
0 commit comments