Skip to content

Commit 3be38fe

Browse files
authored
Fix Rand similarity (#492)
1 parent 3bd9cc6 commit 3be38fe

File tree

2 files changed

+120
-133
lines changed

2 files changed

+120
-133
lines changed

skfp/distances/rand.py

Lines changed: 85 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import numba
21
import numpy as np
32
from scipy.sparse import csr_array
43
from sklearn.utils._param_validation import validate_params
@@ -23,9 +22,11 @@ def rand_binary_similarity(
2322
2423
.. math::
2524
26-
sim(a, b) = \frac{|a \cap b|}{n}
25+
sim(x, y) = \frac{a + d}{n}
2726
28-
where `n` is the length of vector `a`.
27+
- :math:`a` - both are 1 (:math:`|x \cap y|`, common "on" bits)
28+
- :math:`d` - both are 0 (:math:`|\neg x \cap \neg y|`, common "off" bits)
29+
- :math:`n` - length of passed vectors
2930
3031
The calculated similarity falls within the range :math:`[0, 1]`.
3132
Passing all-zero vectors to this function results in a similarity of 1, since every position is a common "off" bit.
@@ -63,14 +64,14 @@ def rand_binary_similarity(
6364
>>> from skfp.distances import rand_binary_similarity
6465
>>> import numpy as np
6566
>>> vec_a = np.array([1, 0, 1])
66-
>>> vec_b = np.array([1, 0, 1])
67+
>>> vec_b = np.array([1, 0, 0])
6768
>>> sim = rand_binary_similarity(vec_a, vec_b)
6869
>>> sim
6970
0.6666666666666666
7071
7172
>>> from scipy.sparse import csr_array
7273
>>> vec_a = csr_array([[1, 0, 1]])
73-
>>> vec_b = csr_array([[1, 0, 1]])
74+
>>> vec_b = csr_array([[1, 0, 0]])
7475
>>> sim = rand_binary_similarity(vec_a, vec_b)
7576
>>> sim
7677
0.6666666666666666
@@ -81,16 +82,22 @@ def rand_binary_similarity(
8182
f"got {type(vec_a)} and {type(vec_b)}"
8283
)
8384

84-
if isinstance(vec_a, (np.ndarray, list)):
85-
num_common = np.sum(np.logical_and(vec_a, vec_b))
85+
if isinstance(vec_a, list):
86+
vec_a = np.array(vec_a)
87+
vec_b = np.array(vec_b)
88+
89+
if isinstance(vec_a, np.ndarray):
90+
a = np.sum(np.logical_and(vec_a, vec_b))
91+
d = np.sum(np.logical_and(1 - vec_a, 1 - vec_b)) # type: ignore
8692
length = len(vec_a)
8793
else:
88-
vec_a_idxs = set(vec_a.indices)
89-
vec_b_idxs = set(vec_b.indices)
90-
num_common = len(vec_a_idxs & vec_b_idxs)
9194
length = vec_a.shape[1]
95+
vec_a_idxs = set(vec_a.indices)
96+
vec_b_idxs = set(vec_b.indices) # type: ignore
97+
a = len(vec_a_idxs & vec_b_idxs)
98+
d = length - (vec_a.nnz + vec_b.nnz - a) # type: ignore
9299

93-
rand_sim = num_common / length
100+
rand_sim = (a + d) / length
94101
return float(rand_sim)
95102

96103

@@ -152,14 +159,14 @@ def rand_binary_distance(
152159
>>> from skfp.distances import rand_binary_distance
153160
>>> import numpy as np
154161
>>> vec_a = np.array([1, 0, 1])
155-
>>> vec_b = np.array([1, 0, 1])
162+
>>> vec_b = np.array([1, 0, 0])
156163
>>> dist = rand_binary_distance(vec_a, vec_b)
157164
>>> dist
158165
0.33333333333333337
159166
160167
>>> from scipy.sparse import csr_array
161168
>>> vec_a = csr_array([[1, 0, 1]])
162-
>>> vec_b = csr_array([[1, 0, 1]])
169+
>>> vec_b = csr_array([[1, 0, 0]])
163170
>>> dist = rand_binary_distance(vec_a, vec_b)
164171
>>> dist
165172
0.33333333333333337
@@ -168,52 +175,40 @@ def rand_binary_distance(
168175

169176

170177
@validate_params(
171-
{"X": ["array-like"], "Y": ["array-like", None]},
178+
{
179+
"X": ["array-like", csr_array],
180+
"Y": ["array-like", csr_array, None],
181+
},
172182
prefer_skip_nested_validation=True,
173183
)
174184
def bulk_rand_binary_similarity(
175-
X: np.ndarray, Y: np.ndarray | None = None
185+
X: list | np.ndarray | csr_array, Y: list | np.ndarray | csr_array | None = None
176186
) -> np.ndarray:
177187
r"""
178188
Bulk Rand similarity for binary matrices.
179189
180-
Computes the pairwise Rand [1]_ [2]_ (known as All-Bit [3]_ or Sokal-Michener)
181-
similarity between binary matrices. If one array is passed, similarities are
182-
computed between its rows. For two arrays, similarities are between their respective
183-
rows, with `i`-th row and `j`-th column in output corresponding to `i`-th row from
184-
first array and `j`-th row from second array.
190+
Computes the pairwise Rand (also known as All-Bit or Sokal-Michener) similarity
191+
between binary matrices. If one array is passed, similarities are computed between
192+
its rows. For two arrays, similarities are between their respective rows, with
193+
`i`-th row and `j`-th column in output corresponding to `i`-th row from first array
194+
and `j`-th row from second array.
185195
186196
See also :py:func:`rand_binary_similarity`.
187197
188198
Parameters
189199
----------
190-
X : ndarray
191-
First binary input array, of shape :math:`m \times m`
200+
X : ndarray or CSR sparse array
201+
First binary input array, of shape :math:`m \times d`
192202
193-
Y : ndarray, default=None
194-
Second binary input array, of shape :math:`n \times n`. If not passed, similarities
195-
are computed between rows of X.
203+
Y : ndarray or CSR sparse array, default=None
204+
Second binary input array, of shape :math:`n \times d`. If not passed,
205+
similarities are computed between rows of X.
196206
197207
Returns
198208
-------
199209
similarities : ndarray
200-
Array with pairwise Rand similarity values. Shape is :math:`m \times n` if two
201-
arrays are passed, or :math:`m \times m` otherwise.
202-
203-
References
204-
----------
205-
.. [1] `Rand, W.M.
206-
"Objective criteria for the evaluation of clustering methods."
207-
J. Amer. Stat. Assoc. 1971; 66: 846–850.
208-
<https://www.tandfonline.com/doi/abs/10.1080/01621459.1971.10482356>`_
209-
210-
.. [2] `Deza M.M., Deza E.
211-
"Encyclopedia of Distances."
212-
Springer, Berlin, Heidelberg, 2009.
213-
<https://doi.org/10.1007/978-3-642-00234-2_1>`_
214-
215-
.. [3] `RDKit documentation
216-
<https://www.rdkit.org/docs/source/rdkit.DataStructs.cDataStructs.html>`_
210+
Array with pairwise Rand similarity values. Shape is :math:`m \times n`
211+
if two arrays are passed, or :math:`m \times m` otherwise.
217212
218213
See Also
219214
--------
@@ -227,40 +222,48 @@ def bulk_rand_binary_similarity(
227222
>>> Y = np.array([[1, 0, 1], [0, 1, 1]])
228223
>>> sim = bulk_rand_binary_similarity(X, Y)
229224
>>> sim
230-
array([[0.66666667, 0.33333333],
231-
[0.33333333, 0.33333333]])
225+
array([[1. , 0.33333333],
226+
[0.66666667, 0.66666667]])
232227
"""
228+
if not isinstance(X, csr_array):
229+
X = csr_array(X)
230+
233231
if Y is None:
234232
return _bulk_rand_binary_similarity_single(X)
235233
else:
234+
if not isinstance(Y, csr_array):
235+
Y = csr_array(Y)
236236
return _bulk_rand_binary_similarity_two(X, Y)
237237

238238

239-
@numba.njit(parallel=True)
240-
def _bulk_rand_binary_similarity_single(X: np.ndarray) -> np.ndarray:
241-
m, length = X.shape
242-
sims = np.empty((m, m))
239+
def _bulk_rand_binary_similarity_single(X: csr_array) -> np.ndarray:
240+
n_features = X.shape[1]
243241

244-
for i in numba.prange(m):
245-
for j in numba.prange(i, m):
246-
intersection = np.sum(np.logical_and(X[i], X[j]))
247-
sim = intersection / length
248-
sims[i, j] = sims[j, i] = sim
242+
a = (X @ X.T).toarray()
249243

244+
row_sums = np.asarray(X.sum(axis=1)).ravel()
245+
sum_A = row_sums[:, None]
246+
sum_B = row_sums[None, :]
247+
248+
d = n_features - (sum_A + sum_B - a)
249+
250+
sims = (a + d) / n_features
250251
return sims
251252

252253

253-
@numba.njit(parallel=True)
254-
def _bulk_rand_binary_similarity_two(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
255-
m, length = X.shape
256-
n = Y.shape[0]
257-
sims = np.empty((m, n))
254+
def _bulk_rand_binary_similarity_two(X: csr_array, Y: csr_array) -> np.ndarray:
255+
n_features = X.shape[1]
256+
257+
a = (X @ Y.T).toarray()
258+
259+
row_sums_X = np.asarray(X.sum(axis=1)).ravel()
260+
row_sums_Y = np.asarray(Y.sum(axis=1)).ravel()
261+
sum_A = row_sums_X[:, None]
262+
sum_B = row_sums_Y[None, :]
258263

259-
for i in numba.prange(m):
260-
for j in numba.prange(n):
261-
intersection = np.sum(np.logical_and(X[i], Y[j]))
262-
sims[i, j] = intersection / length
264+
d = n_features - (sum_A + sum_B - a)
263265

266+
sims = (a + d) / n_features
264267
return sims
265268

266269

@@ -271,51 +274,54 @@ def _bulk_rand_binary_similarity_two(X: np.ndarray, Y: np.ndarray) -> np.ndarray
271274
},
272275
prefer_skip_nested_validation=True,
273276
)
274-
def bulk_rand_binary_distance(X: np.ndarray, Y: np.ndarray | None = None) -> np.ndarray:
277+
def bulk_rand_binary_distance(
278+
X: list | np.ndarray | csr_array, Y: list | np.ndarray | csr_array | None = None
279+
) -> np.ndarray:
275280
r"""
276281
Bulk Rand distance for binary matrices.
277282
278-
Computes the pairwise Rand distance between binary matrices. If one array is
279-
passed, distances are computed between its rows. For two arrays, distances
280-
are between their respective rows, with `i`-th row and `j`-th column in output
281-
corresponding to `i`-th row from first array and `j`-th row from second array.
283+
Computes the pairwise Rand distance between binary matrices. If one array
284+
is passed, distances are computed between its rows. For two arrays,
285+
distances are between their respective rows, with `i`-th row and `j`-th
286+
column in output corresponding to `i`-th row from first array and `j`-th
287+
row from second array.
282288
283289
See also :py:func:`rand_binary_distance`.
284290
285291
Parameters
286292
----------
287-
X : ndarray
288-
First binary input array, of shape :math:`m \times m`
293+
X : ndarray or CSR sparse array
294+
First binary input array, of shape :math:`m \times d`
289295
290-
Y : ndarray, default=None
291-
Second binary input array, of shape :math:`n \times n`. If not passed, distances
292-
are computed between rows of X.
296+
Y : ndarray or CSR sparse array, default=None
297+
Second binary input array, of shape :math:`n \times d`. If not passed,
298+
distances are computed between rows of X.
293299
294300
Returns
295301
-------
296302
distances : ndarray
297-
Array with pairwise Rand distance values. Shape is :math:`m \times n` if two
298-
arrays are passed, or :math:`m \times m` otherwise.
303+
Array with pairwise Rand distance values. Shape is :math:`m \times n` if
304+
two arrays are passed, or :math:`m \times m` otherwise.
299305
300306
See Also
301307
--------
302-
:py:func:`rand_binary_distance` : Rand distance function for two vectors
308+
:py:func:`rand_binary_distance` : Rand distance function for two vectors.
303309
304310
Examples
305311
--------
306312
>>> from skfp.distances import bulk_rand_binary_distance
307313
>>> import numpy as np
308314
>>> X = np.array([[1, 0, 1], [1, 0, 1]])
309-
>>> Y = np.array([[1, 0, 1], [1, 0, 1]])
315+
>>> Y = np.array([[1, 0, 0], [1, 0, 0]])
310316
>>> dist = bulk_rand_binary_distance(X, Y)
311317
>>> dist
312318
array([[0.33333333, 0.33333333],
313319
[0.33333333, 0.33333333]])
314320
315-
>>> X = np.array([[1, 0, 1], [1, 0, 1]])
321+
>>> X = np.array([[1, 0, 1], [1, 0, 0]])
316322
>>> dist = bulk_rand_binary_distance(X)
317323
>>> dist
318-
array([[0.33333333, 0.33333333],
319-
[0.33333333, 0.33333333]])
324+
array([[0. , 0.33333333],
325+
[0.33333333, 0. ]])
320326
"""
321327
return 1 - bulk_rand_binary_similarity(X, Y)

0 commit comments

Comments
 (0)