Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 85 additions & 79 deletions skfp/distances/rand.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import numba
import numpy as np
from scipy.sparse import csr_array
from sklearn.utils._param_validation import validate_params
Expand All @@ -23,9 +22,11 @@ def rand_binary_similarity(

.. math::

sim(a, b) = \frac{|a \cap b|}{n}
sim(x, y) = \frac{a + d}{n}

where `n` is the length of vector `a`.
- :math:`a` - both are 1 (:math:`|x \cap y|`, common "on" bits)
- :math:`d` - both are 0 (:math:`|\neg x \cap \neg y|`, common "off" bits)
- :math:`n` - length of passed vectors

The calculated similarity falls within the range :math:`[0, 1]`.
Passing all-zero vectors to this function results in a similarity of 1, since every position is then a common "off" bit.
Expand Down Expand Up @@ -63,14 +64,14 @@ def rand_binary_similarity(
>>> from skfp.distances import rand_binary_similarity
>>> import numpy as np
>>> vec_a = np.array([1, 0, 1])
>>> vec_b = np.array([1, 0, 1])
>>> vec_b = np.array([1, 0, 0])
>>> sim = rand_binary_similarity(vec_a, vec_b)
>>> sim
0.6666666666666666

>>> from scipy.sparse import csr_array
>>> vec_a = csr_array([[1, 0, 1]])
>>> vec_b = csr_array([[1, 0, 1]])
>>> vec_b = csr_array([[1, 0, 0]])
>>> sim = rand_binary_similarity(vec_a, vec_b)
>>> sim
0.6666666666666666
Expand All @@ -81,16 +82,22 @@ def rand_binary_similarity(
f"got {type(vec_a)} and {type(vec_b)}"
)

if isinstance(vec_a, (np.ndarray, list)):
num_common = np.sum(np.logical_and(vec_a, vec_b))
if isinstance(vec_a, list):
vec_a = np.array(vec_a)
vec_b = np.array(vec_b)

if isinstance(vec_a, np.ndarray):
a = np.sum(np.logical_and(vec_a, vec_b))
d = np.sum(np.logical_and(1 - vec_a, 1 - vec_b)) # type: ignore
length = len(vec_a)
else:
vec_a_idxs = set(vec_a.indices)
vec_b_idxs = set(vec_b.indices)
num_common = len(vec_a_idxs & vec_b_idxs)
length = vec_a.shape[1]
vec_a_idxs = set(vec_a.indices)
vec_b_idxs = set(vec_b.indices) # type: ignore
a = len(vec_a_idxs & vec_b_idxs)
d = length - (vec_a.nnz + vec_b.nnz - a) # type: ignore

rand_sim = num_common / length
rand_sim = (a + d) / length
return float(rand_sim)


Expand Down Expand Up @@ -152,14 +159,14 @@ def rand_binary_distance(
>>> from skfp.distances import rand_binary_distance
>>> import numpy as np
>>> vec_a = np.array([1, 0, 1])
>>> vec_b = np.array([1, 0, 1])
>>> vec_b = np.array([1, 0, 0])
>>> dist = rand_binary_distance(vec_a, vec_b)
>>> dist
0.33333333333333337

>>> from scipy.sparse import csr_array
>>> vec_a = csr_array([[1, 0, 1]])
>>> vec_b = csr_array([[1, 0, 1]])
>>> vec_b = csr_array([[1, 0, 0]])
>>> dist = rand_binary_distance(vec_a, vec_b)
>>> dist
0.33333333333333337
Expand All @@ -168,52 +175,40 @@ def rand_binary_distance(


@validate_params(
{"X": ["array-like"], "Y": ["array-like", None]},
{
"X": ["array-like", csr_array],
"Y": ["array-like", csr_array, None],
},
prefer_skip_nested_validation=True,
)
def bulk_rand_binary_similarity(
X: np.ndarray, Y: np.ndarray | None = None
X: list | np.ndarray | csr_array, Y: list | np.ndarray | csr_array | None = None
) -> np.ndarray:
r"""
Bulk Rand similarity for binary matrices.

Computes the pairwise Rand [1]_ [2]_ (known as All-Bit [3]_ or Sokal-Michener)
similarity between binary matrices. If one array is passed, similarities are
computed between its rows. For two arrays, similarities are between their respective
rows, with `i`-th row and `j`-th column in output corresponding to `i`-th row from
first array and `j`-th row from second array.
Computes the pairwise Rand (also known as All-Bit or Sokal-Michener) similarity
between binary matrices. If one array is passed, similarities are computed between
its rows. For two arrays, similarities are between their respective rows, with
`i`-th row and `j`-th column in output corresponding to `i`-th row from first array
and `j`-th row from second array.

See also :py:func:`rand_binary_similarity`.

Parameters
----------
X : ndarray
First binary input array, of shape :math:`m \times m`
X : ndarray or CSR sparse array
First binary input array, of shape :math:`m \times d`

Y : ndarray, default=None
Second binary input array, of shape :math:`n \times n`. If not passed, similarities
are computed between rows of X.
Y : ndarray or CSR sparse array, default=None
Second binary input array, of shape :math:`n \times d`. If not passed,
similarities are computed between rows of X.

Returns
-------
similarities : ndarray
Array with pairwise Rand similarity values. Shape is :math:`m \times n` if two
arrays are passed, or :math:`m \times m` otherwise.

References
----------
.. [1] `Rand, W.M.
"Objective criteria for the evaluation of clustering methods."
J. Amer. Stat. Assoc. 1971; 66: 846–850.
<https://www.tandfonline.com/doi/abs/10.1080/01621459.1971.10482356>`_

.. [2] `Deza M.M., Deza E.
"Encyclopedia of Distances."
Springer, Berlin, Heidelberg, 2009.
<https://doi.org/10.1007/978-3-642-00234-2_1>`_

.. [3] `RDKit documentation
<https://www.rdkit.org/docs/source/rdkit.DataStructs.cDataStructs.html>`_
Array with pairwise Rand similarity values. Shape is :math:`m \times n`
if two arrays are passed, or :math:`m \times m` otherwise.

See Also
--------
Expand All @@ -227,40 +222,48 @@ def bulk_rand_binary_similarity(
>>> Y = np.array([[1, 0, 1], [0, 1, 1]])
>>> sim = bulk_rand_binary_similarity(X, Y)
>>> sim
array([[0.66666667, 0.33333333],
[0.33333333, 0.33333333]])
array([[1. , 0.33333333],
[0.66666667, 0.66666667]])
"""
if not isinstance(X, csr_array):
X = csr_array(X)

if Y is None:
return _bulk_rand_binary_similarity_single(X)
else:
if not isinstance(Y, csr_array):
Y = csr_array(Y)
return _bulk_rand_binary_similarity_two(X, Y)


@numba.njit(parallel=True)
def _bulk_rand_binary_similarity_single(X: np.ndarray) -> np.ndarray:
m, length = X.shape
sims = np.empty((m, m))
def _bulk_rand_binary_similarity_single(X: csr_array) -> np.ndarray:
n_features = X.shape[1]

for i in numba.prange(m):
for j in numba.prange(i, m):
intersection = np.sum(np.logical_and(X[i], X[j]))
sim = intersection / length
sims[i, j] = sims[j, i] = sim
a = (X @ X.T).toarray()

row_sums = np.asarray(X.sum(axis=1)).ravel()
sum_A = row_sums[:, None]
sum_B = row_sums[None, :]

d = n_features - (sum_A + sum_B - a)

sims = (a + d) / n_features
return sims


@numba.njit(parallel=True)
def _bulk_rand_binary_similarity_two(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
m, length = X.shape
n = Y.shape[0]
sims = np.empty((m, n))
def _bulk_rand_binary_similarity_two(X: csr_array, Y: csr_array) -> np.ndarray:
n_features = X.shape[1]

a = (X @ Y.T).toarray()

row_sums_X = np.asarray(X.sum(axis=1)).ravel()
row_sums_Y = np.asarray(Y.sum(axis=1)).ravel()
sum_A = row_sums_X[:, None]
sum_B = row_sums_Y[None, :]

for i in numba.prange(m):
for j in numba.prange(n):
intersection = np.sum(np.logical_and(X[i], Y[j]))
sims[i, j] = intersection / length
d = n_features - (sum_A + sum_B - a)

sims = (a + d) / n_features
return sims


Expand All @@ -271,51 +274,54 @@ def _bulk_rand_binary_similarity_two(X: np.ndarray, Y: np.ndarray) -> np.ndarray
},
prefer_skip_nested_validation=True,
)
def bulk_rand_binary_distance(X: np.ndarray, Y: np.ndarray | None = None) -> np.ndarray:
def bulk_rand_binary_distance(
X: list | np.ndarray | csr_array, Y: list | np.ndarray | csr_array | None = None
) -> np.ndarray:
r"""
Bulk Rand distance for vectors of binary values.

Computes the pairwise Rand distance between binary matrices. If one array is
passed, distances are computed between its rows. For two arrays, distances
are between their respective rows, with `i`-th row and `j`-th column in output
corresponding to `i`-th row from first array and `j`-th row from second array.
Computes the pairwise Rand distance between binary matrices. If one array
is passed, distances are computed between its rows. For two arrays,
distances are between their respective rows, with `i`-th row and `j`-th
column in output corresponding to `i`-th row from first array and `j`-th
row from second array.

See also :py:func:`rand_binary_distance`.

Parameters
----------
X : ndarray
First binary input array, of shape :math:`m \times m`
X : ndarray or CSR sparse array
First binary input array, of shape :math:`m \times d`

Y : ndarray, default=None
Second binary input array, of shape :math:`n \times n`. If not passed, distances
are computed between rows of X.
Y : ndarray or CSR sparse array, default=None
Second binary input array, of shape :math:`n \times d`. If not passed,
distances are computed between rows of X.

Returns
-------
distances : ndarray
Array with pairwise Rand distance values. Shape is :math:`m \times n` if two
arrays are passed, or :math:`m \times m` otherwise.
Array with pairwise Rand distance values. Shape is :math:`m \times n` if
two arrays are passed, or :math:`m \times m` otherwise.

See Also
--------
:py:func:`rand_binary_distance` : Rand distance function for two vectors
:py:func:`rand_binary_distance` : Rand distance function for two vectors.

Examples
--------
>>> from skfp.distances import bulk_rand_binary_distance
>>> import numpy as np
>>> X = np.array([[1, 0, 1], [1, 0, 1]])
>>> Y = np.array([[1, 0, 1], [1, 0, 1]])
>>> Y = np.array([[1, 0, 0], [1, 0, 0]])
>>> dist = bulk_rand_binary_distance(X, Y)
>>> dist
array([[0.33333333, 0.33333333],
[0.33333333, 0.33333333]])

>>> X = np.array([[1, 0, 1], [1, 0, 1]])
>>> X = np.array([[1, 0, 1], [1, 0, 0]])
>>> dist = bulk_rand_binary_distance(X)
>>> dist
array([[0.33333333, 0.33333333],
[0.33333333, 0.33333333]])
array([[0. , 0.33333333],
[0.33333333, 0. ]])
"""
return 1 - bulk_rand_binary_similarity(X, Y)
Loading
Loading