diff --git a/skfp/distances/braun_blanquet.py b/skfp/distances/braun_blanquet.py index f522995a..fccd759d 100644 --- a/skfp/distances/braun_blanquet.py +++ b/skfp/distances/braun_blanquet.py @@ -1,4 +1,3 @@ -import numba import numpy as np from scipy.sparse import csr_array from sklearn.utils._param_validation import validate_params @@ -86,7 +85,7 @@ def braun_blanquet_binary_similarity( max_vec = max(np.sum(vec_a), np.sum(vec_b)) - sim = float(num_common / max_vec) if max_vec != 0 else 1.0 + sim = float(num_common / max_vec) if max_vec != 0 else 1 return sim @@ -163,11 +162,14 @@ def braun_blanquet_binary_distance( @validate_params( - {"X": ["array-like"], "Y": ["array-like", None]}, + { + "X": ["array-like", csr_array], + "Y": ["array-like", csr_array, None], + }, prefer_skip_nested_validation=True, ) def bulk_braun_blanquet_binary_similarity( - X: np.ndarray, Y: np.ndarray | None = None + X: list | np.ndarray | csr_array, Y: list | np.ndarray | csr_array | None = None ) -> np.ndarray: r""" Bulk Braun-Blanquet similarity for binary matrices. @@ -181,12 +183,12 @@ def bulk_braun_blanquet_binary_similarity( Parameters ---------- - X : ndarray - First binary input array, of shape :math:`m \times m` + X : ndarray or CSR sparse array + First binary input array or sparse matrix, of shape :math:`m \times m` - Y : ndarray, default=None - Second binary input array, of shape :math:`n \times n`. If not passed, similarities - are computed between rows of X. + Y : ndarray or CSR sparse array, default=None + Second binary input array or sparse matrix, of shape :math:`n \times n`. If not passed, + similarities are computed between rows of X. Returns ------- @@ -209,45 +211,39 @@ def bulk_braun_blanquet_binary_similarity( array([[1. 
, 0.5], [0.5, 0.5]]) """ + if not isinstance(X, csr_array): + X = csr_array(X) + if Y is None: return _bulk_braun_blanquet_binary_similarity_single(X) else: + if not isinstance(Y, csr_array): + Y = csr_array(Y) return _bulk_braun_blanquet_binary_similarity_two(X, Y) -@numba.njit(parallel=True) -def _bulk_braun_blanquet_binary_similarity_single(X: np.ndarray) -> np.ndarray: - m = X.shape[0] - sims = np.empty((m, m)) - row_sums = np.sum(X, axis=1) +def _bulk_braun_blanquet_binary_similarity_single(X: csr_array) -> np.ndarray: + intersection = (X @ X.T).toarray() + row_sums = np.asarray(X.sum(axis=1)).ravel() + max_denoms = np.maximum.outer(row_sums, row_sums) - for i in numba.prange(m): - sims[i, i] = 1.0 - for j in numba.prange(i + 1, m): - num_common = np.sum(np.logical_and(X[i], X[j])) - max_vec = max(row_sums[i], row_sums[j]) - sim = num_common / max_vec if max_vec != 0 else 0.0 - sims[i, j] = sims[j, i] = sim + sims = np.empty_like(intersection, dtype=float) + np.divide(intersection, max_denoms, out=sims, where=max_denoms != 0) + np.fill_diagonal(sims, 1) return sims -@numba.njit(parallel=True) def _bulk_braun_blanquet_binary_similarity_two( - X: np.ndarray, Y: np.ndarray + X: csr_array, Y: csr_array ) -> np.ndarray: - m = X.shape[0] - n = Y.shape[0] - sims = np.empty((m, n)) - - row_sums_X = np.sum(X, axis=1) - row_sums_Y = np.sum(Y, axis=1) + intersection = (X @ Y.T).toarray() + row_sums_X = np.asarray(X.sum(axis=1)).ravel() + row_sums_Y = np.asarray(Y.sum(axis=1)).ravel() + max_denoms = np.maximum.outer(row_sums_X, row_sums_Y) - for i in numba.prange(m): - for j in numba.prange(n): - num_common = np.sum(np.logical_and(X[i], Y[j])) - max_vec = max(row_sums_X[i], row_sums_Y[j]) - sims[i, j] = num_common / max_vec if max_vec != 0 else 0.0 + sims = np.empty_like(intersection, dtype=float) + np.divide(intersection, max_denoms, out=sims, where=max_denoms != 0) return sims @@ -260,7 +256,7 @@ def _bulk_braun_blanquet_binary_similarity_two( 
prefer_skip_nested_validation=True, ) def bulk_braun_blanquet_binary_distance( - X: np.ndarray, Y: np.ndarray | None = None + X: list | np.ndarray | csr_array, Y: list | np.ndarray | csr_array | None = None ) -> np.ndarray: r""" Bulk Braun-Blanquet distance for vectors of binary values. @@ -274,10 +270,10 @@ def bulk_braun_blanquet_binary_distance( Parameters ---------- - X : ndarray + X : ndarray or CSR sparse array First binary input array, of shape :math:`m \times m` - Y : ndarray, default=None + Y : ndarray or CSR sparse array, default=None Second binary input array, of shape :math:`n \times n`. If not passed, distances are computed between rows of X. diff --git a/skfp/distances/harris_lahey.py b/skfp/distances/harris_lahey.py index 9bbda475..56966a44 100644 --- a/skfp/distances/harris_lahey.py +++ b/skfp/distances/harris_lahey.py @@ -1,4 +1,3 @@ -import numba import numpy as np from scipy.sparse import csr_array from sklearn.utils._param_validation import validate_params @@ -127,7 +126,7 @@ def harris_lahey_binary_similarity( # all-ones or all-zeros vectors if first_denom == 0 or second_denom == 0: - return 1.0 + return 1 sim = float( (a * (2 * d + b + c)) / (2 * first_denom) @@ -218,12 +217,15 @@ def harris_lahey_binary_distance( @validate_params( - {"X": ["array-like"], "Y": ["array-like", None]}, + { + "X": ["array-like", csr_array], + "Y": ["array-like", csr_array, None], + }, prefer_skip_nested_validation=True, ) def bulk_harris_lahey_binary_similarity( - X: np.ndarray, - Y: np.ndarray | None = None, + X: list | np.ndarray | csr_array, + Y: list | np.ndarray | csr_array | None = None, normalized: bool = False, ) -> np.ndarray: r""" @@ -238,15 +240,15 @@ def bulk_harris_lahey_binary_similarity( Parameters ---------- - X : ndarray - First binary input array, of shape :math:`m \times m` + X : ndarray or CSR sparse array + First binary input array, of shape :math:`m \times d` - Y : ndarray, default=None - Second binary input array, of shape :math:`n \times 
n`. If not passed, similarities + Y : ndarray or CSR sparse array, default=None + Second binary input array, of shape :math:`n \times d`. If not passed, similarities are computed between rows of X. normalized : bool, default=False - Whether to divide the resulting similarity by length of vectors, (their number + Whether to divide the resulting similarity by length of vectors (their number of elements), to normalize values to range ``[0, 1]``. Returns @@ -263,98 +265,89 @@ def bulk_harris_lahey_binary_similarity( -------- >>> from skfp.distances import bulk_harris_lahey_binary_similarity >>> import numpy as np - >>> X = np.array([[1, 0, 1], [0, 0, 1]]) - >>> Y = np.array([[1, 0, 1], [0, 1, 1]]) + >>> from scipy.sparse import csr_array + >>> X = csr_array([[1, 0, 1], [0, 0, 1]]) + >>> Y = csr_array([[1, 0, 1], [0, 1, 1]]) >>> sim = bulk_harris_lahey_binary_similarity(X, Y) >>> sim array([[3. , 0.33333333], [1.5 , 1.5 ]]) """ + if not isinstance(X, csr_array): + X = csr_array(X) + if Y is None: - return _bulk_harris_lahey_binary_similarity_single(X, normalized) + return _bulk_harris_lahey_binary_similarity_single_sparse(X, normalized) else: - return _bulk_harris_lahey_binary_similarity_two(X, Y, normalized) + if not isinstance(Y, csr_array): + Y = csr_array(Y) + return _bulk_harris_lahey_binary_similarity_two_sparse(X, Y, normalized) -@numba.njit(parallel=True) -def _bulk_harris_lahey_binary_similarity_single( - X: np.ndarray, normalized: bool +def _bulk_harris_lahey_binary_similarity_single_sparse( + X: csr_array, normalized: bool ) -> np.ndarray: m, length = X.shape - sims = np.empty((m, m)) + row_sums = np.asarray(X.sum(axis=1)).ravel() + a = (X @ X.T).toarray() + bc_sum = np.add.outer(row_sums, row_sums) - 2 * a + d = length - (a + bc_sum) - for i in numba.prange(m): - vec_a = X[i] - vec_a_neg = 1 - vec_a + first_denom = a + bc_sum + second_denom = bc_sum + d - for j in numba.prange(i, m): - vec_b = X[j] - vec_b_neg = 1 - vec_b + sims = np.empty_like(a, 
dtype=float) - a = np.sum(np.logical_and(vec_a, vec_b)) - b = np.sum(np.logical_and(vec_a, vec_b_neg)) - c = np.sum(np.logical_and(vec_a_neg, vec_b)) - d = np.sum(np.logical_and(vec_a_neg, vec_b_neg)) + with np.errstate(divide="ignore", invalid="ignore"): + first_num = a * (2 * d + bc_sum) + first_denom = 2 * np.where(first_denom != 0, first_denom, 1) + part_1 = first_num / first_denom - bc_sum = b + c + second_num = d * (2 * a + bc_sum) + second_denom = 2 * np.where(second_denom != 0, second_denom, 1) + part_2 = second_num / second_denom - first_denom = a + bc_sum - second_denom = bc_sum + d + sims = part_1 + part_2 - # all-ones or all-zeros vectors - if first_denom == 0 or second_denom == 0: - sim = 1.0 - else: - sim = float( - (a * (2 * d + bc_sum)) / (2 * first_denom) - + (d * (2 * a + bc_sum)) / (2 * second_denom) - ) - if normalized: - sim /= length + # handle all-zeros / all-ones vectors + sims[(first_denom == 0) | (second_denom == 0)] = 1 - sims[i, j] = sims[j, i] = sim + if normalized: + sims /= length return sims -@numba.njit(parallel=True) -def _bulk_harris_lahey_binary_similarity_two( - X: np.ndarray, Y: np.ndarray, normalized: bool +def _bulk_harris_lahey_binary_similarity_two_sparse( + X: csr_array, Y: csr_array, normalized: bool ) -> np.ndarray: m, length = X.shape - n = Y.shape[0] - sims = np.empty((m, n)) + row_sums_X = np.asarray(X.sum(axis=1)).ravel() + row_sums_Y = np.asarray(Y.sum(axis=1)).ravel() + a = (X @ Y.T).toarray() + bc_sum = np.add.outer(row_sums_X, row_sums_Y) - 2 * a + d = length - (a + bc_sum) - for i in numba.prange(m): - vec_a = X[i] - vec_a_neg = 1 - vec_a + first_denom = a + bc_sum + second_denom = bc_sum + d - for j in numba.prange(n): - vec_b = Y[j] - vec_b_neg = 1 - vec_b + sims = np.empty_like(a, dtype=float) - a = np.sum(np.logical_and(vec_a, vec_b)) - b = np.sum(np.logical_and(vec_a, vec_b_neg)) - c = np.sum(np.logical_and(vec_a_neg, vec_b)) - d = np.sum(np.logical_and(vec_a_neg, vec_b_neg)) + with 
np.errstate(divide="ignore", invalid="ignore"): + first_num = a * (2 * d + bc_sum) + first_denom = 2 * np.where(first_denom != 0, first_denom, 1) + part_1 = first_num / first_denom - bc_sum = b + c + second_num = d * (2 * a + bc_sum) + second_denom = 2 * np.where(second_denom != 0, second_denom, 1) + part_2 = second_num / second_denom - first_denom = a + bc_sum - second_denom = bc_sum + d + sims = part_1 + part_2 - # all-ones or all-zeros vectors - if first_denom == 0 or second_denom == 0: - sim = 1.0 - else: - sim = float( - (a * (2 * d + bc_sum)) / (2 * first_denom) - + (d * (2 * a + bc_sum)) / (2 * second_denom) - ) - if normalized: - sim /= length + sims[(first_denom == 0) | (second_denom == 0)] = 1 - sims[i, j] = sim + if normalized: + sims /= length return sims @@ -367,7 +360,7 @@ def _bulk_harris_lahey_binary_similarity_two( prefer_skip_nested_validation=True, ) def bulk_harris_lahey_binary_distance( - X: np.ndarray, Y: np.ndarray | None = None + X: list | np.ndarray | csr_array, Y: list | np.ndarray | csr_array | None = None ) -> np.ndarray: r""" Bulk Harris-Lahey distance for vectors of binary values. @@ -381,11 +374,11 @@ def bulk_harris_lahey_binary_distance( Parameters ---------- - X : ndarray - First binary input array, of shape :math:`m \times m` + X : ndarray or CSR sparse array + First binary input array, of shape :math:`m \times d` - Y : ndarray, default=None - Second binary input array, of shape :math:`n \times n`. If not passed, distances + Y : ndarray or CSR sparse array, default=None + Second binary input array, of shape :math:`n \times d`. If not passed, distances are computed between rows of X. 
Returns @@ -397,22 +390,5 @@ def bulk_harris_lahey_binary_distance( See Also -------- :py:func:`harris_lahey_binary_distance` : Harris-Lahey distance function for two vectors - - Examples - -------- - >>> from skfp.distances import bulk_harris_lahey_binary_distance - >>> import numpy as np - >>> X = np.array([[1, 0, 1], [1, 0, 1]]) - >>> Y = np.array([[1, 0, 1], [1, 0, 1]]) - >>> dist = bulk_harris_lahey_binary_distance(X, Y) - >>> dist - array([[0., 0.], - [0., 0.]]) - - >>> X = np.array([[1, 0, 1], [1, 0, 1]]) - >>> dist = bulk_harris_lahey_binary_distance(X) - >>> dist - array([[0., 0.], - [0., 0.]]) """ return 1 - bulk_harris_lahey_binary_similarity(X, Y, normalized=True) diff --git a/skfp/distances/kulczynski.py b/skfp/distances/kulczynski.py index 2550a65f..d486196b 100644 --- a/skfp/distances/kulczynski.py +++ b/skfp/distances/kulczynski.py @@ -1,4 +1,3 @@ -import numba import numpy as np from scipy.sparse import csr_array from sklearn.utils._param_validation import validate_params @@ -91,7 +90,7 @@ def kulczynski_binary_similarity( ) if np.sum(vec_a) == 0 == np.sum(vec_b): - return 1.0 + return 1 if isinstance(vec_a, list): vec_a = np.array(vec_a) @@ -110,7 +109,7 @@ def kulczynski_binary_similarity( c = len(vec_b_idxs - vec_a_idxs) if a + b == 0 or a + c == 0: - return 0.0 + return 0 sim = (a / (a + b) + a / (a + c)) / 2 return float(sim) @@ -190,12 +189,14 @@ def kulczynski_binary_distance( @validate_params( - {"X": ["array-like"], "Y": ["array-like", None]}, + { + "X": ["array-like", csr_array], + "Y": ["array-like", csr_array, None], + }, prefer_skip_nested_validation=True, ) def bulk_kulczynski_binary_similarity( - X: np.ndarray, - Y: np.ndarray | None = None, + X: list | np.ndarray | csr_array, Y: list | np.ndarray | csr_array | None = None ) -> np.ndarray: r""" Bulk Kulczynski similarity for binary matrices. 
@@ -209,11 +210,11 @@ def bulk_kulczynski_binary_similarity( Parameters ---------- - X : ndarray - First binary input array, of shape :math:`m \times m` + X : ndarray or CSR sparse array + First binary input array, of shape :math:`m \times d` - Y : ndarray, default=None - Second binary input array, of shape :math:`n \times n`. If not passed, similarities + Y : ndarray or CSR sparse array, default=None + Second binary input array, of shape :math:`n \times d`. If not passed, similarities are computed between rows of X. Returns @@ -237,87 +238,61 @@ def bulk_kulczynski_binary_similarity( array([[1. , 0.5 ], [0.75, 0.75]]) """ + if not isinstance(X, csr_array): + X = csr_array(X) + if Y is None: return _bulk_kulczynski_binary_similarity_single(X) else: + if not isinstance(Y, csr_array): + Y = csr_array(Y) return _bulk_kulczynski_binary_similarity_two(X, Y) -@numba.njit(parallel=True) -def _bulk_kulczynski_binary_similarity_single( - X: np.ndarray, -) -> np.ndarray: - m = X.shape[0] - sims = np.empty((m, m)) - X_sum = np.sum(X, axis=1) - - # upper triangle - actual similarities - for i in numba.prange(m): - vec_a = X[i] - sum_a = X_sum[i] - vec_a_neg = 1 - vec_a - sims[i, i] = 1.0 - - for j in numba.prange(i + 1, m): - vec_b = X[j] - sum_b = X_sum[j] +def _bulk_kulczynski_binary_similarity_single(X: csr_array) -> np.ndarray: + # formula: 0.5 * (a/a+b + a/a+c) # noqa: ERA001 + # note that: + # a = |A & B|, b = |A - B|, a+b = |A| + # c = |B - A|, a+c = |B| + # we can rewrite formula row-wise as: 0.5 * (a / |A| + a / |B|) - if sum_a == 0 == sum_b: - sims[i, j] = sims[j, i] = 1.0 - continue + intersection = (X @ X.T).toarray() # a = |A and B| + row_sums = np.asarray(X.sum(axis=1)).ravel() - vec_b_neg = 1 - vec_b + denom_A = row_sums[:, None] # |A| + denom_B = row_sums[None, :] # |B| - a = np.sum(np.logical_and(vec_a, vec_b)) - b = np.sum(np.logical_and(vec_a, vec_b_neg)) - c = np.sum(np.logical_and(vec_a_neg, vec_b)) + term_A = np.zeros_like(intersection, dtype=float) + 
term_B = np.zeros_like(intersection, dtype=float) - if a + b == 0 or a + c == 0: - sims[i, j] = sims[j, i] = 0.0 - continue + np.divide(intersection, denom_A, out=term_A, where=denom_A != 0) + np.divide(intersection, denom_B, out=term_B, where=denom_B != 0) + sims = 0.5 * (term_A + term_B) - sim = (a / (a + b) + a / (a + c)) / 2.0 - sims[i, j] = sims[j, i] = sim + both_zero = (denom_A == 0) & (denom_B == 0) + sims[both_zero] = 1 + np.fill_diagonal(sims, 1) return sims -@numba.njit(parallel=True) -def _bulk_kulczynski_binary_similarity_two( - X: np.ndarray, - Y: np.ndarray, -) -> np.ndarray: - m = X.shape[0] - n = Y.shape[0] - sims = np.empty((m, n)) - X_sum = np.sum(X, axis=1) - Y_sum = np.sum(Y, axis=1) - - for i in numba.prange(m): - vec_a = X[i] - sum_a = X_sum[i] - vec_a_neg = 1 - vec_a - - for j in numba.prange(n): - vec_b = Y[j] - sum_b = Y_sum[j] - - if sum_a == 0 == sum_b: - sims[i, j] = 1.0 - continue +def _bulk_kulczynski_binary_similarity_two(X: csr_array, Y: csr_array) -> np.ndarray: + intersection = (X @ Y.T).toarray() + row_sums_X = np.asarray(X.sum(axis=1)).ravel() + row_sums_Y = np.asarray(Y.sum(axis=1)).ravel() - # no need to compute vec_b_neg if sum_a == 0 == sum_b - vec_b_neg = 1 - vec_b + denom_A = row_sums_X[:, None] + denom_B = row_sums_Y[None, :] - a = np.sum(np.logical_and(vec_a, vec_b)) - b = np.sum(np.logical_and(vec_a, vec_b_neg)) - c = np.sum(np.logical_and(vec_a_neg, vec_b)) + term_A = np.zeros_like(intersection, dtype=float) + term_B = np.zeros_like(intersection, dtype=float) - if a + b == 0 or a + c == 0: - sims[i, j] = 0.0 - continue + np.divide(intersection, denom_A, out=term_A, where=denom_A != 0) + np.divide(intersection, denom_B, out=term_B, where=denom_B != 0) + sims = 0.5 * (term_A + term_B) - sims[i, j] = (a / (a + b) + a / (a + c)) / 2.0 + both_zero = (denom_A == 0) & (denom_B == 0) + sims[both_zero] = 1 return sims @@ -330,7 +305,7 @@ def _bulk_kulczynski_binary_similarity_two( prefer_skip_nested_validation=True, ) def 
bulk_kulczynski_binary_distance( - X: np.ndarray, Y: np.ndarray | None = None + X: list | np.ndarray | csr_array, Y: list | np.ndarray | csr_array | None = None ) -> np.ndarray: r""" Bulk Kulczynski distance for vectors of binary values. @@ -344,11 +319,11 @@ def bulk_kulczynski_binary_distance( Parameters ---------- - X : ndarray - First binary input array, of shape :math:`m \times m` + X : ndarray or CSR sparse array + First binary input array, of shape :math:`m \times d` - Y : ndarray, default=None - Second binary input array, of shape :math:`n \times n`. If not passed, distances + Y : ndarray or CSR sparse array, default=None + Second binary input array, of shape :math:`n \times d`. If not passed, distances are computed between rows of X. Returns diff --git a/skfp/distances/mcconnaughey.py b/skfp/distances/mcconnaughey.py index 0d160094..2c9333f5 100644 --- a/skfp/distances/mcconnaughey.py +++ b/skfp/distances/mcconnaughey.py @@ -1,4 +1,3 @@ -import numba import numpy as np from scipy.sparse import csr_array from sklearn.utils._param_validation import validate_params @@ -191,12 +190,15 @@ def mcconnaughey_binary_distance( @validate_params( - {"X": ["array-like"], "Y": ["array-like", None]}, + { + "X": ["array-like", csr_array], + "Y": ["array-like", csr_array, None], + }, prefer_skip_nested_validation=True, ) def bulk_mcconnaughey_binary_similarity( - X: np.ndarray, - Y: np.ndarray | None = None, + X: list | np.ndarray | csr_array, + Y: list | np.ndarray | csr_array | None = None, normalized: bool = False, ) -> np.ndarray: r""" @@ -211,11 +213,11 @@ def bulk_mcconnaughey_binary_similarity( Parameters ---------- - X : ndarray - First binary input array, of shape :math:`m \times m` + X : ndarray or CSR sparse array + First binary input array, of shape :math:`m \times d` - Y : ndarray, default=None - Second binary input array, of shape :math:`n \times n`. 
If not passed, similarities + Y : ndarray or CSR sparse array, default=None + Second binary input array, of shape :math:`n \times d`. If not passed, similarities are computed between rows of X. normalized : bool, default=False @@ -243,82 +245,68 @@ def bulk_mcconnaughey_binary_similarity( array([[0.66666667, 0.66666667], [0.5 , 0.5 ]]) """ + if not isinstance(X, csr_array): + X = csr_array(X) + if Y is None: return _bulk_mcconnaughey_binary_similarity_single(X, normalized) else: + if not isinstance(Y, csr_array): + Y = csr_array(Y) return _bulk_mcconnaughey_binary_similarity_two(X, Y, normalized) -@numba.njit(parallel=True) def _bulk_mcconnaughey_binary_similarity_single( - X: np.ndarray, - normalized: bool, + X: csr_array, normalized: bool ) -> np.ndarray: - m = X.shape[0] - sims = np.empty((m, m)) - X_sum = np.sum(X, axis=1) - - # upper triangle - actual similarities - for i in numba.prange(m): - vec_a = X[i] - sum_a = X_sum[i] - - for j in numba.prange(i, m): - vec_b = X[j] - sum_b = X_sum[j] - - num_common = np.sum(np.logical_and(vec_a, vec_b)) - sum_ab = sum_a + sum_b - dot_ab = sum_a * sum_b - - if sum_ab == 0: - sim = 1.0 - elif dot_ab == 0: - sim = -1.0 - else: - sim = (num_common * sum_ab - dot_ab) / dot_ab - if normalized: - sim = (sim + 1) / 2 - - sims[i, j] = sims[j, i] = sim + intersection = (X @ X.T).toarray() + row_sums = np.asarray(X.sum(axis=1)).ravel() + + denom_A = row_sums[:, None] + denom_B = row_sums[None, :] + + term_A = np.zeros_like(intersection, dtype=float) + term_B = np.zeros_like(intersection, dtype=float) + + np.divide(intersection, denom_A, out=term_A, where=denom_A != 0) + np.divide(intersection, denom_B, out=term_B, where=denom_B != 0) + + sims = term_A + term_B - 1 + + both_zero = (denom_A == 0) & (denom_B == 0) + sims[both_zero] = 1 + if normalized: + sims = (sims + 1) / 2 + + np.fill_diagonal(sims, 1) return sims -@numba.njit(parallel=True) def _bulk_mcconnaughey_binary_similarity_two( - X: np.ndarray, - Y: np.ndarray, - 
normalized: bool, + X: csr_array, Y: csr_array, normalized: bool ) -> np.ndarray: - m = X.shape[0] - n = Y.shape[0] - sims = np.empty((m, n)) - X_sum = np.sum(X, axis=1) - Y_sum = np.sum(Y, axis=1) - - for i in numba.prange(m): - vec_a = X[i] - sum_a = X_sum[i] - - for j in numba.prange(n): - vec_b = Y[j] - sum_b = Y_sum[j] - - num_common = np.sum(np.logical_and(vec_a, vec_b)) - sum_ab = sum_a + sum_b - dot_ab = sum_a * sum_b - - if sum_ab == 0: - sim = 1.0 - elif dot_ab == 0: - sim = -1.0 - else: - sim = (num_common * sum_ab - dot_ab) / dot_ab - if normalized: - sim = (sim + 1) / 2 - - sims[i, j] = sim + intersection = (X @ Y.T).toarray() + + row_sums_X = np.asarray(X.sum(axis=1)).ravel() + row_sums_Y = np.asarray(Y.sum(axis=1)).ravel() + + denom_A = row_sums_X[:, None] + denom_B = row_sums_Y[None, :] + + term_A = np.zeros_like(intersection, dtype=float) + term_B = np.zeros_like(intersection, dtype=float) + + np.divide(intersection, denom_A, out=term_A, where=denom_A != 0) + np.divide(intersection, denom_B, out=term_B, where=denom_B != 0) + + sims = term_A + term_B - 1 + + both_zero = (denom_A == 0) & (denom_B == 0) + sims[both_zero] = 1 + + if normalized: + sims = (sims + 1) / 2 return sims @@ -331,7 +319,7 @@ def _bulk_mcconnaughey_binary_similarity_two( prefer_skip_nested_validation=True, ) def bulk_mcconnaughey_binary_distance( - X: np.ndarray, Y: np.ndarray | None = None + X: list | np.ndarray | csr_array, Y: list | np.ndarray | csr_array | None = None ) -> np.ndarray: r""" Bulk McConnaughey distance for vectors of binary values. @@ -345,11 +333,11 @@ def bulk_mcconnaughey_binary_distance( Parameters ---------- - X : ndarray - First binary input array, of shape :math:`m \times m` + X : ndarray or CSR sparse array + First binary input array, of shape :math:`m \times d` - Y : ndarray, default=None - Second binary input array, of shape :math:`n \times n`. 
If not passed, distances + Y : ndarray or CSR sparse array, default=None + Second binary input array, of shape :math:`n \times d`. If not passed, distances are computed between rows of X. Returns @@ -360,7 +348,7 @@ def bulk_mcconnaughey_binary_distance( See Also -------- - :py:func:`mcconnaughey_binary_distance` : McConnaughey distance function for two vectors + :py:func:`mcconnaughey_binary_distance` : McConnaughey distance function for two vectors. Examples -------- diff --git a/skfp/distances/rogot_goldberg.py b/skfp/distances/rogot_goldberg.py index 63565358..ad5f7681 100644 --- a/skfp/distances/rogot_goldberg.py +++ b/skfp/distances/rogot_goldberg.py @@ -1,4 +1,3 @@ -import numba import numpy as np from scipy.sparse import csr_array from sklearn.utils._param_validation import validate_params @@ -112,7 +111,7 @@ def rogot_goldberg_binary_similarity( # all-ones or all-zeros vectors if first_denom == 0 or second_denom == 0: - return 1.0 + return 1 sim = a / first_denom + d / second_denom @@ -193,12 +192,15 @@ def rogot_goldberg_binary_distance( @validate_params( - {"X": ["array-like"], "Y": ["array-like", None]}, + { + "X": ["array-like", csr_array], + "Y": ["array-like", csr_array, None], + }, prefer_skip_nested_validation=True, ) def bulk_rogot_goldberg_binary_similarity( - X: np.ndarray, - Y: np.ndarray | None = None, + X: list | np.ndarray | csr_array, + Y: list | np.ndarray | csr_array | None = None, ) -> np.ndarray: r""" Bulk Rogot-Goldberg similarity for binary matrices. @@ -212,11 +214,11 @@ def bulk_rogot_goldberg_binary_similarity( Parameters ---------- - X : ndarray - First binary input array, of shape :math:`m \times m` + X : ndarray or CSR sparse array + First binary input array, of shape :math:`m \times d` - Y : ndarray, default=None - Second binary input array, of shape :math:`n \times n`. If not passed, similarities + Y : ndarray or CSR sparse array, default=None + Second binary input array, of shape :math:`n \times d`. 
If not passed, similarities are computed between rows of X. Returns @@ -240,78 +242,76 @@ def bulk_rogot_goldberg_binary_similarity( array([[0.4 , 0.4 ], [0.66666667, 0.66666667]]) """ + if not isinstance(X, csr_array): + X = csr_array(X) + if Y is None: return _bulk_rogot_goldberg_binary_similarity_single(X) else: + if not isinstance(Y, csr_array): + Y = csr_array(Y) return _bulk_rogot_goldberg_binary_similarity_two(X, Y) -@numba.njit(parallel=True) -def _bulk_rogot_goldberg_binary_similarity_single( - X: np.ndarray, -) -> np.ndarray: - m = X.shape[0] - sims = np.empty((m, m)) +def _bulk_rogot_goldberg_binary_similarity_single(X: csr_array) -> np.ndarray: + n_features = X.shape[1] - # upper triangle - actual similarities - for i in numba.prange(m): - vec_a = X[i] - vec_a_neg = 1 - vec_a - sims[i, i] = 1.0 + # a - intersection + a = (X @ X.T).toarray() + row_sums = np.asarray(X.sum(axis=1)).ravel() + + # b+c = |A| + |B| - 2a + sum_A = row_sums[:, None] + sum_B = row_sums[None, :] + bc_sum = sum_A + sum_B - 2 * a - for j in numba.prange(i + 1, m): - vec_b = X[j] - vec_b_neg = 1 - vec_b + # d = n - (a + b + c) # noqa: ERA001 + d = n_features - (a + bc_sum) - a = np.sum(np.logical_and(vec_a, vec_b)) - b = np.sum(np.logical_and(vec_a, vec_b_neg)) - c = np.sum(np.logical_and(vec_a_neg, vec_b)) - d = np.sum(np.logical_and(vec_a_neg, vec_b_neg)) + denom_1 = 2 * a + bc_sum + denom_2 = 2 * d + bc_sum - first_denom = 2 * a + b + c - second_denom = 2 * d + b + c + sims = np.zeros_like(a, dtype=float) + np.divide(a, denom_1, out=sims, where=denom_1 != 0) - if first_denom == 0 or second_denom == 0: - sim = 1.0 - else: - sim = a / first_denom + d / second_denom + part_2 = np.zeros_like(a, dtype=float) + np.divide(d, denom_2, out=part_2, where=denom_2 != 0) + sims += part_2 - sims[i, j] = sims[j, i] = sim + denom_zero = (denom_1 == 0) | (denom_2 == 0) + sims[denom_zero] = 1 return sims -@numba.njit(parallel=True) def _bulk_rogot_goldberg_binary_similarity_two( - X: np.ndarray, - 
Y: np.ndarray, + X: csr_array, Y: csr_array ) -> np.ndarray: - m = X.shape[0] - n = Y.shape[0] - sims = np.empty((m, n)) + n_features = X.shape[1] - for i in numba.prange(m): - vec_a = X[i] - vec_a_neg = 1 - vec_a + a = (X @ Y.T).toarray() + + row_sums_X = np.asarray(X.sum(axis=1)).ravel() + row_sums_Y = np.asarray(Y.sum(axis=1)).ravel() + + sum_A = row_sums_X[:, None] + sum_B = row_sums_Y[None, :] + bc = sum_A + sum_B - 2 * a + d = n_features - (a + bc) - for j in numba.prange(n): - vec_b = Y[j] - vec_b_neg = 1 - vec_b + denom_1 = 2 * a + bc + denom_2 = 2 * d + bc - a = np.sum(np.logical_and(vec_a, vec_b)) - b = np.sum(np.logical_and(vec_a, vec_b_neg)) - c = np.sum(np.logical_and(vec_a_neg, vec_b)) - d = np.sum(np.logical_and(vec_a_neg, vec_b_neg)) + sims = np.zeros_like(a, dtype=float) + np.divide(a, denom_1, out=sims, where=denom_1 != 0) - first_denom = 2 * a + b + c - second_denom = 2 * d + b + c + part_2 = np.zeros_like(a, dtype=float) + np.divide(d, denom_2, out=part_2, where=denom_2 != 0) - if first_denom == 0 or second_denom == 0: - sims[i, j] = 1.0 - continue + sims += part_2 - sim = a / first_denom + d / second_denom - sims[i, j] = sim + denom_zero = (denom_1 == 0) | (denom_2 == 0) + sims[denom_zero] = 1 return sims @@ -324,7 +324,7 @@ def _bulk_rogot_goldberg_binary_similarity_two( prefer_skip_nested_validation=True, ) def bulk_rogot_goldberg_binary_distance( - X: np.ndarray, Y: np.ndarray | None = None + X: list | np.ndarray | csr_array, Y: list | np.ndarray | csr_array | None = None ) -> np.ndarray: r""" Bulk Rogot-Goldberg distance for vectors of binary values. @@ -338,11 +338,11 @@ def bulk_rogot_goldberg_binary_distance( Parameters ---------- - X : ndarray - First binary input array, of shape :math:`m \times m` + X : ndarray or CSR sparse array + First binary input array, of shape :math:`m \times d` - Y : ndarray, default=None - Second binary input array, of shape :math:`n \times n`. 
If not passed, distances + Y : ndarray or CSR sparse array, default=None + Second binary input array, of shape :math:`n \times d`. If not passed, distances are computed between rows of X. Returns diff --git a/skfp/distances/russell.py b/skfp/distances/russell.py index c4eb4df5..639a1a78 100644 --- a/skfp/distances/russell.py +++ b/skfp/distances/russell.py @@ -1,4 +1,3 @@ -import numba import numpy as np from scipy.sparse import csr_array from sklearn.utils._param_validation import validate_params @@ -27,7 +26,7 @@ def russell_binary_similarity( where - - :math:`a` - common "on" bits + - :math:`a` - both are 1 (:math:`|x \cap y|`, common "on" bits) - :math:`n` - length of passed vectors The calculated similarity falls within the range :math:`[0, 1]`. @@ -35,10 +34,10 @@ def russell_binary_similarity( Parameters ---------- - vec_a : {ndarray, sparse matrix} + vec_a : {ndarray, CSR sparse array} First binary input array or sparse matrix. - vec_b : {ndarray, sparse matrix} + vec_b : {ndarray, CSR sparse array} Second binary input array or sparse matrix. Returns @@ -91,12 +90,9 @@ def russell_binary_similarity( n = vec_a.shape[1] vec_a_idxs = set(vec_a.indices) vec_b_idxs = set(vec_b.indices) - a = len(vec_a_idxs & vec_b_idxs) - sim = a / n - - return float(sim) + return float(a / n) @validate_params( @@ -126,10 +122,10 @@ def russell_binary_distance( Parameters ---------- - vec_a : {ndarray, sparse matrix} + vec_a : {ndarray, CSR sparse array} First binary input array or sparse matrix. - vec_b : {ndarray, sparse matrix} + vec_b : {ndarray, CSR sparse array} Second binary input array or sparse matrix. 
Returns @@ -173,12 +169,15 @@ def russell_binary_distance( @validate_params( - {"X": ["array-like"], "Y": ["array-like", None]}, + { + "X": ["array-like", csr_array], + "Y": ["array-like", csr_array, None], + }, prefer_skip_nested_validation=True, ) def bulk_russell_binary_similarity( - X: np.ndarray, - Y: np.ndarray | None = None, + X: list | np.ndarray | csr_array, + Y: list | np.ndarray | csr_array | None = None, ) -> np.ndarray: r""" Bulk Russell similarity for binary matrices. @@ -192,11 +191,11 @@ def bulk_russell_binary_similarity( Parameters ---------- - X : ndarray - First binary input array, of shape :math:`m \times m` + X : ndarray or CSR sparse array + First binary input array, of shape :math:`m \times d` - Y : ndarray, default=None - Second binary input array, of shape :math:`n \times n`. If not passed, similarities + Y : ndarray or CSR sparse array, default=None + Second binary input array, of shape :math:`n \times d`. If not passed, similarities are computed between rows of X. Returns @@ -204,64 +203,29 @@ def bulk_russell_binary_similarity( similarities : ndarray Array with pairwise Russell similarity values. Shape is :math:`m \times n` if two arrays are passed, or :math:`m \times m` otherwise. - - See Also - -------- - :py:func:`russell_binary_similarity` : Russell similarity function for two vectors. 
- - Examples - -------- - >>> from skfp.distances import bulk_russell_binary_similarity - >>> import numpy as np - >>> X = np.array([[1, 1, 1], [0, 0, 1]]) - >>> Y = np.array([[1, 0, 1], [0, 1, 1]]) - >>> sim = bulk_russell_binary_similarity(X, Y) - >>> sim - array([[0.66666667, 0.66666667], - [0.33333333, 0.33333333]]) """ + if not isinstance(X, csr_array): + X = csr_array(X) + if Y is None: return _bulk_russell_binary_similarity_single(X) else: + if not isinstance(Y, csr_array): + Y = csr_array(Y) return _bulk_russell_binary_similarity_two(X, Y) -@numba.njit(parallel=True) -def _bulk_russell_binary_similarity_single( - X: np.ndarray, -) -> np.ndarray: - m, length = X.shape - sims = np.empty((m, m)) - - # upper triangle - actual similarities - for i in numba.prange(m): - vec_a = X[i] - for j in numba.prange(i, m): - vec_b = X[j] - a = np.sum(np.logical_and(vec_a, vec_b)) - sim = a / length - sims[i, j] = sims[j, i] = sim - +def _bulk_russell_binary_similarity_single(X: csr_array) -> np.ndarray: + n_features = X.shape[1] + a = (X @ X.T).toarray() + sims = a / n_features return sims -@numba.njit(parallel=True) -def _bulk_russell_binary_similarity_two( - X: np.ndarray, - Y: np.ndarray, -) -> np.ndarray: - m, length = X.shape - n = Y.shape[0] - sims = np.empty((m, n)) - - for i in numba.prange(m): - vec_a = X[i] - for j in numba.prange(n): - vec_b = Y[j] - a = np.sum(np.logical_and(vec_a, vec_b)) - sim = a / length - sims[i, j] = sim - +def _bulk_russell_binary_similarity_two(X: csr_array, Y: csr_array) -> np.ndarray: + n_features = X.shape[1] + a = (X @ Y.T).toarray() + sims = a / n_features return sims @@ -273,7 +237,7 @@ def _bulk_russell_binary_similarity_two( prefer_skip_nested_validation=True, ) def bulk_russell_binary_distance( - X: np.ndarray, Y: np.ndarray | None = None + X: list | np.ndarray | csr_array, Y: list | np.ndarray | csr_array | None = None ) -> np.ndarray: r""" Bulk Russell distance for vectors of binary values. 
diff --git a/skfp/distances/simpson.py b/skfp/distances/simpson.py index 615fdae5..ba30c99c 100644 --- a/skfp/distances/simpson.py +++ b/skfp/distances/simpson.py @@ -1,4 +1,3 @@ -import numba import numpy as np from scipy.sparse import csr_array from sklearn.utils._param_validation import validate_params @@ -90,7 +89,7 @@ def simpson_binary_similarity( min_vec = min(np.sum(vec_a), np.sum(vec_b)) - sim = num_common / min_vec if min_vec != 0 else 0.0 + sim = num_common / min_vec if min_vec != 0 else 0 return float(sim) @@ -171,18 +170,21 @@ def simpson_binary_distance( @validate_params( - {"X": ["array-like"], "Y": ["array-like", None]}, + { + "X": ["array-like", csr_array], + "Y": ["array-like", csr_array, None], + }, prefer_skip_nested_validation=True, ) def bulk_simpson_binary_similarity( - X: np.ndarray, Y: np.ndarray | None = None + X: list | np.ndarray | csr_array, Y: list | np.ndarray | csr_array | None = None ) -> np.ndarray: r""" Bulk Simpson similarity for binary matrices. - Computes the pairwise Simpson [1]_ (also known as asymmetric similarity [2]_ [3]_ - or overlap coefficient [4]_) similarity between binary matrices. If one array is - passed, similarities are computed between its rows. For two arrays, similarities + Computes the pairwise Simpson (also known as asymmetric similarity or overlap + coefficient) similarity between binary matrices. If one array is passed, + similarities are computed between its rows. For two arrays, similarities are between their respective rows, with `i`-th row and `j`-th column in output corresponding to `i`-th row from first array and `j`-th row from second array. @@ -190,40 +192,18 @@ def bulk_simpson_binary_similarity( Parameters ---------- - X : ndarray - First binary input array, of shape :math:`m \times m` + X : ndarray or CSR sparse array + First binary input array, of shape :math:`m \times d`. - Y : ndarray, default=None - Second binary input array, of shape :math:`n \times n`. 
If not passed, similarities - are computed between rows of X. + Y : ndarray or CSR sparse array, default=None + Second binary input array, of shape :math:`n \times d`. If not passed, + similarities are computed between rows of X. Returns ------- similarities : ndarray - Array with pairwise Simpson similarity values. Shape is :math:`m \times n` if two - arrays are passed, or :math:`m \times m` otherwise. - - References - ---------- - .. [1] `Simpson, G.G. - "Mammals and the nature of continents." - American Journal of Science, 241: 1-31 (1943). - `_ - - .. [2] `Deza M.M., Deza E. - "Encyclopedia of Distances." - Springer, Berlin, Heidelberg, 2009. - `_ - - .. [3] `RDKit documentation - `_ - - .. [4] `Overlap coefficient on Wikipedia - `_ - - See Also - -------- - :py:func:`simpson_binary_similarity` : Simpson similarity function for two vectors. + Array with pairwise Simpson similarity values. Shape is :math:`m \times n` + if two arrays are passed, or :math:`m \times m` otherwise. Examples -------- @@ -233,46 +213,50 @@ def bulk_simpson_binary_similarity( >>> Y = np.array([[1, 0, 1], [0, 1, 1]]) >>> sim = bulk_simpson_binary_similarity(X, Y) >>> sim + array([[1. , 0.5], + [1. , 1. ]]) + + >>> from scipy.sparse import csr_array + >>> X = csr_array([[1, 0, 1], [0, 0, 1]]) + >>> Y = csr_array([[1, 0, 1], [0, 1, 1]]) + >>> sim = bulk_simpson_binary_similarity(X, Y) + >>> sim array([[1. , 0.5], [1. , 1. 
]]) """ + if not isinstance(X, csr_array): + X = csr_array(X) + if Y is None: return _bulk_simpson_binary_similarity_single(X) else: + if not isinstance(Y, csr_array): + Y = csr_array(Y) return _bulk_simpson_binary_similarity_two(X, Y) -@numba.njit(parallel=True) -def _bulk_simpson_binary_similarity_single(X: np.ndarray) -> np.ndarray: - m = X.shape[0] - sims = np.empty((m, m)) - row_sums = np.sum(X, axis=1) +def _bulk_simpson_binary_similarity_single(X: csr_array) -> np.ndarray: + intersection = (X @ X.T).toarray() + row_sums = np.array(X.sum(axis=1)).ravel() + denom = np.minimum.outer(row_sums, row_sums) - for i in numba.prange(m): - for j in numba.prange(i, m): - num_common = np.sum(np.logical_and(X[i], X[j])) - min_vec = min(row_sums[i], row_sums[j]) - sim = num_common / min_vec if min_vec != 0 else 0.0 - sims[i, j] = sims[j, i] = sim + with np.errstate(divide="ignore", invalid="ignore"): + sims = np.divide(intersection, denom, where=denom != 0) + sims[denom == 0] = 0 return sims -@numba.njit(parallel=True) -def _bulk_simpson_binary_similarity_two(X: np.ndarray, Y: np.ndarray) -> np.ndarray: - m = X.shape[0] - n = Y.shape[0] - sims = np.empty((m, n)) +def _bulk_simpson_binary_similarity_two(X: csr_array, Y: csr_array) -> np.ndarray: + intersection = (X @ Y.T).toarray() + row_sums_X = np.array(X.sum(axis=1)).ravel() + row_sums_Y = np.array(Y.sum(axis=1)).ravel() + denom = np.minimum(row_sums_X[:, None], row_sums_Y[None, :]) - row_sums_X = np.sum(X, axis=1) - row_sums_Y = np.sum(Y, axis=1) - - for i in numba.prange(m): - for j in numba.prange(n): - num_common = np.sum(np.logical_and(X[i], Y[j])) - min_vec = min(row_sums_X[i], row_sums_Y[j]) - sims[i, j] = num_common / min_vec if min_vec != 0 else 0.0 + with np.errstate(divide="ignore", invalid="ignore"): + sims = np.divide(intersection, denom, where=denom != 0) + sims[denom == 0] = 0 return sims @@ -284,13 +268,13 @@ def _bulk_simpson_binary_similarity_two(X: np.ndarray, Y: np.ndarray) -> np.ndar 
prefer_skip_nested_validation=True, ) def bulk_simpson_binary_distance( - X: np.ndarray, Y: np.ndarray | None = None + X: list | np.ndarray | csr_array, Y: list | np.ndarray | csr_array | None = None ) -> np.ndarray: r""" - Bulk Simpson distance for vectors of binary values. + Bulk Simpson distance for binary matrices. - Computes the pairwise Simpson distance between binary matrices. If one array is - passed, distances are computed between its rows. For two arrays, distances + Computes the pairwise Simpson distance between binary matrices. If one array + is passed, distances are computed between its rows. For two arrays, distances are between their respective rows, with `i`-th row and `j`-th column in output corresponding to `i`-th row from first array and `j`-th row from second array. @@ -298,35 +282,31 @@ def bulk_simpson_binary_distance( Parameters ---------- - X : ndarray - First binary input array, of shape :math:`m \times m` + X : ndarray or CSR sparse array + First binary input array, of shape :math:`m \times d`. - Y : ndarray, default=None - Second binary input array, of shape :math:`n \times n`. If not passed, distances - are computed between rows of X. + Y : ndarray or CSR sparse array, default=None + Second binary input array, of shape :math:`n \times d`. If not passed, + distances are computed between rows of X. Returns ------- distances : ndarray - Array with pairwise Simpson distance values. Shape is :math:`m \times n` if two - arrays are passed, or :math:`m \times m` otherwise. - - See Also - -------- - :py:func:`simpson_binary_distance` : Simpson distance function for two vectors + Array with pairwise Simpson distance values. Shape is :math:`m \times n` + if two arrays are passed, or :math:`m \times m` otherwise. 
Examples -------- >>> from skfp.distances import bulk_simpson_binary_distance >>> import numpy as np >>> X = np.array([[1, 0, 1], [1, 0, 1]]) - >>> Y = np.array([[1, 0, 1], [1, 0, 1]]) - >>> dist = bulk_simpson_binary_distance(X, Y) + >>> dist = bulk_simpson_binary_distance(X) >>> dist array([[0., 0.], [0., 0.]]) - >>> X = np.array([[1, 0, 1], [1, 0, 1]]) + >>> from scipy.sparse import csr_array + >>> X = csr_array([[1, 0, 1], [1, 0, 1]]) >>> dist = bulk_simpson_binary_distance(X) >>> dist array([[0., 0.], diff --git a/skfp/distances/sokal_sneath.py b/skfp/distances/sokal_sneath.py index 327c33d2..8d19f8fb 100644 --- a/skfp/distances/sokal_sneath.py +++ b/skfp/distances/sokal_sneath.py @@ -1,4 +1,3 @@ -import numba import numpy as np from scipy.sparse import csr_array from sklearn.utils._param_validation import validate_params @@ -175,41 +174,46 @@ def sokal_sneath_2_binary_distance( @validate_params( - {"X": ["array-like"], "Y": ["array-like", None]}, + { + "X": ["array-like", csr_array], + "Y": ["array-like", csr_array, None], + }, prefer_skip_nested_validation=True, ) def bulk_sokal_sneath_2_binary_similarity( - X: np.ndarray, - Y: np.ndarray | None = None, + X: list | np.ndarray | csr_array, + Y: list | np.ndarray | csr_array | None = None, ) -> np.ndarray: r""" Bulk Sokal-Sneath similarity 2 for binary matrices. - Computes the pairwise Sokal-Sneath similarity 2 between binary matrices. If one array is - passed, similarities are computed between its rows. For two arrays, similarities - are between their respective rows, with `i`-th row and `j`-th column in output - corresponding to `i`-th row from first array and `j`-th row from second array. + Computes the pairwise Sokal-Sneath similarity 2 between binary matrices. + If one array is passed, similarities are computed between its rows. 
+ For two arrays, similarities are between their respective rows, with + `i`-th row and `j`-th column in output corresponding to `i`-th row from the + first array and `j`-th row from the second array. - See also :py:func:`sokal_sneath_2_binary_similarity`. + The formula is: + + .. math:: + + sim(a, b) = \frac{|a \cap b|}{2|a| + 2|b| - 3|a \cap b|} Parameters ---------- - X : ndarray - First binary input array, of shape :math:`m \times m` + X : ndarray or CSR sparse array + First binary input array, of shape :math:`m \times d`. - Y : ndarray, default=None - Second binary input array, of shape :math:`n \times n`. If not passed, similarities - are computed between rows of X. + Y : ndarray or CSR sparse array, default=None + Second binary input array, of shape :math:`n \times d`. If not passed, + similarities are computed between rows of X. Returns ------- similarities : ndarray - Array with pairwise Sokal-Sneath similarity 2 values. Shape is :math:`m \times n` if two - arrays are passed, or :math:`m \times m` otherwise. - - See Also - -------- - :py:func:`sokal_sneath_2_binary_similarity` : Sokal-Sneath similarity 2 function for two vectors. + Array with pairwise Sokal-Sneath similarity 2 values. Shape is + :math:`m \times n` if two arrays are passed, or :math:`m \times m` + otherwise. 
Examples -------- @@ -217,70 +221,60 @@ def bulk_sokal_sneath_2_binary_similarity( >>> import numpy as np >>> X = np.array([[1, 1, 1], [0, 0, 1]]) >>> Y = np.array([[1, 0, 1], [0, 1, 1]]) - >>> sim = bulk_sokal_sneath_2_binary_similarity(X, Y) - >>> sim + >>> bulk_sokal_sneath_2_binary_similarity(X, Y) + array([[0.5 , 0.5 ], + [0.33333333, 0.33333333]]) + + >>> from scipy.sparse import csr_array + >>> X = csr_array([[1, 1, 1], [0, 0, 1]]) + >>> Y = csr_array([[1, 0, 1], [0, 1, 1]]) + >>> bulk_sokal_sneath_2_binary_similarity(X, Y) array([[0.5 , 0.5 ], [0.33333333, 0.33333333]]) """ + if not isinstance(X, csr_array): + X = csr_array(X) + if Y is None: return _bulk_sokal_sneath_2_binary_similarity_single(X) else: + if not isinstance(Y, csr_array): + Y = csr_array(Y) return _bulk_sokal_sneath_2_binary_similarity_two(X, Y) -@numba.njit(parallel=True) -def _bulk_sokal_sneath_2_binary_similarity_single( - X: np.ndarray, -) -> np.ndarray: - m = X.shape[0] - sims = np.empty((m, m)) - X_sum = np.sum(X, axis=1) - - # upper triangle - actual similarities - for i in numba.prange(m): - vec_a = X[i] - sum_a = X_sum[i] - sims[i, i] = 1.0 +def _bulk_sokal_sneath_2_binary_similarity_single(X: csr_array) -> np.ndarray: + intersection = (X @ X.T).toarray() + row_sums = np.array(X.sum(axis=1)).ravel() - for j in numba.prange(i + 1, m): - vec_b = X[j] - sum_b = X_sum[j] + sum_A = row_sums[:, None] + sum_B = row_sums[None, :] + denominator = 2 * sum_A + 2 * sum_B - 3 * intersection - intersection = np.sum(np.logical_and(vec_a, vec_b)) + with np.errstate(divide="ignore", invalid="ignore"): + sims = np.divide(intersection, denominator, where=denominator > 0) - denominator = 2 * sum_a + 2 * sum_b - 3 * intersection - sim = intersection / denominator if denominator > 0 else 1.0 - - sims[i, j] = sims[j, i] = sim + sims[denominator == 0] = 1 + np.fill_diagonal(sims, 1) return sims -@numba.njit(parallel=True) def _bulk_sokal_sneath_2_binary_similarity_two( - X: np.ndarray, - Y: np.ndarray, + 
X: csr_array, Y: csr_array ) -> np.ndarray: - m = X.shape[0] - n = Y.shape[0] - sims = np.empty((m, n)) - X_sum = np.sum(X, axis=1) - Y_sum = np.sum(Y, axis=1) - - for i in numba.prange(m): - vec_a = X[i] - sum_a = X_sum[i] - - for j in numba.prange(n): - vec_b = Y[j] - sum_b = Y_sum[j] + intersection = (X @ Y.T).toarray() + row_sums_X = np.array(X.sum(axis=1)).ravel() + row_sums_Y = np.array(Y.sum(axis=1)).ravel() - intersection = np.sum(np.logical_and(vec_a, vec_b)) + sum_A = row_sums_X[:, None] + sum_B = row_sums_Y[None, :] + denominator = 2 * sum_A + 2 * sum_B - 3 * intersection - denominator = 2 * sum_a + 2 * sum_b - 3 * intersection - sim = intersection / denominator if denominator > 0 else 1.0 + with np.errstate(divide="ignore", invalid="ignore"): + sims = np.divide(intersection, denominator, where=denominator > 0) - sims[i, j] = sim + sims[denominator == 0] = 1 return sims @@ -293,36 +287,34 @@ def _bulk_sokal_sneath_2_binary_similarity_two( prefer_skip_nested_validation=True, ) def bulk_sokal_sneath_2_binary_distance( - X: np.ndarray, Y: np.ndarray | None = None + X: list | np.ndarray | csr_array, Y: list | np.ndarray | csr_array | None = None ) -> np.ndarray: r""" - Bulk Sokal-Sneath distance 2 for vectors of binary values. + Bulk Sokal-Sneath distance 2 for binary matrices. - Computes the pairwise Sokal-Sneath distance 2 between binary matrices. If one array is - passed, distances are computed between its rows. For two arrays, distances - are between their respective rows, with `i`-th row and `j`-th column in output - corresponding to `i`-th row from first array and `j`-th row from second array. + Computes the pairwise Sokal-Sneath distance 2 between binary matrices. If + one array is passed, distances are computed between its rows. For two arrays, + distances are between their respective rows, with `i`-th row and `j`-th + column in output corresponding to `i`-th row from first array and `j`-th row + from second array. 
See also :py:func:`sokal_sneath_2_binary_distance`. Parameters ---------- - X : ndarray - First binary input array, of shape :math:`m \times m` + X : ndarray or CSR sparse array + First binary input array, of shape :math:`m \times d`. - Y : ndarray, default=None - Second binary input array, of shape :math:`n \times n`. If not passed, distances - are computed between rows of X. + Y : ndarray or CSR sparse array, default=None + Second binary input array, of shape :math:`n \times d`. If not passed, + distances are computed between rows of X. Returns ------- distances : ndarray - Array with pairwise Sokal-Sneath distance 2 values. Shape is :math:`m \times n` if two - arrays are passed, or :math:`m \times m` otherwise. - - See Also - -------- - :py:func:`sokal_sneath_2_binary_distance` : Sokal-Sneath distance 2 function for two vectors + Array with pairwise Sokal-Sneath distance 2 values. Shape is + :math:`m \times n` if two arrays are passed, or :math:`m \times m` + otherwise. Examples -------- @@ -330,15 +322,15 @@ def bulk_sokal_sneath_2_binary_distance( >>> import numpy as np >>> X = np.array([[1, 1, 1], [1, 0, 1]]) >>> Y = np.array([[1, 0, 1], [1, 1, 0]]) - >>> dist = bulk_sokal_sneath_2_binary_distance(X, Y) - >>> dist + >>> bulk_sokal_sneath_2_binary_distance(X, Y) array([[0.5, 0.5], [0. , 0.8]]) - >>> X = np.array([[1, 1, 1], [1, 0, 0]]) - >>> dist = bulk_sokal_sneath_2_binary_distance(X) - >>> dist - array([[0. , 0.8], - [0.8, 0. ]]) + >>> from scipy.sparse import csr_array + >>> X = csr_array([[1, 1, 1], [1, 0, 1]]) + >>> Y = csr_array([[1, 0, 1], [1, 1, 0]]) + >>> bulk_sokal_sneath_2_binary_distance(X, Y) + array([[0.5, 0.5], + [0. 
, 0.8]]) """ return 1 - bulk_sokal_sneath_2_binary_similarity(X, Y) diff --git a/tests/distances/braun_blanquet.py b/tests/distances/braun_blanquet.py index 25e393fe..79a621d5 100644 --- a/tests/distances/braun_blanquet.py +++ b/tests/distances/braun_blanquet.py @@ -1,6 +1,4 @@ -import numpy as np import pytest -from scipy.sparse import csr_array from skfp.distances import ( braun_blanquet_binary_distance, @@ -10,7 +8,11 @@ bulk_braun_blanquet_binary_distance, bulk_braun_blanquet_binary_similarity, ) -from skfp.fingerprints.ecfp import ECFPFingerprint +from tests.distances.utils import ( + run_test_bulk_similarity_and_distance, + run_test_bulk_similarity_and_distance_two_arrays, + run_test_similarity_and_distance, +) def _get_values() -> list[tuple[list[int], list[int], float, float]]: @@ -28,52 +30,31 @@ def _get_values() -> list[tuple[list[int], list[int], float, float]]: @pytest.mark.parametrize("vec_a, vec_b, similarity, distance", _get_values()) def test_braun_blanquet(vec_a, vec_b, similarity, distance): - vec_a = np.array(vec_a) - vec_b = np.array(vec_b) - - vec_a_sparse = csr_array([vec_a]) - vec_b_sparse = csr_array([vec_b]) - - sim_dense = braun_blanquet_binary_similarity(vec_a, vec_b) - dist_dense = braun_blanquet_binary_distance(vec_a, vec_b) - - sim_sparse = braun_blanquet_binary_similarity(vec_a_sparse, vec_b_sparse) - dist_sparse = braun_blanquet_binary_distance(vec_a_sparse, vec_b_sparse) - - assert np.isclose(sim_dense, similarity, atol=1e-3) - assert np.isclose(sim_sparse, similarity, atol=1e-3) - - assert np.isclose(dist_dense, distance, atol=1e-3) - assert np.isclose(dist_sparse, distance, atol=1e-3) - - assert np.isclose(sim_dense, sim_sparse) - assert np.isclose(dist_dense, dist_sparse) - - -def test_bulk_braun_blanquet_binary(mols_list): - fp = ECFPFingerprint() - fps = fp.transform(mols_list[:10]) - - pairwise_sim = [ - [braun_blanquet_binary_similarity(fps[i], fps[j]) for j in range(len(fps))] - for i in range(len(fps)) - ] - pairwise_dist 
= [ - [braun_blanquet_binary_distance(fps[i], fps[j]) for j in range(len(fps))] - for i in range(len(fps)) - ] - - bulk_sim = bulk_braun_blanquet_binary_similarity(fps) - bulk_dist = bulk_braun_blanquet_binary_distance(fps) - - assert np.allclose(pairwise_sim, bulk_sim) - assert np.allclose(pairwise_dist, bulk_dist) + run_test_similarity_and_distance( + braun_blanquet_binary_similarity, + braun_blanquet_binary_distance, + vec_a, + vec_b, + similarity, + distance, + ) + + +def test_bulk_braun_blanquet(mols_list): + run_test_bulk_similarity_and_distance( + mols_list, + braun_blanquet_binary_similarity, + braun_blanquet_binary_distance, + bulk_braun_blanquet_binary_similarity, + bulk_braun_blanquet_binary_distance, + ) def test_bulk_braun_blanquet_second_array(mols_list): - fp = ECFPFingerprint() - fps = fp.transform(mols_list[:10]) - - bulk_sim_single = bulk_braun_blanquet_binary_similarity(fps) - bulk_sim_two = bulk_braun_blanquet_binary_similarity(fps, fps) - assert np.allclose(bulk_sim_single, bulk_sim_two) + run_test_bulk_similarity_and_distance_two_arrays( + mols_list, + braun_blanquet_binary_similarity, + braun_blanquet_binary_distance, + bulk_braun_blanquet_binary_similarity, + bulk_braun_blanquet_binary_distance, + ) diff --git a/tests/distances/harris_lahey.py b/tests/distances/harris_lahey.py index d24544ff..74f92e6d 100644 --- a/tests/distances/harris_lahey.py +++ b/tests/distances/harris_lahey.py @@ -1,6 +1,4 @@ -import numpy as np import pytest -from scipy.sparse import csr_array from skfp.distances import ( harris_lahey_binary_distance, @@ -10,7 +8,11 @@ bulk_harris_lahey_binary_distance, bulk_harris_lahey_binary_similarity, ) -from skfp.fingerprints.ecfp import ECFPFingerprint +from tests.distances.utils import ( + run_test_bulk_similarity_and_distance, + run_test_bulk_similarity_and_distance_two_arrays, + run_test_similarity_and_distance, +) def _get_unnormalized_values() -> list[tuple[list[int], list[int], float, float]]: @@ -41,72 +43,49 @@ def 
_get_normalized_values() -> list[tuple[list[int], list[int], float, float]]: "vec_a, vec_b, similarity, distance", _get_unnormalized_values() ) def test_harris_lahey_unnormalized(vec_a, vec_b, similarity, distance): - vec_a = np.array(vec_a) - vec_b = np.array(vec_b) - - vec_a_sparse = csr_array([vec_a]) - vec_b_sparse = csr_array([vec_b]) - - sim_dense = harris_lahey_binary_similarity(vec_a, vec_b) - dist_dense = harris_lahey_binary_distance(vec_a, vec_b) - - sim_sparse = harris_lahey_binary_similarity(vec_a_sparse, vec_b_sparse) - dist_sparse = harris_lahey_binary_distance(vec_a_sparse, vec_b_sparse) - - assert np.isclose(sim_dense, similarity, atol=1e-3) - assert np.isclose(sim_sparse, similarity, atol=1e-3) - - assert np.isclose(dist_dense, distance, atol=1e-3) - assert np.isclose(dist_sparse, distance, atol=1e-3) - - assert np.isclose(sim_dense, sim_sparse) - assert np.isclose(dist_dense, dist_sparse) + run_test_similarity_and_distance( + harris_lahey_binary_similarity, + harris_lahey_binary_distance, + vec_a, + vec_b, + similarity, + distance, + normalized=False, + ) @pytest.mark.parametrize("vec_a, vec_b, similarity, distance", _get_normalized_values()) def test_harris_lahey_normalized(vec_a, vec_b, similarity, distance): - # only similarity, since distance is always normalized - vec_a = np.array(vec_a) - vec_b = np.array(vec_b) - - vec_a_sparse = csr_array([vec_a]) - vec_b_sparse = csr_array([vec_b]) - - sim_dense = harris_lahey_binary_similarity(vec_a, vec_b, normalized=True) - sim_sparse = harris_lahey_binary_similarity( - vec_a_sparse, vec_b_sparse, normalized=True + run_test_similarity_and_distance( + harris_lahey_binary_similarity, + harris_lahey_binary_distance, + vec_a, + vec_b, + similarity, + distance, + normalized=True, ) - assert np.isclose(sim_dense, similarity, atol=1e-3) - assert np.isclose(sim_sparse, similarity, atol=1e-3) - - assert np.isclose(sim_dense, sim_sparse) - - -def test_bulk_harris_lahey_binary(mols_list): - fp = ECFPFingerprint() 
- fps = fp.transform(mols_list[:10]) - - pairwise_sim = [ - [harris_lahey_binary_similarity(fps[i], fps[j]) for j in range(len(fps))] - for i in range(len(fps)) - ] - pairwise_dist = [ - [harris_lahey_binary_distance(fps[i], fps[j]) for j in range(len(fps))] - for i in range(len(fps)) - ] - - bulk_sim = bulk_harris_lahey_binary_similarity(fps) - bulk_dist = bulk_harris_lahey_binary_distance(fps) - - assert np.allclose(pairwise_sim, bulk_sim) - assert np.allclose(pairwise_dist, bulk_dist) +@pytest.mark.parametrize("normalized", [False, True]) +def test_bulk_harris_lahey(mols_list, normalized): + run_test_bulk_similarity_and_distance( + mols_list, + harris_lahey_binary_similarity, + harris_lahey_binary_distance, + bulk_harris_lahey_binary_similarity, + bulk_harris_lahey_binary_distance, + normalized=normalized, + ) -def test_bulk_harris_lahey_second_array(mols_list): - fp = ECFPFingerprint() - fps = fp.transform(mols_list[:10]) - bulk_sim_single = bulk_harris_lahey_binary_similarity(fps) - bulk_sim_two = bulk_harris_lahey_binary_similarity(fps, fps) - assert np.allclose(bulk_sim_single, bulk_sim_two) +@pytest.mark.parametrize("normalized", [False, True]) +def test_bulk_harris_lahey_second_array(mols_list, normalized): + run_test_bulk_similarity_and_distance_two_arrays( + mols_list, + harris_lahey_binary_similarity, + harris_lahey_binary_distance, + bulk_harris_lahey_binary_similarity, + bulk_harris_lahey_binary_distance, + normalized=normalized, + ) diff --git a/tests/distances/kulczynski.py b/tests/distances/kulczynski.py index f8f1565b..25cfd815 100644 --- a/tests/distances/kulczynski.py +++ b/tests/distances/kulczynski.py @@ -1,6 +1,4 @@ -import numpy as np import pytest -from scipy.sparse import csr_array from skfp.distances import ( kulczynski_binary_distance, @@ -10,7 +8,11 @@ bulk_kulczynski_binary_distance, bulk_kulczynski_binary_similarity, ) -from skfp.fingerprints.ecfp import ECFPFingerprint +from tests.distances.utils import ( + 
run_test_bulk_similarity_and_distance, + run_test_bulk_similarity_and_distance_two_arrays, + run_test_similarity_and_distance, +) def _get_values() -> list[tuple[list[int], list[int], float, float]]: @@ -27,52 +29,31 @@ def _get_values() -> list[tuple[list[int], list[int], float, float]]: @pytest.mark.parametrize("vec_a, vec_b, similarity, distance", _get_values()) def test_kulczynski(vec_a, vec_b, similarity, distance): - vec_a = np.array(vec_a) - vec_b = np.array(vec_b) - - vec_a_sparse = csr_array([vec_a]) - vec_b_sparse = csr_array([vec_b]) - - sim_dense = kulczynski_binary_similarity(vec_a, vec_b) - dist_dense = kulczynski_binary_distance(vec_a, vec_b) - - sim_sparse = kulczynski_binary_similarity(vec_a_sparse, vec_b_sparse) - dist_sparse = kulczynski_binary_distance(vec_a_sparse, vec_b_sparse) - - assert np.isclose(sim_dense, similarity, atol=1e-3) - assert np.isclose(sim_sparse, similarity, atol=1e-3) - - assert np.isclose(dist_dense, distance, atol=1e-3) - assert np.isclose(dist_sparse, distance, atol=1e-3) - - assert np.isclose(sim_dense, sim_sparse) - assert np.isclose(dist_dense, dist_sparse) - - -def test_bulk_kulczynski_binary(mols_list): - fp = ECFPFingerprint() - fps = fp.transform(mols_list[:10]) - - pairwise_sim = [ - [kulczynski_binary_similarity(fps[i], fps[j]) for j in range(len(fps))] - for i in range(len(fps)) - ] - pairwise_dist = [ - [kulczynski_binary_distance(fps[i], fps[j]) for j in range(len(fps))] - for i in range(len(fps)) - ] - - bulk_sim = bulk_kulczynski_binary_similarity(fps) - bulk_dist = bulk_kulczynski_binary_distance(fps) - - assert np.allclose(pairwise_sim, bulk_sim) - assert np.allclose(pairwise_dist, bulk_dist) + run_test_similarity_and_distance( + kulczynski_binary_similarity, + kulczynski_binary_distance, + vec_a, + vec_b, + similarity, + distance, + ) + + +def test_bulk_kulczynski(mols_list): + run_test_bulk_similarity_and_distance( + mols_list, + kulczynski_binary_similarity, + kulczynski_binary_distance, + 
bulk_kulczynski_binary_similarity, + bulk_kulczynski_binary_distance, + ) def test_bulk_kulczynski_second_array(mols_list): - fp = ECFPFingerprint() - fps = fp.transform(mols_list[:10]) - - bulk_sim_single = bulk_kulczynski_binary_similarity(fps) - bulk_sim_two = bulk_kulczynski_binary_similarity(fps, fps) - assert np.allclose(bulk_sim_single, bulk_sim_two) + run_test_bulk_similarity_and_distance_two_arrays( + mols_list, + kulczynski_binary_similarity, + kulczynski_binary_distance, + bulk_kulczynski_binary_similarity, + bulk_kulczynski_binary_distance, + ) diff --git a/tests/distances/mcconnaughey.py b/tests/distances/mcconnaughey.py index 91f11ba2..fbf0529a 100644 --- a/tests/distances/mcconnaughey.py +++ b/tests/distances/mcconnaughey.py @@ -1,81 +1,86 @@ -import numpy as np import pytest -from scipy.sparse import csr_array from skfp.distances import mcconnaughey_binary_distance, mcconnaughey_binary_similarity from skfp.distances.mcconnaughey import ( bulk_mcconnaughey_binary_distance, bulk_mcconnaughey_binary_similarity, ) -from skfp.fingerprints.ecfp import ECFPFingerprint +from tests.distances.utils import ( + run_test_bulk_similarity_and_distance, + run_test_bulk_similarity_and_distance_two_arrays, + run_test_similarity_and_distance, +) -def _get_values() -> list[tuple[list[int], list[int], float, float, bool]]: - # vec_a, vec_b, similarity, distance, normalized +def _get_unnormalized_values() -> list[tuple[list[int], list[int], float, float]]: + # vec_a, vec_b, similarity, distance return [ - ([1, 0, 0], [0, 1, 1], 0.0, 1.0, True), - ([1, 0, 0], [0, 0, 0], 0.0, 1.0, True), - ([0, 0, 0], [0, 0, 0], 1.0, 0.0, True), - ([1, 0, 0], [1, 0, 0], 1.0, 0.0, True), - ([1, 1, 1], [1, 1, 1], 1.0, 0.0, True), - ([1, 1, 1, 0], [1, 1, 1, 1], 0.875, 0.125, True), - ([1, 0, 0], [0, 1, 1], -1.0, 1.0, False), - ([0, 0, 0], [0, 0, 0], 1.0, 0.0, False), - ([1, 0, 0], [1, 0, 0], 1.0, 0.0, False), - ([1, 1, 1], [1, 1, 1], 1.0, 0.0, False), + ([1, 0, 0], [0, 1, 1], -1.0, 1.0), + 
([0, 0, 0], [0, 0, 0], 1.0, 0.0), + ([1, 0, 0], [1, 0, 0], 1.0, 0.0), + ([1, 1, 1], [1, 1, 1], 1.0, 0.0), ] -@pytest.mark.parametrize( - "vec_a, vec_b, similarity, distance, normalized", _get_values() -) -def test_mcconnaughey(vec_a, vec_b, similarity, distance, normalized): - vec_a = np.array(vec_a) - vec_b = np.array(vec_b) - - vec_a_sparse = csr_array([vec_a]) - vec_b_sparse = csr_array([vec_b]) - - sim_dense = mcconnaughey_binary_similarity(vec_a, vec_b, normalized) - dist_dense = mcconnaughey_binary_distance(vec_a, vec_b) - - sim_sparse = mcconnaughey_binary_similarity(vec_a_sparse, vec_b_sparse, normalized) - dist_sparse = mcconnaughey_binary_distance(vec_a_sparse, vec_b_sparse) - - assert np.isclose(sim_dense, similarity, atol=1e-3) - assert np.isclose(sim_sparse, similarity, atol=1e-3) - - assert np.isclose(dist_dense, distance, atol=1e-3) - assert np.isclose(dist_sparse, distance, atol=1e-3) - - assert np.isclose(sim_dense, sim_sparse) - assert np.isclose(dist_dense, dist_sparse) - - -def test_bulk_mcconaughey_binary(mols_list): - fp = ECFPFingerprint() - fps = fp.transform(mols_list[:10]) - - pairwise_sim = [ - [mcconnaughey_binary_similarity(fps[i], fps[j]) for j in range(len(fps))] - for i in range(len(fps)) - ] - pairwise_dist = [ - [mcconnaughey_binary_distance(fps[i], fps[j]) for j in range(len(fps))] - for i in range(len(fps)) +def _get_normalized_values() -> list[tuple[list[int], list[int], float, float]]: + # vec_a, vec_b, similarity, distance + return [ + ([1, 0, 0], [0, 1, 1], 0.0, 1.0), + ([1, 0, 0], [0, 0, 0], 0.0, 1.0), + ([0, 0, 0], [0, 0, 0], 1.0, 0.0), + ([1, 0, 0], [1, 0, 0], 1.0, 0.0), + ([1, 1, 1], [1, 1, 1], 1.0, 0.0), + ([1, 1, 1, 0], [1, 1, 1, 1], 0.875, 0.125), ] - bulk_sim = bulk_mcconnaughey_binary_similarity(fps) - bulk_dist = bulk_mcconnaughey_binary_distance(fps) - - assert np.allclose(pairwise_sim, bulk_sim) - assert np.allclose(pairwise_dist, bulk_dist) - -def test_bulk_mcconaughey_second_array(mols_list): - fp = 
ECFPFingerprint() - fps = fp.transform(mols_list[:10]) - - bulk_sim_single = bulk_mcconnaughey_binary_similarity(fps) - bulk_sim_two = bulk_mcconnaughey_binary_similarity(fps, fps) - assert np.allclose(bulk_sim_single, bulk_sim_two) +@pytest.mark.parametrize( + "vec_a, vec_b, similarity, distance", _get_unnormalized_values() +) +def test_mcconnaughey_unnormalized(vec_a, vec_b, similarity, distance): + run_test_similarity_and_distance( + mcconnaughey_binary_similarity, + mcconnaughey_binary_distance, + vec_a, + vec_b, + similarity, + distance, + normalized=False, + ) + + +@pytest.mark.parametrize("vec_a, vec_b, similarity, distance", _get_normalized_values()) +def test_mcconnaughey_normalized(vec_a, vec_b, similarity, distance): + run_test_similarity_and_distance( + mcconnaughey_binary_similarity, + mcconnaughey_binary_distance, + vec_a, + vec_b, + similarity, + distance, + normalized=True, + ) + + +@pytest.mark.parametrize("normalized", [False, True]) +def test_bulk_mcconnaughey(mols_list, normalized): + run_test_bulk_similarity_and_distance( + mols_list, + mcconnaughey_binary_similarity, + mcconnaughey_binary_distance, + bulk_mcconnaughey_binary_similarity, + bulk_mcconnaughey_binary_distance, + normalized=normalized, + ) + + +@pytest.mark.parametrize("normalized", [False, True]) +def test_bulk_mcconnaughey_second_array(mols_list, normalized): + run_test_bulk_similarity_and_distance_two_arrays( + mols_list, + mcconnaughey_binary_similarity, + mcconnaughey_binary_distance, + bulk_mcconnaughey_binary_similarity, + bulk_mcconnaughey_binary_distance, + normalized=normalized, + ) diff --git a/tests/distances/rand.py b/tests/distances/rand.py index 049892d6..8be0d9e1 100644 --- a/tests/distances/rand.py +++ b/tests/distances/rand.py @@ -10,7 +10,7 @@ def _get_values() -> list[tuple[list[int], list[int], float, float]]: - # vec_a, vec_b, comparison, similarity, distance + # vec_a, vec_b, similarity, distance return [ ([1, 0, 0], [0, 1, 1], 0.0, 1.0), ([1, 0, 0], [0, 0, 
0], 2 / 3, 1 / 3), diff --git a/tests/distances/rogot_goldberg.py b/tests/distances/rogot_goldberg.py index 22839ffa..5742fb79 100644 --- a/tests/distances/rogot_goldberg.py +++ b/tests/distances/rogot_goldberg.py @@ -1,6 +1,4 @@ -import numpy as np import pytest -from scipy.sparse import csr_array from skfp.distances import ( rogot_goldberg_binary_distance, @@ -10,7 +8,11 @@ bulk_rogot_goldberg_binary_distance, bulk_rogot_goldberg_binary_similarity, ) -from skfp.fingerprints.ecfp import ECFPFingerprint +from tests.distances.utils import ( + run_test_bulk_similarity_and_distance, + run_test_bulk_similarity_and_distance_two_arrays, + run_test_similarity_and_distance, +) def _get_values() -> list[tuple[list[int], list[int], float, float]]: @@ -27,52 +29,31 @@ def _get_values() -> list[tuple[list[int], list[int], float, float]]: @pytest.mark.parametrize("vec_a, vec_b, similarity, distance", _get_values()) def test_rogot_goldberg(vec_a, vec_b, similarity, distance): - vec_a = np.array(vec_a) - vec_b = np.array(vec_b) - - vec_a_sparse = csr_array([vec_a]) - vec_b_sparse = csr_array([vec_b]) - - sim_dense = rogot_goldberg_binary_similarity(vec_a, vec_b) - dist_dense = rogot_goldberg_binary_distance(vec_a, vec_b) - - sim_sparse = rogot_goldberg_binary_similarity(vec_a_sparse, vec_b_sparse) - dist_sparse = rogot_goldberg_binary_distance(vec_a_sparse, vec_b_sparse) - - assert np.isclose(sim_dense, similarity, atol=1e-3) - assert np.isclose(sim_sparse, similarity, atol=1e-3) - - assert np.isclose(dist_dense, distance, atol=1e-3) - assert np.isclose(dist_sparse, distance, atol=1e-3) - - assert np.isclose(sim_dense, sim_sparse) - assert np.isclose(dist_dense, dist_sparse) - - -def test_bulk_rogot_goldberg_binary(mols_list): - fp = ECFPFingerprint() - fps = fp.transform(mols_list[:10]) - - pairwise_sim = [ - [rogot_goldberg_binary_similarity(fps[i], fps[j]) for j in range(len(fps))] - for i in range(len(fps)) - ] - pairwise_dist = [ - [rogot_goldberg_binary_distance(fps[i], 
fps[j]) for j in range(len(fps))] - for i in range(len(fps)) - ] - - bulk_sim = bulk_rogot_goldberg_binary_similarity(fps) - bulk_dist = bulk_rogot_goldberg_binary_distance(fps) - - assert np.allclose(pairwise_sim, bulk_sim) - assert np.allclose(pairwise_dist, bulk_dist) + run_test_similarity_and_distance( + rogot_goldberg_binary_similarity, + rogot_goldberg_binary_distance, + vec_a, + vec_b, + similarity, + distance, + ) + + +def test_bulk_rogot_goldberg(mols_list): + run_test_bulk_similarity_and_distance( + mols_list, + rogot_goldberg_binary_similarity, + rogot_goldberg_binary_distance, + bulk_rogot_goldberg_binary_similarity, + bulk_rogot_goldberg_binary_distance, + ) def test_bulk_rogot_goldberg_second_array(mols_list): - fp = ECFPFingerprint() - fps = fp.transform(mols_list[:10]) - - bulk_sim_single = bulk_rogot_goldberg_binary_similarity(fps) - bulk_sim_two = bulk_rogot_goldberg_binary_similarity(fps, fps) - assert np.allclose(bulk_sim_single, bulk_sim_two) + run_test_bulk_similarity_and_distance_two_arrays( + mols_list, + rogot_goldberg_binary_similarity, + rogot_goldberg_binary_distance, + bulk_rogot_goldberg_binary_similarity, + bulk_rogot_goldberg_binary_distance, + ) diff --git a/tests/distances/russell.py b/tests/distances/russell.py index a5244d5c..cfdd3a5b 100644 --- a/tests/distances/russell.py +++ b/tests/distances/russell.py @@ -1,13 +1,15 @@ -import numpy as np import pytest -from scipy.sparse import csr_array from skfp.distances import russell_binary_distance, russell_binary_similarity from skfp.distances.russell import ( bulk_russell_binary_distance, bulk_russell_binary_similarity, ) -from skfp.fingerprints.ecfp import ECFPFingerprint +from tests.distances.utils import ( + run_test_bulk_similarity_and_distance, + run_test_bulk_similarity_and_distance_two_arrays, + run_test_similarity_and_distance, +) def _get_values() -> list[tuple[list[int], list[int], float, float]]: @@ -25,52 +27,31 @@ def _get_values() -> list[tuple[list[int], list[int], 
float, float]]: @pytest.mark.parametrize("vec_a, vec_b, similarity, distance", _get_values()) def test_russell(vec_a, vec_b, similarity, distance): - vec_a = np.array(vec_a) - vec_b = np.array(vec_b) - - vec_a_sparse = csr_array([vec_a]) - vec_b_sparse = csr_array([vec_b]) - - sim_dense = russell_binary_similarity(vec_a, vec_b) - dist_dense = russell_binary_distance(vec_a, vec_b) - - sim_sparse = russell_binary_similarity(vec_a_sparse, vec_b_sparse) - dist_sparse = russell_binary_distance(vec_a_sparse, vec_b_sparse) - - assert np.isclose(sim_dense, similarity, atol=1e-3) - assert np.isclose(sim_sparse, similarity, atol=1e-3) - - assert np.isclose(dist_dense, distance, atol=1e-3) - assert np.isclose(dist_sparse, distance, atol=1e-3) - - assert np.isclose(sim_dense, sim_sparse) - assert np.isclose(dist_dense, dist_sparse) - - -def test_bulk_russell_binary(mols_list): - fp = ECFPFingerprint() - fps = fp.transform(mols_list[:10]) - - pairwise_sim = [ - [russell_binary_similarity(fps[i], fps[j]) for j in range(len(fps))] - for i in range(len(fps)) - ] - pairwise_dist = [ - [russell_binary_distance(fps[i], fps[j]) for j in range(len(fps))] - for i in range(len(fps)) - ] - - bulk_sim = bulk_russell_binary_similarity(fps) - bulk_dist = bulk_russell_binary_distance(fps) - - assert np.allclose(pairwise_sim, bulk_sim) - assert np.allclose(pairwise_dist, bulk_dist) + run_test_similarity_and_distance( + russell_binary_similarity, + russell_binary_distance, + vec_a, + vec_b, + similarity, + distance, + ) + + +def test_bulk_russell(mols_list): + run_test_bulk_similarity_and_distance( + mols_list, + russell_binary_similarity, + russell_binary_distance, + bulk_russell_binary_similarity, + bulk_russell_binary_distance, + ) def test_bulk_russell_second_array(mols_list): - fp = ECFPFingerprint() - fps = fp.transform(mols_list[:10]) - - bulk_sim_single = bulk_russell_binary_similarity(fps) - bulk_sim_two = bulk_russell_binary_similarity(fps, fps) - assert np.allclose(bulk_sim_single, 
bulk_sim_two) + run_test_bulk_similarity_and_distance_two_arrays( + mols_list, + russell_binary_similarity, + russell_binary_distance, + bulk_russell_binary_similarity, + bulk_russell_binary_distance, + ) diff --git a/tests/distances/simpson.py b/tests/distances/simpson.py index fe3e71bf..fc18ecca 100644 --- a/tests/distances/simpson.py +++ b/tests/distances/simpson.py @@ -1,13 +1,15 @@ -import numpy as np import pytest -from scipy.sparse import csr_array from skfp.distances import simpson_binary_distance, simpson_binary_similarity from skfp.distances.simpson import ( bulk_simpson_binary_distance, bulk_simpson_binary_similarity, ) -from skfp.fingerprints.ecfp import ECFPFingerprint +from tests.distances.utils import ( + run_test_bulk_similarity_and_distance, + run_test_bulk_similarity_and_distance_two_arrays, + run_test_similarity_and_distance, +) def _get_values() -> list[tuple[list[int], list[int], float, float]]: @@ -25,52 +27,31 @@ def _get_values() -> list[tuple[list[int], list[int], float, float]]: @pytest.mark.parametrize("vec_a, vec_b, similarity, distance", _get_values()) def test_simpson(vec_a, vec_b, similarity, distance): - vec_a = np.array(vec_a) - vec_b = np.array(vec_b) - - vec_a_sparse = csr_array([vec_a]) - vec_b_sparse = csr_array([vec_b]) - - sim_dense = simpson_binary_similarity(vec_a, vec_b) - dist_dense = simpson_binary_distance(vec_a, vec_b) - - sim_sparse = simpson_binary_similarity(vec_a_sparse, vec_b_sparse) - dist_sparse = simpson_binary_distance(vec_a_sparse, vec_b_sparse) - - assert np.isclose(sim_dense, similarity, atol=1e-3) - assert np.isclose(sim_sparse, similarity, atol=1e-3) - - assert np.isclose(dist_dense, distance, atol=1e-3) - assert np.isclose(dist_sparse, distance, atol=1e-3) - - assert np.isclose(sim_dense, sim_sparse) - assert np.isclose(dist_dense, dist_sparse) - - -def test_bulk_simpson_binary(mols_list): - fp = ECFPFingerprint() - fps = fp.transform(mols_list[:10]) - - pairwise_sim = [ - 
[simpson_binary_similarity(fps[i], fps[j]) for j in range(len(fps))] - for i in range(len(fps)) - ] - pairwise_dist = [ - [simpson_binary_distance(fps[i], fps[j]) for j in range(len(fps))] - for i in range(len(fps)) - ] - - bulk_sim = bulk_simpson_binary_similarity(fps) - bulk_dist = bulk_simpson_binary_distance(fps) - - assert np.allclose(pairwise_sim, bulk_sim) - assert np.allclose(pairwise_dist, bulk_dist) + run_test_similarity_and_distance( + simpson_binary_similarity, + simpson_binary_distance, + vec_a, + vec_b, + similarity, + distance, + ) + + +def test_bulk_simpson(mols_list): + run_test_bulk_similarity_and_distance( + mols_list, + simpson_binary_similarity, + simpson_binary_distance, + bulk_simpson_binary_similarity, + bulk_simpson_binary_distance, + ) def test_bulk_simpson_second_array(mols_list): - fp = ECFPFingerprint() - fps = fp.transform(mols_list[:10]) - - bulk_sim_single = bulk_simpson_binary_similarity(fps) - bulk_sim_two = bulk_simpson_binary_similarity(fps, fps) - assert np.allclose(bulk_sim_single, bulk_sim_two) + run_test_bulk_similarity_and_distance_two_arrays( + mols_list, + simpson_binary_similarity, + simpson_binary_distance, + bulk_simpson_binary_similarity, + bulk_simpson_binary_distance, + ) diff --git a/tests/distances/sokal_sneath.py b/tests/distances/sokal_sneath.py index 376eb0a2..2e4feef8 100644 --- a/tests/distances/sokal_sneath.py +++ b/tests/distances/sokal_sneath.py @@ -1,6 +1,4 @@ -import numpy as np import pytest -from scipy.sparse import csr_array from skfp.distances import ( sokal_sneath_2_binary_distance, @@ -10,7 +8,11 @@ bulk_sokal_sneath_2_binary_distance, bulk_sokal_sneath_2_binary_similarity, ) -from skfp.fingerprints.ecfp import ECFPFingerprint +from tests.distances.utils import ( + run_test_bulk_similarity_and_distance, + run_test_bulk_similarity_and_distance_two_arrays, + run_test_similarity_and_distance, +) def _get_values() -> list[tuple[list[int], list[int], float, float]]: @@ -28,52 +30,31 @@ def _get_values() 
-> list[tuple[list[int], list[int], float, float]]: @pytest.mark.parametrize("vec_a, vec_b, similarity, distance", _get_values()) def test_sokal_sneath_2(vec_a, vec_b, similarity, distance): - vec_a = np.array(vec_a) - vec_b = np.array(vec_b) - - vec_a_sparse = csr_array([vec_a]) - vec_b_sparse = csr_array([vec_b]) - - sim_dense = sokal_sneath_2_binary_similarity(vec_a, vec_b) - dist_dense = sokal_sneath_2_binary_distance(vec_a, vec_b) - - sim_sparse = sokal_sneath_2_binary_similarity(vec_a_sparse, vec_b_sparse) - dist_sparse = sokal_sneath_2_binary_distance(vec_a_sparse, vec_b_sparse) - - assert np.isclose(sim_dense, similarity, atol=1e-3) - assert np.isclose(sim_sparse, similarity, atol=1e-3) - - assert np.isclose(dist_dense, distance, atol=1e-3) - assert np.isclose(dist_sparse, distance, atol=1e-3) - - assert np.isclose(sim_dense, sim_sparse) - assert np.isclose(dist_dense, dist_sparse) - - -def test_bulk_sokal_sneath_2_binary(mols_list): - fp = ECFPFingerprint() - fps = fp.transform(mols_list[:10]) - - pairwise_sim = [ - [sokal_sneath_2_binary_similarity(fps[i], fps[j]) for j in range(len(fps))] - for i in range(len(fps)) - ] - pairwise_dist = [ - [sokal_sneath_2_binary_distance(fps[i], fps[j]) for j in range(len(fps))] - for i in range(len(fps)) - ] - - bulk_sim = bulk_sokal_sneath_2_binary_similarity(fps) - bulk_dist = bulk_sokal_sneath_2_binary_distance(fps) - - assert np.allclose(pairwise_sim, bulk_sim) - assert np.allclose(pairwise_dist, bulk_dist) + run_test_similarity_and_distance( + sokal_sneath_2_binary_similarity, + sokal_sneath_2_binary_distance, + vec_a, + vec_b, + similarity, + distance, + ) + + +def test_bulk_sokal_sneath_2(mols_list): + run_test_bulk_similarity_and_distance( + mols_list, + sokal_sneath_2_binary_similarity, + sokal_sneath_2_binary_distance, + bulk_sokal_sneath_2_binary_similarity, + bulk_sokal_sneath_2_binary_distance, + ) def test_bulk_sokal_sneath_2_second_array(mols_list): - fp = ECFPFingerprint() - fps = 
fp.transform(mols_list[:10]) - - bulk_sim_single = bulk_sokal_sneath_2_binary_similarity(fps) - bulk_sim_two = bulk_sokal_sneath_2_binary_similarity(fps, fps) - assert np.allclose(bulk_sim_single, bulk_sim_two) + run_test_bulk_similarity_and_distance_two_arrays( + mols_list, + sokal_sneath_2_binary_similarity, + sokal_sneath_2_binary_distance, + bulk_sokal_sneath_2_binary_similarity, + bulk_sokal_sneath_2_binary_distance, + )