diff --git a/hdbscan/_hdbscan_reachability.pyx b/hdbscan/_hdbscan_reachability.pyx index df0f7b8b..621bca64 100644 --- a/hdbscan/_hdbscan_reachability.pyx +++ b/hdbscan/_hdbscan_reachability.pyx @@ -1,15 +1,16 @@ # cython: boundscheck=False +# cython: wraparound=False # cython: nonecheck=False # cython: initializedcheck=False -# mutual reachability distance compiutations +# mutual reachability distance computations # Authors: Leland McInnes # License: 3-clause BSD import numpy as np cimport numpy as np +from numpy.math cimport INFINITY, isfinite from scipy.spatial.distance import pdist, squareform -from scipy.sparse import lil_matrix as sparse_matrix from sklearn.neighbors import KDTree, BallTree import gc @@ -59,44 +60,89 @@ def mutual_reachability(distance_matrix, min_points=5, alpha=1.0): return result -cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5, - float alpha=1.0, float max_dist=0.): - - cdef np.intp_t i - cdef np.intp_t j - cdef np.intp_t n - cdef np.double_t mr_dist - cdef list sorted_row_data - cdef np.ndarray[dtype=np.double_t, ndim=1] core_distance - cdef np.ndarray[dtype=np.int32_t, ndim=1] nz_row_data - cdef np.ndarray[dtype=np.int32_t, ndim=1] nz_col_data - - result = sparse_matrix(lil_matrix.shape) - core_distance = np.empty(lil_matrix.shape[0], dtype=np.double) - - for i in range(lil_matrix.shape[0]): - sorted_row_data = sorted(lil_matrix.data[i]) - if min_points < len(sorted_row_data): - core_distance[i] = sorted_row_data[min_points] +cpdef sparse_mutual_reachability(object distance_matrix, np.intp_t min_points=5, + float alpha=1.0, float max_dist=0.0): + """ Compute mutual reachability for distance matrix. For best performance + pass distance_matrix in CSR form which will modify it in place and return + it without making a copy """ + + # tocsr() is a fast noop if distance_matrix is already CSR + D = distance_matrix.tocsr() + if D.shape != (D.shape[0], D.shape[0]): + raise Exception("Distance matrix must be 2D square shaped instead of {}".format(D.shape)) + # Convert to smallest supported data type if necessary + if D.dtype not in (np.float32, np.float64): + if D.dtype <= np.dtype(np.float32): + D = D.astype(np.float32) else: - core_distance[i] = np.infty - - if alpha != 1.0: - lil_matrix = lil_matrix / alpha - - nz_row_data, nz_col_data = lil_matrix.nonzero() - - for n in range(nz_row_data.shape[0]): - i = nz_row_data[n] - j = nz_col_data[n] - - mr_dist = max(core_distance[i], core_distance[j], lil_matrix[i, j]) - if np.isfinite(mr_dist): - result[i, j] = mr_dist - elif max_dist > 0: - result[i, j] = max_dist - - return result.tocsr() + D = D.astype(np.float64) + + # Call typed function which modifies D in place + t = (D.data.dtype, D.indices.dtype, D.indptr.dtype) + if t == (np.float32, np.int32, np.int32): + sparse_mr_fast[np.float32_t, np.int32_t](D.data, D.indices, D.indptr, + min_points, alpha, max_dist) + elif t == (np.float32, np.int64, np.int64): + sparse_mr_fast[np.float32_t, np.int64_t](D.data, D.indices, D.indptr, + min_points, alpha, max_dist) + elif t == (np.float64, np.int32, np.int32): + sparse_mr_fast[np.float64_t, np.int32_t](D.data, D.indices, D.indptr, + min_points, alpha, max_dist) + elif t == (np.float64, np.int64, np.int64): + sparse_mr_fast[np.float64_t, np.int64_t](D.data, D.indices, D.indptr, + min_points, alpha, max_dist) + else: + raise Exception("Unsupported CSR format {}".format(t)) + + return D + + +ctypedef fused mr_indx_t: + np.int32_t + np.int64_t + +ctypedef fused mr_data_t: + np.float32_t + np.float64_t + +cdef sparse_mr_fast(np.ndarray[dtype=mr_data_t, ndim=1, mode='c'] data, + np.ndarray[dtype=mr_indx_t, ndim=1, mode='c'] indices, + np.ndarray[dtype=mr_indx_t, ndim=1, mode='c'] indptr, + mr_indx_t min_points, + mr_data_t alpha, + mr_data_t max_dist): + cdef mr_indx_t row, col, colptr + cdef mr_data_t mr_dist + cdef np.ndarray[dtype=mr_data_t, ndim=1, mode='c'] row_data + cdef np.ndarray[dtype=mr_data_t, ndim=1, mode='c'] core_distance + + core_distance = np.empty(indptr.shape[0]-1, dtype=data.dtype) + + for row in range(indptr.shape[0]-1): + row_data = data[indptr[row]:indptr[row+1]].copy() + if min_points < row_data.shape[0]: + # sort is faster for small arrays because of lower startup cost but + # partition has worst case O(n) runtime for larger ones. + # https://stackoverflow.com/questions/43588711/numpys-partition-slower-than-sort-for-small-arrays + if row_data.shape[0] > 200: + row_data.partition(min_points) + else: + row_data.sort() + core_distance[row] = row_data[min_points] + else: + core_distance[row] = INFINITY + + if alpha != 1.0: + data /= alpha + + for row in range(indptr.shape[0]-1): + for colptr in range(indptr[row],indptr[row+1]): + col = indices[colptr] + mr_dist = max(core_distance[row], core_distance[col], data[colptr]) + if isfinite(mr_dist): + data[colptr] = mr_dist + elif max_dist > 0: + data[colptr] = max_dist def kdtree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5, diff --git a/hdbscan/hdbscan_.py b/hdbscan/hdbscan_.py index 31b7f84a..5c29e0e5 100644 --- a/hdbscan/hdbscan_.py +++ b/hdbscan/hdbscan_.py @@ -78,13 +78,16 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2, # sklearn.metrics.pairwise_distances handle it, # enables the usage of numpy.inf in the distance # matrix to indicate missing distance information. - # TODO: Check if copying is necessary - distance_matrix = X.copy() + # Give the user the option to have the distance matrix + # modified to save memory. + if kwargs.get("overwrite", False): + distance_matrix = X + else: + distance_matrix = X.copy() else: distance_matrix = pairwise_distances(X, metric=metric, **kwargs) if issparse(distance_matrix): - # raise TypeError('Sparse distance matrices not yet supported') return _hdbscan_sparse_distance_matrix(distance_matrix, min_samples, alpha, metric, p, leaf_size, gen_min_span_tree, @@ -141,12 +144,11 @@ def _hdbscan_sparse_distance_matrix(X, min_samples=5, alpha=1.0, 'relations connecting them\n' 'Run hdbscan on each component.') - lil_matrix = X.tolil() - # Compute sparse mutual reachability graph # if max_dist > 0, max distance to use when the reachability is infinite max_dist = kwargs.get("max_dist", 0.) - mutual_reachability_ = sparse_mutual_reachability(lil_matrix, + # sparse_mutual_reachability() may modify X in place and return it + mutual_reachability_ = sparse_mutual_reachability(X, min_points=min_samples, max_dist=max_dist, alpha=alpha) @@ -163,7 +165,7 @@ def _hdbscan_sparse_distance_matrix(X, min_samples=5, alpha=1.0, # Compute the minimum spanning tree for the sparse graph sparse_min_spanning_tree = csgraph.minimum_spanning_tree( - mutual_reachability_) + mutual_reachability_, overwrite=True) # Convert the graph to scipy cluster array format nonzeros = sparse_min_spanning_tree.nonzero() diff --git a/hdbscan/tests/test_hdbscan.py b/hdbscan/tests/test_hdbscan.py index c2d5201d..c9fc1f73 100644 --- a/hdbscan/tests/test_hdbscan.py +++ b/hdbscan/tests/test_hdbscan.py @@ -154,6 +154,12 @@ def test_hdbscan_sparse_distance_matrix(): n_clusters_2 = len(set(labels)) - int(-1 in labels) assert_equal(n_clusters_2, n_clusters) + # Verify single and double precision return same results + assert_equal(D.dtype, np.double) + labels_double = hdbscan(D, metric='precomputed')[0] + labels_single = hdbscan(D.astype(np.single), metric='precomputed')[0] + assert_array_equal(labels_single, labels_double) + def test_hdbscan_feature_vector(): labels, p, persist, ctree, ltree, mtree = hdbscan(X)