diff --git a/debacl/__init__.py b/debacl/__init__.py index 5e13b62..4b65494 100644 --- a/debacl/__init__.py +++ b/debacl/__init__.py @@ -19,6 +19,7 @@ from debacl.level_set_tree import construct_tree from debacl.level_set_tree import construct_tree_from_graph +from debacl.level_set_tree import construct_tree_from_distance_matrix from debacl.level_set_tree import load_tree from debacl.level_set_tree import LevelSetTree diff --git a/debacl/level_set_tree.py b/debacl/level_set_tree.py index cdeef9d..9e92903 100644 --- a/debacl/level_set_tree.py +++ b/debacl/level_set_tree.py @@ -66,8 +66,9 @@ class LevelSetTree(object): LevelSetTree objects should not generally be instantiated directly, because they will contain empty node hierarchies. Use the tree - constructors :func:`construct_tree` or - :func:`construct_tree_from_graph` to instantiate a LevelSetTree model. + constructors :func:`construct_tree`, :func:`construct_tree_from_graph`, + or :func: `construct_tree_from_distance_matrix` to instantiate a + LevelSetTree model. Parameters ---------- @@ -1260,6 +1261,76 @@ def construct_tree(X, k, prune_threshold=None, num_levels=None, verbose=False): return tree +def construct_tree_from_distance_matrix(X, p, k, prune_threshold=None, + num_levels=None, verbose=False): + """ + Construct a level set tree from a precomputed distance matrix. + + Parameters + ---------- + X : 2-dimensional numpy array + A square distance matrix + + p: int + Underlying dimensionality of the data that was used to create X, the + distance matrix + + k : int + Number of observations to consider as neighbors to a given point. + + prune_threshold : int, optional + Leaf nodes with fewer than this number of members are recursively + merged into larger nodes. If 'None' (the default), then no pruning + is performed. + + num_levels : int, optional + Number of density levels in the constructed tree. If None (default), + `num_levels` is internally set to be the number of rows in `X`. + + verbose : bool, optional + If True, a progress indicator is printed at every 100th level of tree + construction. + + Returns + ------- + T : LevelSetTree + A pruned level set tree. + + See Also + -------- + construct_tree, construct_tree_from_graph, LevelSetTree + + Examples + -------- + >>> X = numpy.random.rand(100, 2) + >>> distances = scipy.spatial.distance.pdist(X) + >>> distance_matrix = scipy.spatial.distance.squareform(distances) + >>> tree = debacl.construct_tree_from_distance_matrix(distance_matrix, p=2, + ... k=8, + ... prune_threshold=5) + >>> print(tree) + +----+-------------+-----------+------------+----------+------+--------+----------+ + | id | start_level | end_level | start_mass | end_mass | size | parent | children | + +----+-------------+-----------+------------+----------+------+--------+----------+ + | 0 | 0.000 | 0.870 | 0.000 | 0.450 | 100 | None | [3, 4] | + | 3 | 0.870 | 3.364 | 0.450 | 0.990 | 17 | 0 | [] | + | 4 | 0.870 | 1.027 | 0.450 | 0.520 | 35 | 0 | [7, 8] | + | 7 | 1.027 | 1.755 | 0.520 | 0.870 | 8 | 4 | [] | + | 8 | 1.027 | 3.392 | 0.520 | 1.000 | 23 | 4 | [] | + +----+-------------+-----------+------------+----------+------+--------+----------+ + """ + + sim_graph, radii = _utl.knn_graph(X, k, method='precomputed') + + n, _ = X.shape + density = _utl.knn_density(radii, n, p, k) + + tree = construct_tree_from_graph(adjacency_list=sim_graph, density=density, + prune_threshold=prune_threshold, + num_levels=num_levels, verbose=verbose) + return tree + + def construct_tree_from_graph(adjacency_list, density, prune_threshold=None, num_levels=None, verbose=False): """ diff --git a/debacl/test/test_lst.py b/debacl/test/test_lst.py index a950ed6..0cb76e4 100644 --- a/debacl/test/test_lst.py +++ b/debacl/test/test_lst.py @@ -183,6 +183,19 @@ def test_construct_from_data(self): self._check_tree_viability(tree) self._check_tree_correctness(tree) + def test_construct_from_distance_matrix(self): + """ + Check viability and correctness of an LST constructed from a distance + matrix. + """ + distances = scipy.spatial.distance.pdist(self.dataset) + distance_matrix = scipy.spatial.distance.squareform(distances) + tree = dcl.construct_tree_from_distance_matrix(distance_matrix, self.k, + prune_threshold=self.gamma) + + self._check_tree_viability(tree) + self._check_tree_correctness(tree) + def test_load(self): """ Check viability and correctness of an LST saved then loaded from file. diff --git a/debacl/test/test_utils.py b/debacl/test/test_utils.py index c2c43d8..5d7c421 100644 --- a/debacl/test/test_utils.py +++ b/debacl/test/test_utils.py @@ -75,6 +75,16 @@ def test_knn_graph(self): for neighbors, ans_neighbors in zip(knn, ans_graph): self.assertItemsEqual(neighbors, ans_neighbors) + distances = scipy.spatial.distance.pdist(self.X) + distance_matrix = scipy.spatial.distance.squareform(distances) + knn, radii = utl.knn_graph(distance_matrix, k, method='precomputed') + + ## Test precomputed method + assert_array_equal(radii, ans_radii) + + for neighbors, ans_neighbors in zip(knn, ans_graph): + self.assertItemsEqual(neighbors, ans_neighbors) + def test_epsilon_graph(self): """ Test construction of the epsilon-nearest neighbor graph. diff --git a/debacl/utils.py b/debacl/utils.py index 7552c9b..b4dd742 100644 --- a/debacl/utils.py +++ b/debacl/utils.py @@ -38,12 +38,13 @@ def knn_graph(X, k, method='brute_force', leaf_size=30): Parameters ---------- X : numpy array | list [numpy arrays] - Data points, with each row as an observation. + Data points, with each row as an observation + OR a distance matrix if method='precomputed' k : int The number of points to consider as neighbors of any given observation. - method : {'brute-force', 'kd-tree', 'ball-tree'}, optional + method : {'brute-force', 'kd-tree', 'ball-tree', 'precomputed'}, optional Computing method. - 'brute-force': computes the (Euclidean) distance between all O(n^2) @@ -61,11 +62,14 @@ def knn_graph(X, k, method='brute_force', leaf_size=30): distances. Typically much faster than 'brute-force', and works with up to a few hundred dimensions. Requires the scikit-learn library. + - 'precomputed': used when the matrix X is a precomputed distance + matrix. + leaf_size : int, optional For the 'kd-tree' and 'ball-tree' methods, the number of observations in the leaf nodes. Leaves are not split further, so distance computations within leaf nodes are done by brute force. 'leaf_size' is - ignored for the 'brute-force' method. + ignored for the 'brute-force' and 'precomputed' methods. Returns ------- @@ -109,6 +113,17 @@ def knn_graph(X, k, method='brute_force', leaf_size=30): raise ImportError("The scikit-learn library could not be loaded." + " It is required for the 'ball-tree' method.") + if method == 'precomputed': + if X.ndim != 2 or X.shape[0] != X.shape[1]: + raise ValueError("array 'X' must be square") + + # Pseudo-sort each row of the distance matrix so that the indices of + # the closest k elements are listed first # The entries [...,:k-1] are + # not in sorted order by distance, but the kth entry is guaranteed to + # be in the correct spot + neighbors = _np.argpartition(X, k - 1)[:, :k] + radii = X[_np.arange(n), neighbors[:, -1]] + else: # assume brute-force if not _HAS_SCIPY: raise ImportError("The 'scipy' module could not be loaded. " +