Skip to content

Commit 2ed7f74

Browse files
authored
Merge pull request #260 from thomasopsomer/sparse_msg
Better error message in sparse precomputed mode. [#253]
2 parents 9840b7e + 38f541b commit 2ed7f74

File tree

2 files changed

+22
-8
lines changed

2 files changed

+22
-8
lines changed

hdbscan/_hdbscan_reachability.pyx

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def mutual_reachability(distance_matrix, min_points=5, alpha=1.0):
6060

6161

6262
cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5,
63-
float alpha=1.0):
63+
float alpha=1.0, float max_dist=0.):
6464

6565
cdef np.intp_t i
6666
cdef np.intp_t j
@@ -90,6 +90,8 @@ cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5,
9090
mr_dist = max(core_distance[i], core_distance[j], lil_matrix[i, j])
9191
if np.isfinite(mr_dist):
9292
result[i, j] = mr_dist
93+
elif max_dist > 0:
94+
result[i, j] = max_dist
9395

9496
return result.tocsr()
9597

hdbscan/hdbscan_.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -131,20 +131,32 @@ def _hdbscan_sparse_distance_matrix(X, min_samples=5, alpha=1.0,
131131
metric='minkowski', p=2, leaf_size=40,
132132
gen_min_span_tree=False, **kwargs):
133133
assert issparse(X)
134+
# Check for connected component on X
135+
if csgraph.connected_components(X, directed=False, return_labels=False) > 1:
136+
raise ValueError('Sparse distance matrix has multiple connected '
137+
'components!\nThat is, there exist groups of points '
138+
'that are completely disjoint -- there are no distance '
139+
'relations connecting them\n'
140+
'Run hdbscan on each component.')
134141

135142
lil_matrix = X.tolil()
136143

137144
# Compute sparse mutual reachability graph
145+
# if max_dist > 0, max distance to use when the reachability is infinite
146+
max_dist = kwargs.get("max_dist", 0.)
138147
mutual_reachability_ = sparse_mutual_reachability(lil_matrix,
139-
min_points=min_samples)
140-
148+
min_points=min_samples,
149+
max_dist=max_dist)
150+
# Check connected component on mutual reachability
151+
# If more than one component, it means that even if the distance matrix X
152+
# has one component, there exist points with fewer than `min_samples` neighbors
141153
if csgraph.connected_components(mutual_reachability_, directed=False,
142154
return_labels=False) > 1:
143-
raise ValueError('Sparse distance matrix has multiple connected'
144-
' components!\nThat is, there exist groups of points '
145-
'that are completely disjoint -- there are no distance '
146-
'relations connecting them\n'
147-
'Run hdbscan on each component.')
155+
raise ValueError(('There exists points with less than %s neighbors. '
156+
'Ensure your distance matrix has non zeros values for '
157+
'at least `min_sample`=%s neighbors for each points (i.e. K-nn graph), '
158+
'or specify a `max_dist` to use when distances are missing.')
159+
% (min_samples, min_samples))
148160

149161
# Compute the minimum spanning tree for the sparse graph
150162
sparse_min_spanning_tree = csgraph.minimum_spanning_tree(

0 commit comments

Comments
 (0)