Skip to content

Commit 5425cd2

Browse files
authored
Merge pull request #394 from GregDemand/master
Fixed off-by-one errors in min_samples for multiple algorithms
2 parents 54da636 + 2555a38 commit 5425cd2

File tree

4 files changed

+20
-13
lines changed

4 files changed

+20
-13
lines changed

hdbscan/_hdbscan_boruvka.pyx

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -454,8 +454,10 @@ cdef class KDTreeBoruvkaAlgorithm (object):
454454
# issues, but we'll get quite a few, and they are the hard ones to
455455
# get, so fill in any we can and then run update components.
456456
for n in range(self.num_points):
457-
for i in range(1, self.min_samples + 1):
457+
for i in range(0, self.min_samples + 1):
458458
m = knn_indices[n, i]
459+
if n == m:
460+
continue
459461
if self.core_distance[m] <= self.core_distance[n]:
460462
self.candidate_point[n] = n
461463
self.candidate_neighbor[n] = m
@@ -745,7 +747,7 @@ cdef class KDTreeBoruvkaAlgorithm (object):
745747
# then propagate the results of that computation
746748
# up the tree.
747749
new_bound = min(new_upper_bound,
748-
new_lower_bound + 2 * node1_info.radius)
750+
new_lower_bound + 2 * self.dist._dist_to_rdist(node1_info.radius))
749751
# new_bound = new_upper_bound
750752
if new_bound < self.bounds_ptr[node1]:
751753
self.bounds_ptr[node1] = new_bound
@@ -1028,33 +1030,36 @@ cdef class BallTreeBoruvkaAlgorithm (object):
10281030
knn_data = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(
10291031
delayed(_core_dist_query)
10301032
(self.core_dist_tree, points,
1031-
self.min_samples)
1033+
self.min_samples + 1)
10321034
for points in datasets)
10331035
knn_dist = np.vstack([x[0] for x in knn_data])
10341036
knn_indices = np.vstack([x[1] for x in knn_data])
10351037
else:
10361038
knn_dist, knn_indices = self.core_dist_tree.query(
10371039
self.tree.data,
1038-
k=self.min_samples,
1040+
k=self.min_samples + 1,
10391041
dualtree=True,
10401042
breadth_first=True)
10411043

1042-
self.core_distance_arr = knn_dist[:, self.min_samples - 1].copy()
1044+
self.core_distance_arr = knn_dist[:, self.min_samples].copy()
10431045
self.core_distance = (<np.double_t[:self.num_points:1]> (
10441046
<np.double_t *> self.core_distance_arr.data))
10451047

10461048
# Since we already computed NN distances for the min_samples closest
10471049
# points we can use this to do the first round of boruvka -- we won't
10481050
# get every point due to core_distance/mutual reachability distance
10491051
# issues, but we'll get quite a few, and they are the hard ones to get,
1050-
# so fill in any we ca and then run update components.
1052+
# so fill in any we can and then run update components.
10511053
for n in range(self.num_points):
1052-
for i in range(self.min_samples - 1, 0):
1054+
for i in range(0, self.min_samples + 1):
10531055
m = knn_indices[n, i]
1056+
if n == m:
1057+
continue
10541058
if self.core_distance[m] <= self.core_distance[n]:
10551059
self.candidate_point[n] = n
10561060
self.candidate_neighbor[n] = m
10571061
self.candidate_distance[n] = self.core_distance[n]
1062+
break
10581063

10591064
self.update_components()
10601065

hdbscan/_hdbscan_reachability.pyx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,8 @@ cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5,
7676

7777
for i in range(lil_matrix.shape[0]):
7878
sorted_row_data = sorted(lil_matrix.data[i])
79-
if min_points < len(sorted_row_data):
80-
core_distance[i] = sorted_row_data[min_points]
79+
if min_points - 1 < len(sorted_row_data):
80+
core_distance[i] = sorted_row_data[min_points - 1]
8181
else:
8282
core_distance[i] = np.infty
8383

hdbscan/hdbscan_.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -243,9 +243,10 @@ def _hdbscan_prims_kdtree(
243243
dist_metric = DistanceMetric.get_metric(metric, **kwargs)
244244

245245
# Get distance to kth nearest neighbour
246-
core_distances = tree.query(X, k=min_samples, dualtree=True, breadth_first=True)[0][
246+
core_distances = tree.query(X, k=min_samples + 1, dualtree=True, breadth_first=True)[0][
247247
:, -1
248248
].copy(order="C")
249+
249250
# Mutual reachability distance is implicit in mst_linkage_core_vector
250251
min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha)
251252

@@ -288,7 +289,7 @@ def _hdbscan_prims_balltree(
288289
dist_metric = DistanceMetric.get_metric(metric, **kwargs)
289290

290291
# Get distance to kth nearest neighbour
291-
core_distances = tree.query(X, k=min_samples, dualtree=True, breadth_first=True)[0][
292+
core_distances = tree.query(X, k=min_samples + 1, dualtree=True, breadth_first=True)[0][
292293
:, -1
293294
].copy(order="C")
294295

hdbscan/tests/test_flat.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@
2525
std = [0.5, 0.08, 0.06, 0.35, 0.35, 0.35]
2626
X0, y0 = make_blobs(n_samples=[70, 30, 80, 100, 40, 150],
2727
centers=centers,
28-
cluster_std=std)
29-
X1, y1 = make_moons(n_samples=300, noise=0.07)
28+
cluster_std=std,
29+
random_state=1)
30+
X1, y1 = make_moons(n_samples=300, noise=0.07, random_state=42)
3031
X1 += 3.
3132
y1 += len(centers)
3233
X = np.vstack((X0, X1))

0 commit comments

Comments
 (0)