Skip to content

Commit 6009994

Browse files
author
Frankie Robertson
committed
Initial commit with a few KNeighborsTransformers and RNN-DBSCAN
0 parents  commit 6009994

28 files changed

+2293
-0
lines changed

LICENSE

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
Copyright (c) 2020, scikit-ann contributors
2+
All rights reserved.
3+
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are met:
6+
7+
* Redistributions of source code must retain the above copyright notice, this
8+
list of conditions and the following disclaimer.
9+
10+
* Redistributions in binary form must reproduce the above copyright notice,
11+
this list of conditions and the following disclaimer in the documentation
12+
and/or other materials provided with the distribution.
13+
14+
* Neither the name of sklearn-ann nor the names of its
15+
contributors may be used to endorse or promote products derived from
16+
this software without specific prior written permission.
17+
18+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

README.rst

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
.. -*- mode: rst -*-
2+
3+
|ReadTheDocs|_
4+
5+
.. |ReadTheDocs| image:: https://readthedocs.org/projects/sklearn-template/badge/?version=latest
6+
.. _ReadTheDocs: https://sklearn-template.readthedocs.io/en/latest/?badge=latest
7+
8+
sklearn-ann
9+
===========
10+
11+
**sklearn-ann** eases integration of approximate nearest neighbours
12+
libraries such as annoy, nmslib and faiss into your sklearn
13+
pipelines. It consists of:
14+
15+
* ``Transformers`` conforming to the same interface as
16+
``KNeighborsTransformer`` which can be used to transform feature matrices
17+
into sparse distance matrices for use by any estimator that can deal with
18+
sparse distance matrices. Many, but not all, of scikit-learn's clustering and
19+
manifold learning algorithms can work with this kind of input.
20+
* RNN-DBSCAN: a variant of DBSCAN based on reverse nearest
21+
neighbours.
22+
23+
Why? When do I want this?
24+
=========================
25+
26+
The main scenario in which this is needed is performing
27+
*clustering or manifold learning on high dimensional data*. The
28+
reason is that currently the only neighbourhood algorithms which are
29+
built into scikit-learn are essentially the standard tree approaches
30+
to space partitioning: the ball tree and the K-D tree. These do not
31+
perform competitively in high dimensional spaces.

examples/rnn_dbscan_big.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
=======================================================
4+
Demo of RnnDBSCAN clustering algorithm on large dataset
5+
=======================================================
6+
7+
Tests RnnDBSCAN on a large dataset. Requires pandas.
8+
9+
"""
10+
print(__doc__)
11+
12+
import numpy as np
13+
from joblib import Memory
14+
15+
from sklearn_ann.cluster.rnn_dbscan import simple_rnn_dbscan_pipeline
16+
from sklearn.datasets import fetch_openml
17+
from sklearn import metrics
18+
19+
20+
# #############################################################################
21+
# Generate sample data
22+
def fetch_mnist():
23+
print("Downloading mnist_784")
24+
mnist = fetch_openml("mnist_784")
25+
return mnist.data / 255, mnist.target
26+
27+
28+
memory = Memory('./mnist')
29+
30+
X, y = memory.cache(fetch_mnist)()
31+
32+
33+
def run_rnn_dbscan(neighbor_transformer, n_neighbors, **kwargs):
34+
# #############################################################################
35+
# Compute RnnDBSCAN
36+
37+
pipeline = simple_rnn_dbscan_pipeline(neighbor_transformer, n_neighbors, **kwargs)
38+
labels = pipeline.fit_predict(X)
39+
db = pipeline.named_steps["rnndbscan"]
40+
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
41+
core_samples_mask[db.core_sample_indices_] = True
42+
43+
# Number of clusters in labels, ignoring noise if present.
44+
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
45+
n_noise_ = list(labels).count(-1)
46+
47+
print('Estimated number of clusters: %d' % n_clusters_)
48+
print('Estimated number of noise points: %d' % n_noise_)
49+
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels))
50+
print("Completeness: %0.3f" % metrics.completeness_score(y, labels))
51+
print("V-measure: %0.3f" % metrics.v_measure_score(y, labels))
52+
print("Adjusted Rand Index: %0.3f"
53+
% metrics.adjusted_rand_score(y, labels))
54+
print("Adjusted Mutual Information: %0.3f"
55+
% metrics.adjusted_mutual_info_score(y, labels))
56+
print("Silhouette Coefficient: %0.3f"
57+
% metrics.silhouette_score(X, labels))
58+
59+
60+
if __name__ == "__main__":
61+
import code
62+
print("Now you can import your chosen transformer_cls and run:")
63+
print("run_rnn_dbscan(transformer_cls, n_neighbors, **params)")
64+
print("e.g.")
65+
print("from sklearn_ann.kneighbors.pynndescent import PyNNDescentTransformer")
66+
print("run_rnn_dbscan(PyNNDescentTransformer, 10)")
67+
code.interact(local=locals())

examples/rnn_dbscan_simple.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
=======================================
4+
Demo of RNN-DBSCAN clustering algorithm
5+
=======================================
6+
7+
Finds core samples of high density and expands clusters from them.
8+
9+
Mostly copypasted from sklearn's DBSCAN example.
10+
11+
"""
12+
print(__doc__)
13+
14+
import numpy as np
15+
16+
from sklearn_ann.cluster.rnn_dbscan import RnnDBSCAN
17+
from sklearn import metrics
18+
from sklearn.datasets import make_blobs
19+
from sklearn.preprocessing import StandardScaler
20+
21+
22+
# #############################################################################
23+
# Generate sample data
24+
centers = [[1, 1], [-1, -1], [1, -1]]
25+
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
26+
random_state=0)
27+
28+
X = StandardScaler().fit_transform(X)
29+
30+
# #############################################################################
31+
# Compute RNN-DBSCAN
32+
db = RnnDBSCAN(n_neighbors=10).fit(X)
33+
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
34+
core_samples_mask[db.core_sample_indices_] = True
35+
labels = db.labels_
36+
37+
# Number of clusters in labels, ignoring noise if present.
38+
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
39+
n_noise_ = list(labels).count(-1)
40+
41+
print('Estimated number of clusters: %d' % n_clusters_)
42+
print('Estimated number of noise points: %d' % n_noise_)
43+
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
44+
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
45+
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
46+
print("Adjusted Rand Index: %0.3f"
47+
% metrics.adjusted_rand_score(labels_true, labels))
48+
print("Adjusted Mutual Information: %0.3f"
49+
% metrics.adjusted_mutual_info_score(labels_true, labels))
50+
print("Silhouette Coefficient: %0.3f"
51+
% metrics.silhouette_score(X, labels))
52+
53+
# #############################################################################
54+
# Plot result
55+
import matplotlib.pyplot as plt
56+
57+
# Black removed and is used for noise instead.
58+
unique_labels = set(labels)
59+
colors = [plt.cm.Spectral(each)
60+
for each in np.linspace(0, 1, len(unique_labels))]
61+
for k, col in zip(unique_labels, colors):
62+
if k == -1:
63+
# Black used for noise.
64+
col = [0, 0, 0, 1]
65+
66+
class_member_mask = (labels == k)
67+
68+
xy = X[class_member_mask & core_samples_mask]
69+
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
70+
markeredgecolor='k', markersize=14)
71+
72+
xy = X[class_member_mask & ~core_samples_mask]
73+
plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
74+
markeredgecolor='k', markersize=6)
75+
76+
plt.title('Estimated number of clusters: %d' % n_clusters_)
77+
plt.show()

install_all.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
poetry install -E tests -E docs -E annoy -E faiss -E pynndescent
2+
poetry run pip install "git+https://github.com/nmslib/nmslib.git@fd969978ad49a7135b1a153826b5c460dc53d0ba#egg=nmslib&subdirectory=python_bindings"

0 commit comments

Comments
 (0)