scikit-learn-contrib
diff --git a/‎LICENSE
Lines changed: 27 additions & 0 deletions b/‎LICENSE
Lines changed: 27 additions & 0 deletions
diff --git a/‎README.rst
Lines changed: 31 additions & 0 deletions b/‎README.rst
Lines changed: 31 additions & 0 deletions
diff --git a/‎examples/rnn_dbscan_big.py
Lines changed: 67 additions & 0 deletions b/‎examples/rnn_dbscan_big.py
Lines changed: 67 additions & 0 deletions
diff --git a/‎examples/rnn_dbscan_simple.py
Lines changed: 77 additions & 0 deletions b/‎examples/rnn_dbscan_simple.py
Lines changed: 77 additions & 0 deletions
diff --git a/‎install_all.sh
Lines changed: 2 additions & 0 deletions b/‎install_all.sh
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,27 @@
+Copyright (c) 2020, scikit-ann contributors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of sklearn-ann nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,31 @@
+.. -*- mode: rst -*-
+
+|ReadTheDocs|_
+
+.. |ReadTheDocs| image:: https://readthedocs.org/projects/sklearn-ann/badge/?version=latest
+.. _ReadTheDocs: https://sklearn-ann.readthedocs.io/en/latest/?badge=latest
+
+sklearn-ann
+===========
+
+**sklearn-ann** eases integration of approximate nearest neighbours
+libraries such as annoy, nmslib and faiss into your sklearn
+pipelines. It consists of:
+
+* ``Transformers`` conforming to the same interface as
+  ``KNeighborsTransformer`` which can be used to transform feature matrices
+  into sparse distance matrices for use by any estimator that can deal with
+  sparse distance matrices. Many, but not all, of scikit-learn's clustering and
+  manifold learning algorithms can work with this kind of input.
+* RNN-DBSCAN: a variant of DBSCAN based on reverse nearest
+  neighbours.
+
+Why? When do I want this?
+=========================
+
+The main scenario in which this is needed is when performing
+*clustering or manifold learning on high dimensional data*. The
+reason is that currently the only neighbourhood algorithms which are
+built into scikit-learn are essentially the standard tree approaches
+to space partitioning: the ball tree and the K-D tree. These do not
+perform competitively in high dimensional spaces.
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+"""
+=======================================================
+Demo of RnnDBSCAN clustering algorithm on large dataset
+=======================================================
+
+Tests RnnDBSCAN on a large dataset. Requires pandas.
+
+"""
+print(__doc__)
+
+import numpy as np
+from joblib import Memory
+
+from sklearn_ann.cluster.rnn_dbscan import simple_rnn_dbscan_pipeline
+from sklearn.datasets import fetch_openml
+from sklearn import metrics
+
+
+# #############################################################################
+# Fetch sample data (MNIST, downloaded from OpenML)
+def fetch_mnist():
+    print("Downloading mnist_784")
+    mnist = fetch_openml("mnist_784")
+    return mnist.data / 255, mnist.target
+
+
+# Route the download through a joblib cache stored under ./mnist so that
+# later runs of this script can reuse the stored result of fetch_mnist.
+memory = Memory('./mnist')
+
+X, y = memory.cache(fetch_mnist)()
+
+
+def run_rnn_dbscan(neighbor_transformer, n_neighbors, **kwargs):
+    # #############################################################################
+    # Compute RnnDBSCAN
+
+    pipeline = simple_rnn_dbscan_pipeline(neighbor_transformer, n_neighbors, **kwargs)
+    labels = pipeline.fit_predict(X)
+    db = pipeline.named_steps["rnndbscan"]
+    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
+    core_samples_mask[db.core_sample_indices_] = True
+
+    # Number of clusters in labels, ignoring noise if present.
+    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
+    n_noise_ = list(labels).count(-1)
+
+    print('Estimated number of clusters: %d' % n_clusters_)
+    print('Estimated number of noise points: %d' % n_noise_)
+    print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels))
+    print("Completeness: %0.3f" % metrics.completeness_score(y, labels))
+    print("V-measure: %0.3f" % metrics.v_measure_score(y, labels))
+    print("Adjusted Rand Index: %0.3f"
+          % metrics.adjusted_rand_score(y, labels))
+    print("Adjusted Mutual Information: %0.3f"
+          % metrics.adjusted_mutual_info_score(y, labels))
+    print("Silhouette Coefficient: %0.3f"
+          % metrics.silhouette_score(X, labels))
+
+
+if __name__ == "__main__":
+    import code
+    print("Now you can import your chosen transformer_cls and run:")
+    print("run_rnn_dbscan(transformer_cls, n_neighbors, **params)")
+    print("e.g.")
+    print("from sklearn_ann.kneighbors.pynndescent import PyNNDescentTransformer")
+    print("run_rnn_dbscan(PyNNDescentTransformer, 10)")
+    code.interact(local=locals())
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+"""
+=======================================
+Demo of RNN-DBSCAN clustering algorithm
+=======================================
+
+Finds core samples of high density and expands clusters from them.
+
+Mostly copypasted from sklearn's DBSCAN example.
+
+"""
+print(__doc__)
+
+import numpy as np
+
+from sklearn_ann.cluster.rnn_dbscan import RnnDBSCAN
+from sklearn import metrics
+from sklearn.datasets import make_blobs
+from sklearn.preprocessing import StandardScaler
+
+
+# #############################################################################
+# Generate sample data
+centers = [[1, 1], [-1, -1], [1, -1]]
+X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
+                            random_state=0)
+
+X = StandardScaler().fit_transform(X)
+
+# #############################################################################
+# Compute DBSCAN
+db = RnnDBSCAN(n_neighbors=10).fit(X)
+core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
+core_samples_mask[db.core_sample_indices_] = True
+labels = db.labels_
+
+# Number of clusters in labels, ignoring noise if present.
+n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
+n_noise_ = list(labels).count(-1)
+
+print('Estimated number of clusters: %d' % n_clusters_)
+print('Estimated number of noise points: %d' % n_noise_)
+print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
+print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
+print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
+print("Adjusted Rand Index: %0.3f"
+      % metrics.adjusted_rand_score(labels_true, labels))
+print("Adjusted Mutual Information: %0.3f"
+      % metrics.adjusted_mutual_info_score(labels_true, labels))
+print("Silhouette Coefficient: %0.3f"
+      % metrics.silhouette_score(X, labels))
+
+# #############################################################################
+# Plot result
+import matplotlib.pyplot as plt
+
+# Black removed and is used for noise instead.
+unique_labels = set(labels)
+colors = [plt.cm.Spectral(each)
+          for each in np.linspace(0, 1, len(unique_labels))]
+for k, col in zip(unique_labels, colors):
+    if k == -1:
+        # Black used for noise.
+        col = [0, 0, 0, 1]
+
+    class_member_mask = (labels == k)
+
+    xy = X[class_member_mask & core_samples_mask]
+    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
+             markeredgecolor='k', markersize=14)
+
+    xy = X[class_member_mask & ~core_samples_mask]
+    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
+             markeredgecolor='k', markersize=6)
+
+plt.title('Estimated number of clusters: %d' % n_clusters_)
+plt.show()
@@ -0,0 +1,2 @@
+poetry install -E tests -E docs -E annoy -E faiss -E pynndescent
+poetry run pip install "git+https://github.com/nmslib/nmslib.git@fd969978ad49a7135b1a153826b5c460dc53d0ba#egg=nmslib&subdirectory=python_bindings"
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+poetry install -E tests -E docs -E annoy -E faiss -E pynndescent`
	`2`	`+poetry run pip install "git+https://github.com/nmslib/nmslib.git@fd969978ad49a7135b1a153826b5c460dc53d0ba#egg=nmslib&subdirectory=python_bindings"`