
Commit 7a7a0de

Merge remote-tracking branch 'origin/master'
2 parents 7bcc9ab + 16b02de

5 files changed: +43 additions, -28 deletions


README.rst

Lines changed: 1 addition & 1 deletion
@@ -268,7 +268,7 @@ If you have used this codebase in a scientific publication and wish to cite it,
 In: Journal of Open Source Software, The Open Journal, volume 2, number 11.
 2017
 
-To refernece the high performance algorithm developed in this library please cite our paper in ICDMW 2017 proceedings.
+To reference the high performance algorithm developed in this library please cite our paper in ICDMW 2017 proceedings.
 
 McInnes L, Healy J. *Accelerated Hierarchical Density Based Clustering*
 In: 2017 IEEE International Conference on Data Mining Workshops (ICDMW), IEEE, pp 33-42.

docs/comparing_clustering_algorithms.rst

Lines changed: 8 additions & 6 deletions
@@ -171,14 +171,16 @@ multiple different clusterings. This does not engender much confidence
 in any individual clustering that may result.
 
 So, in summary, here's how K-Means seems to stack up against out
-desiderata: \* **Don't be wrong!**: K-means is going to throw points
+desiderata:
+- **Don't be wrong!**: K-means is going to throw points
 into clusters whether they belong or not; it also assumes you clusters
-are globular. K-Means scores very poorly on this point. \* **Intuitive
-parameters**: If you have a good intuition for how many clusters the
+are globular. K-Means scores very poorly on this point.
+- **Intuitive parameters**: If you have a good intuition for how many clusters the
 dataset your exploring has then great, otherwise you might have a
-problem. \* **Stability**: Hopefully the clustering is stable for your
-data. Best to have many runs and check though. \* **Performance**: This
-is K-Means big win. It's a simple algorithm and with the right tricks
+problem.
+- **Stability**: Hopefully the clustering is stable for your
+data. Best to have many runs and check though.
+- **Performance**: This is K-Means big win. It's a simple algorithm and with the right tricks
 and optimizations can be made exceptionally efficient. There are few
 algorithms that can compete with K-Means for performance. If you have
 truly huge data then K-Means might be your only option.
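
The rewritten "Stability" bullet advises running K-Means several times and checking agreement between runs. Not part of the commit, but a minimal sketch of that check, assuming scikit-learn and an arbitrary toy blob dataset:

```python
# Rough sketch of the "run it many times and check" stability advice above.
# Dataset and cluster count are arbitrary illustrative choices.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score

data, _ = make_blobs(n_samples=500, centers=6, cluster_std=2.0, random_state=0)

# Several runs with different random initialisations (n_init=1 so each run
# really is a single initialisation).
labelings = [
    KMeans(n_clusters=6, n_init=1, random_state=seed).fit_predict(data)
    for seed in range(5)
]

# Pairwise agreement between runs; values well below 1.0 indicate an
# unstable clustering for this choice of n_clusters.
for i in range(len(labelings)):
    for j in range(i + 1, len(labelings)):
        print(i, j, round(adjusted_rand_score(labelings[i], labelings[j]), 3))
```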

docs/parameter_selection.rst

Lines changed: 13 additions & 6 deletions
@@ -112,12 +112,13 @@ of 15.
 As you can see this results in us recovering something much closer to
 our original clustering, only now with some of the smaller clusters
 pruned out. Thus ``min_cluster_size`` does behave more closely to our
-intuitions, but only if we fix ``min_samples``. If you wish to explore
-different ``min_cluster_size`` settings with a fixed ``min_samples``
-value, especially for larger dataset sizes, you can cache the hard
-computation, and recompute only the relatively cheap flat cluster
-extraction using the ``memory`` parameter, which makes use of
-`joblib <https://pythonhosted.org/joblib/>`_
+intuitions, but only if we fix ``min_samples``.
+
+If you wish to explore different ``min_cluster_size`` settings with
+a fixed ``min_samples`` value, especially for larger dataset sizes,
+you can cache the hard computation, and recompute only the relatively
+cheap flat cluster extraction using the ``memory`` parameter, which
+makes use of `joblib <https://pythonhosted.org/joblib/>`_
 
 .. _min_samples_label:
 
@@ -134,6 +135,9 @@ to progressively more dense areas. We can see this in practice by
 leaving the ``min_cluster_size`` at 60, but reducing ``min_samples`` to
 1.
 
+Note: adjusting ``min_samples`` will result in recomputing the **hard
+comptuation** of the single linkage tree.
+
 .. code:: python
 
     clusterer = hdbscan.HDBSCAN(min_cluster_size=60, min_samples=1).fit(data)

@@ -181,6 +185,9 @@ clustering is. By default ``alpha`` is set to 1.0. Increasing ``alpha``
 will make the clustering more conservative, but on a much tighter scale,
 as we can see by setting ``alpha`` to 1.3.
 
+Note: adjusting ``alpha`` will result in recomputing the **hard
+comptuation** of the single linkage tree.
+
 .. code:: python
 
     clusterer = hdbscan.HDBSCAN(min_cluster_size=60, min_samples=15, alpha=1.3).fit(data)
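
The notes added above point out that changing ``min_samples`` or ``alpha`` forces the hard computation (the single linkage tree) to be redone, whereas varying only ``min_cluster_size`` can reuse a cached tree via the ``memory`` parameter. Not part of the commit, but a minimal sketch of that caching workflow; the cache directory and dataset are arbitrary illustrative choices:

```python
# Sketch of caching the expensive single linkage computation with joblib via
# HDBSCAN's ``memory`` parameter, so that only the cheap flat cluster
# extraction is recomputed when min_cluster_size changes.
import hdbscan
from sklearn.datasets import make_blobs

data, _ = make_blobs(n_samples=2000, centers=10, random_state=42)

for size in (30, 60, 120):
    # Same min_samples each time: the cached tree is reused; only the flat
    # cluster extraction is redone for each min_cluster_size.
    clusterer = hdbscan.HDBSCAN(min_cluster_size=size, min_samples=15,
                                memory='./hdbscan_cache').fit(data)
    print(size, clusterer.labels_.max() + 1)
```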

hdbscan/dist_metrics.pyx

Lines changed: 18 additions & 15 deletions
@@ -1108,36 +1108,39 @@ cdef class ArccosDistance(DistanceMetric):
 #
 cdef class PyFuncDistance(DistanceMetric):
     """PyFunc Distance
-
     A user-defined distance
-
     Parameters
     ----------
     func : function
         func should take two numpy arrays as input, and return a distance.
     """
     def __init__(self, func, **kwargs):
         self.func = func
-        x = np.random.random(10)
-        try:
-            d = self.func(x, x, **kwargs)
-        except TypeError:
-            raise ValueError("func must be a callable taking two arrays")
-
-        try:
-            d = float(d)
-        except TypeError:
-            raise ValueError("func must return a float")
-
         self.kwargs = kwargs
 
+    # in cython < 0.26, GIL was required to be acquired during definition of
+    # the function and inside the body of the function. This behaviour is not
+    # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The
+    # only way to be back compatible is to inherit `dist` from the base class
+    # without GIL and called an inline `_dist` which acquire GIL.
     cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
-                             ITYPE_t size) except -1 with gil:
+                             ITYPE_t size) nogil except -1:
+        return self._dist(x1, x2, size)
+
+    cdef inline DTYPE_t _dist(self, DTYPE_t* x1, DTYPE_t* x2,
+                              ITYPE_t size) except -1 with gil:
         cdef np.ndarray x1arr
         cdef np.ndarray x2arr
         x1arr = _buffer_to_ndarray(x1, size)
         x2arr = _buffer_to_ndarray(x2, size)
-        return self.func(x1arr, x2arr, **self.kwargs)
+        d = self.func(x1arr, x2arr, **self.kwargs)
+        try:
+            # Cython generates code here that results in a TypeError
+            # if d is the wrong type.
+            return d
+        except TypeError:
+            raise TypeError("Custom distance function must accept two "
+                            "vectors and return a float.")
 
 
 cdef inline double fmax(double a, double b) nogil:
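
``PyFuncDistance`` is the wrapper behind user-supplied callable metrics, and the change above moves the GIL acquisition into an inner ``_dist`` for compatibility with newer Cython. Not part of the commit, but a small sketch of the kind of user code that exercises this path; whether a callable ``metric`` is routed through ``PyFuncDistance`` for a given algorithm choice is an assumption here:

```python
# Sketch of a user-defined distance of the sort PyFuncDistance wraps: a
# callable taking two 1-d numpy arrays and returning a float.
import numpy as np
import hdbscan

def taxicab(a, b):
    # Manhattan / L1 distance, returned as a plain Python float.
    return float(np.abs(a - b).sum())

data = np.random.RandomState(0).rand(200, 3)
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric=taxicab).fit(data)
print(np.unique(clusterer.labels_))
```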

hdbscan/hdbscan_.py

Lines changed: 3 additions & 0 deletions
@@ -503,6 +503,9 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
         min_samples = 1
 
     if algorithm != 'best':
+        if metric != 'precomputed' and issparse(X) and metric != 'generic':
+            raise ValueError("Sparse data matrices only support algorithm 'generic'.")
+
         if algorithm == 'generic':
             (single_linkage_tree,
              result_min_span_tree) = memory.cache(
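
Not part of the commit, but a sketch of the constraint the new check enforces: sparse feature matrices are only accepted on the ``generic`` code path, and other algorithm choices now fail fast with the ``ValueError`` above. Whether this particular metric and algorithm combination runs end to end on sparse input is an assumption here:

```python
# Sketch: sparse feature matrices must be paired with algorithm='generic'.
import numpy as np
import hdbscan
from scipy.sparse import csr_matrix

rng = np.random.RandomState(0)
dense = rng.rand(200, 50)
dense[dense < 0.9] = 0.0          # mostly zeros, so sparse storage is sensible
sparse = csr_matrix(dense)

clusterer = hdbscan.HDBSCAN(min_cluster_size=10, algorithm='generic').fit(sparse)
print(np.unique(clusterer.labels_))

# Something like hdbscan.HDBSCAN(algorithm='prims_kdtree').fit(sparse) would
# now raise: "Sparse data matrices only support algorithm 'generic'."
```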
