
Commit 0cee1e6

TimotheeMathieu and rth authored

Speed tweaks for Robust algorithms & make examples faster (#77)

* Creation of one class for each task: classification, regression and clustering. Update doc, test, examples. Fix typo
* Update plot clustering
* Update test_common
* Fix test and cross_val
* fix doc function RobustWeightedEstimators
* fix _RobustWeightedEstimator doc and RobustWeightedKMeans doc
* Black reformatted
* Add Huber loss
* reformat black
* Changes suggested by @rth in PR
* Add stop criterion and change for faster examples
* Subsample clustering
* fix clustering example
* cython kmeans loss
* fix black
* type conversion cython
* fix setup.py windows
* fix black

Co-authored-by: Roman Yurchak <[email protected]>

1 parent: 1f626fa

6 files changed: +179 additions, −48 deletions

examples/plot_clustering.py

Lines changed: 3 additions & 3 deletions

@@ -55,15 +55,15 @@
 )


-for n_samples in [300, 3000]:
+for n_samples in [300, 600]:
     # Construct the dataset
     X, labels_true = make_blobs(
         n_samples=n_samples, centers=centers, cluster_std=0.4, random_state=rng
     )

     # Change the first 1% entries to outliers
     for f in range(int(n_samples / 100)):
-        X[f] = [20, 3] + rng.normal(size=2) * 0.1
+        X[f] = [10, 3] + rng.normal(size=2) * 0.1
     # Shuffle the data so that we don't know where the outlier is.
     X = shuffle(X, random_state=rng)

@@ -73,7 +73,7 @@
         eta0=0.01,
         weighting="mom",
         max_iter=100,
-        k=int(n_samples / 50),
+        k=int(n_samples / 20),
         random_state=rng,
     )
     bandwidth = cluster.estimate_bandwidth(X, 0.2)
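For context, a minimal standalone sketch of the updated settings (the parameters shown in the diff are taken from it; n_clusters and the blob centers are assumptions for illustration):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.utils import shuffle
from sklearn_extra.robust import RobustWeightedKMeans

rng = np.random.RandomState(42)
n_samples = 300
centers = [[1, 1], [-1, -1], [1, -1]]  # assumed blob centers
X, _ = make_blobs(
    n_samples=n_samples, centers=centers, cluster_std=0.4, random_state=rng
)
# Corrupt the first 1% of samples, as the updated example does.
for f in range(int(n_samples / 100)):
    X[f] = [10, 3] + rng.normal(size=2) * 0.1
X = shuffle(X, random_state=rng)

kmeans = RobustWeightedKMeans(
    n_clusters=3,           # assumption: one cluster per blob
    eta0=0.01,
    weighting="mom",        # median-of-means weighting
    max_iter=100,
    k=int(n_samples / 20),  # number of MOM blocks, raised in this commit
    random_state=rng,
)
keans_labels = kmeans.fit(X).labels_

Shrinking the largest sample size from 3000 to 600 while raising k presumably keeps each median-of-means block small enough that the example runs in seconds rather than minutes.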

examples/plot_robust_classification_diabete.py

Lines changed: 5 additions & 2 deletions

@@ -3,9 +3,8 @@
 ======================================================================
 A demo of Robust Classification on real dataset "diabetes" from OpenML
 ======================================================================
-In this example we compare the RobustWeightedCLassifier
+In this example we compare the RobustWeightedCLassifier
 for classification on the real dataset "diabetes".
-WARNING: running this example can take some time (<1hour).
 We only compare the estimator with SGDClassifier as there is no robust
 classification estimator in scikit-learn.
 """

@@ -18,6 +17,10 @@
 from sklearn.model_selection import cross_val_score
 from sklearn.preprocessing import RobustScaler

+import warnings
+
+warnings.simplefilter(action="ignore", category=FutureWarning)
+
 X, y = fetch_openml(name="diabetes", return_X_y=True)

 # replace the label names with 0 or 1
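The added filter only silences the FutureWarnings that fetch_openml can emit. A minimal sketch of the comparison this example runs (the label encoding, estimator settings, and cv value here are assumptions, not taken from the diff):

import warnings

import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn_extra.robust import RobustWeightedClassifier

warnings.simplefilter(action="ignore", category=FutureWarning)

X, y = fetch_openml(name="diabetes", return_X_y=True)
# Replace the label names with 0 or 1 (the exact class string is an assumption).
y = (np.asarray(y) == "tested_positive").astype(int)
X = RobustScaler().fit_transform(X)

for name, est in [
    ("SGDClassifier", SGDClassifier()),
    ("RobustWeightedClassifier", RobustWeightedClassifier(weighting="huber")),
]:
    scores = cross_val_score(est, X, y, cv=5)
    print(name, scores.mean())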

examples/plot_robust_regression_california_houses.py

Lines changed: 8 additions & 14 deletions

@@ -5,7 +5,6 @@
 ================================================================
 In this example we compare the RobustWeightedRegressor to other scikit-learn
 regressors on the real dataset california housing.
-WARNING: running this example can take some time (<1 hour on recent computer).

 One of the main point of this example is the importance of taking into account
 outliers in the test dataset when dealing with real datasets.

@@ -36,6 +35,9 @@ def quadratic_loss(est, X, y, X_test, y_test):


 X, y = fetch_california_housing(return_X_y=True)
+# Sub-sample for faster computation.
+X = X[:1000]
+y = y[:1000]

 # Scale the dataset with sklearn RobustScaler (important for this algorithm)
 X = RobustScaler().fit_transform(X)

@@ -46,25 +48,17 @@ def quadratic_loss(est, X, y, X_test, y_test):
 estimators = [
     (
         "SGD",
-        SGDRegressor(
-            learning_rate="adaptive",
-            eta0=1e-6,
-            max_iter=2000,
-            n_iter_no_change=100,
-        ),
+        SGDRegressor(learning_rate="adaptive", eta0=1e-2),
     ),
     (
         "RobustWeightedRegressor",
         RobustWeightedRegressor(
             weighting="huber",
-            c=0.5,
-            eta0=1e-6,
-            max_iter=500,
+            c=0.01,
+            eta0=1e-2,
             sgd_args={
-                "max_iter": 1000,
-                "n_iter_no_change": 100,
                 "learning_rate": "adaptive",
-                "eta0": 1e-6,
+                "eta0": 1e-3,
             },
         ),
     ),

@@ -95,7 +89,7 @@ def quadratic_loss(est, X, y, X_test, y_test):
         res[i, f, 0] = np.mean(cv)
         res[i, f, 1] = np.median(cv)

-fig, (axe1, axe2) = plt.subplots(2, 1)
+fig, (axe1, axe2) = plt.subplots(1, 2)
 names = [name for name, est in estimators]

 axe1.boxplot(res[:, :, 0].T, labels=names)
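Plotting both the mean and the median of the cross-validation errors is the point the example makes about outliers in the test set: one contaminated fold can dominate the mean while leaving the median nearly untouched. A toy illustration (the numbers are invented):

import numpy as np

# Hypothetical per-fold quadratic losses: one fold contains test outliers.
cv = np.array([0.40, 0.45, 0.42, 0.44, 9.80])
print(np.mean(cv))    # ~2.30, dominated by the bad fold
print(np.median(cv))  # 0.44, close to the typical error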

setup.py

Lines changed: 9 additions & 0 deletions

@@ -54,6 +54,9 @@
         "matplotlib",
     ],
 }
+libraries = []
+if os.name == "posix":
+    libraries.append("m")

 args = {
     "ext_modules": cythonize(

@@ -63,6 +66,12 @@
             ["sklearn_extra/utils/_cyfht.pyx"],
             include_dirs=[np.get_include()],
         ),
+        Extension(
+            "sklearn_extra.robust._robust_weighted_estimator_helper",
+            ["sklearn_extra/robust/_robust_weighted_estimator_helper.pyx"],
+            include_dirs=[np.get_include()],
+            libraries=libraries,
+        ),
         Extension(
             "sklearn_extra.cluster._commonnn_inner",
             ["sklearn_extra/cluster/_commonnn_inner.pyx"],
sklearn_extra/robust/_robust_weighted_estimator_helper.pyx (new file, per the Extension added in setup.py above)

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
# cython: infer_types=True
# Helper for RobustWeightedEstimator: fast per-sample k-means loss.
# Author: Timothée Mathieu
# License: 3-clause BSD

cimport cython
import numpy as np
cimport numpy as np

from sklearn.utils.extmath import row_norms
from cython cimport floating

# Modified from sklearn.cluster._k_means_fast.pyx
np.import_array()

cdef floating _euclidean_dense_dense(
        floating* a,  # IN
        floating* b,  # IN
        int n_features) nogil:
    """Squared Euclidean distance between a dense and b dense"""
    cdef:
        int i
        int n = n_features // 4
        int rem = n_features % 4
        floating result = 0

    # We manually unroll the loop for better cache optimization.
    for i in range(n):
        result += ((a[0] - b[0]) * (a[0] - b[0])
                   + (a[1] - b[1]) * (a[1] - b[1])
                   + (a[2] - b[2]) * (a[2] - b[2])
                   + (a[3] - b[3]) * (a[3] - b[3]))
        a += 4; b += 4

    for i in range(rem):
        result += (a[i] - b[i]) * (a[i] - b[i])

    return result


cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X,
                                        int[:] labels):
    """Compute inertia:

    squared distance between each sample and its assigned center.
    """
    if floating is float:
        dtype = np.float32
    elif floating is double:
        dtype = np.double

    cdef:
        int n_samples = X.shape[0]
        int n_features = X.shape[1]
        int i, j
        int n_classes = len(np.unique(labels))
        np.ndarray[floating, ndim=2] centers = np.zeros([n_classes,
                                                         n_features],
                                                        dtype=dtype)
        np.ndarray[long] num_in_cluster = np.zeros(n_classes, dtype=int)
        np.ndarray[floating] inertias = np.zeros(n_samples, dtype=dtype)
    # Accumulate per-cluster sums and counts.
    for i in range(n_samples):
        for j in range(n_features):
            centers[labels[i], j] += X[i, j]
        num_in_cluster[labels[i]] += 1

    # Turn the sums into cluster means.
    for i in range(n_classes):
        for j in range(n_features):
            centers[i, j] /= num_in_cluster[i]

    for i in range(n_samples):
        j = labels[i]
        inertias[i] = _euclidean_dense_dense(&X[i, 0], &centers[j, 0], n_features)
    return inertias
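For readers who prefer NumPy to Cython, the following sketch computes the same quantity as _kmeans_loss above: cluster means are accumulated from the labels, then each sample's squared distance to its assigned mean is returned. Like the Cython code, it assumes labels are consecutive integers starting at 0.

import numpy as np

def kmeans_loss_reference(X, labels):
    """Per-sample inertia: squared distance to the assigned cluster mean."""
    X = np.asarray(X, dtype=np.float64)
    labels = np.asarray(labels)
    n_classes = len(np.unique(labels))
    # Accumulate cluster sums and counts, mirroring the Cython loops.
    centers = np.zeros((n_classes, X.shape[1]))
    counts = np.bincount(labels, minlength=n_classes)
    np.add.at(centers, labels, X)
    centers /= counts[:, None]
    diff = X - centers[labels]
    return np.einsum("ij,ij->i", diff, diff)

X = np.array([[0.0, 0.0], [0.0, 2.0], [5.0, 5.0]])
print(kmeans_loss_reference(X, np.array([0, 0, 1])))  # [1. 1. 0.]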
