
Commit 0cee1e6

TimotheeMathieu and rth authored

Speed tweaks for Robust algorithms & make examples faster (#77)

* Creation of one class for each task: classification, regression and clustering. Update doc, test, examples. Fix typo
* Update plot clustering
* Update test_common
* Fix test and cross_val
* fix doc function RobustWeightedEstimators
* fix _RobustWeightedEstimator doc and RobustWeightedKMeans doc
* Black reformatted
* Add Huber loss
* reformat black
* Changes suggested by @rth in PR
* Add stop criterion and change for faster examples
* Subsample clustering
* fix clustering example
* cython kmeans loss
* fix black
* type conversion cython
* fix setup.py windows
* fix black

Co-authored-by: Roman Yurchak <[email protected]>

1 parent: 1f626fa

6 files changed: +179 additions, −48 deletions

examples/plot_clustering.py

Lines changed: 3 additions & 3 deletions

@@ -55,15 +55,15 @@
 )


-for n_samples in [300, 3000]:
+for n_samples in [300, 600]:
     # Construct the dataset
     X, labels_true = make_blobs(
         n_samples=n_samples, centers=centers, cluster_std=0.4, random_state=rng
     )

     # Change the first 1% entries to outliers
     for f in range(int(n_samples / 100)):
-        X[f] = [20, 3] + rng.normal(size=2) * 0.1
+        X[f] = [10, 3] + rng.normal(size=2) * 0.1
     # Shuffle the data so that we don't know where the outlier is.
     X = shuffle(X, random_state=rng)

@@ -73,7 +73,7 @@
         eta0=0.01,
         weighting="mom",
         max_iter=100,
-        k=int(n_samples / 50),
+        k=int(n_samples / 20),
         random_state=rng,
     )
     bandwidth = cluster.estimate_bandwidth(X, 0.2)
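For context, a minimal standalone sketch of the updated settings (the parameters shown in the diff are taken from it; n_clusters and the blob centers are assumptions for illustration):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.utils import shuffle
from sklearn_extra.robust import RobustWeightedKMeans

rng = np.random.RandomState(42)
n_samples = 300
centers = [[1, 1], [-1, -1], [1, -1]]  # assumed blob centers
X, _ = make_blobs(
    n_samples=n_samples, centers=centers, cluster_std=0.4, random_state=rng
)
# Corrupt the first 1% of samples, as the updated example does.
for f in range(int(n_samples / 100)):
    X[f] = [10, 3] + rng.normal(size=2) * 0.1
X = shuffle(X, random_state=rng)

kmeans = RobustWeightedKMeans(
    n_clusters=3,           # assumption: one cluster per blob
    eta0=0.01,
    weighting="mom",        # median-of-means weighting
    max_iter=100,
    k=int(n_samples / 20),  # number of MOM blocks, raised in this commit
    random_state=rng,
)
keans_labels = kmeans.fit(X).labels_

Shrinking the largest sample size from 3000 to 600 while raising k presumably keeps each median-of-means block small enough that the example runs in seconds rather than minutes.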

examples/plot_robust_classification_diabete.py

Lines changed: 5 additions & 2 deletions

@@ -3,9 +3,8 @@
 ======================================================================
 A demo of Robust Classification on real dataset "diabetes" from OpenML
 ======================================================================
-In this example we compare the RobustWeightedCLassifier
+In this example we compare the RobustWeightedCLassifier
 for classification on the real dataset "diabetes".
-WARNING: running this example can take some time (<1hour).
 We only compare the estimator with SGDClassifier as there is no robust
 classification estimator in scikit-learn.
 """

@@ -18,6 +17,10 @@
 from sklearn.model_selection import cross_val_score
 from sklearn.preprocessing import RobustScaler

+import warnings
+
+warnings.simplefilter(action="ignore", category=FutureWarning)
+
 X, y = fetch_openml(name="diabetes", return_X_y=True)

 # replace the label names with 0 or 1
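The added filter only silences the FutureWarnings that fetch_openml can emit. A minimal sketch of the comparison this example runs (the label encoding, estimator settings, and cv value here are assumptions, not taken from the diff):

import warnings

import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn_extra.robust import RobustWeightedClassifier

warnings.simplefilter(action="ignore", category=FutureWarning)

X, y = fetch_openml(name="diabetes", return_X_y=True)
# Replace the label names with 0 or 1 (the exact class string is an assumption).
y = (np.asarray(y) == "tested_positive").astype(int)
X = RobustScaler().fit_transform(X)

for name, est in [
    ("SGDClassifier", SGDClassifier()),
    ("RobustWeightedClassifier", RobustWeightedClassifier(weighting="huber")),
]:
    scores = cross_val_score(est, X, y, cv=5)
    print(name, scores.mean())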

examples/plot_robust_regression_california_houses.py

Lines changed: 8 additions & 14 deletions

@@ -5,7 +5,6 @@
 ================================================================
 In this example we compare the RobustWeightedRegressor to other scikit-learn
 regressors on the real dataset california housing.
-WARNING: running this example can take some time (<1 hour on recent computer).

 One of the main point of this example is the importance of taking into account
 outliers in the test dataset when dealing with real datasets.

@@ -36,6 +35,9 @@ def quadratic_loss(est, X, y, X_test, y_test):


 X, y = fetch_california_housing(return_X_y=True)
+# Sub-sample for faster computation.
+X = X[:1000]
+y = y[:1000]

 # Scale the dataset with sklearn RobustScaler (important for this algorithm)
 X = RobustScaler().fit_transform(X)

@@ -46,25 +48,17 @@ def quadratic_loss(est, X, y, X_test, y_test):
 estimators = [
     (
         "SGD",
-        SGDRegressor(
-            learning_rate="adaptive",
-            eta0=1e-6,
-            max_iter=2000,
-            n_iter_no_change=100,
-        ),
+        SGDRegressor(learning_rate="adaptive", eta0=1e-2),
     ),
     (
         "RobustWeightedRegressor",
         RobustWeightedRegressor(
             weighting="huber",
-            c=0.5,
-            eta0=1e-6,
-            max_iter=500,
+            c=0.01,
+            eta0=1e-2,
             sgd_args={
-                "max_iter": 1000,
-                "n_iter_no_change": 100,
                 "learning_rate": "adaptive",
-                "eta0": 1e-6,
+                "eta0": 1e-3,
             },
         ),
     ),

@@ -95,7 +89,7 @@ def quadratic_loss(est, X, y, X_test, y_test):
         res[i, f, 0] = np.mean(cv)
         res[i, f, 1] = np.median(cv)

-fig, (axe1, axe2) = plt.subplots(2, 1)
+fig, (axe1, axe2) = plt.subplots(1, 2)
 names = [name for name, est in estimators]

 axe1.boxplot(res[:, :, 0].T, labels=names)
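Plotting both the mean and the median of the cross-validation errors is the point the example makes about outliers in the test set: one contaminated fold can dominate the mean while leaving the median nearly untouched. A toy illustration (the numbers are invented):

import numpy as np

# Hypothetical per-fold quadratic losses: one fold contains test outliers.
cv = np.array([0.40, 0.45, 0.42, 0.44, 9.80])
print(np.mean(cv))    # ~2.30, dominated by the bad fold
print(np.median(cv))  # 0.44, close to the typical error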

setup.py

Lines changed: 9 additions & 0 deletions

@@ -54,6 +54,9 @@
         "matplotlib",
     ],
 }
+libraries = []
+if os.name == "posix":
+    libraries.append("m")

 args = {
     "ext_modules": cythonize(

@@ -63,6 +66,12 @@
             ["sklearn_extra/utils/_cyfht.pyx"],
             include_dirs=[np.get_include()],
         ),
+        Extension(
+            "sklearn_extra.robust._robust_weighted_estimator_helper",
+            ["sklearn_extra/robust/_robust_weighted_estimator_helper.pyx"],
+            include_dirs=[np.get_include()],
+            libraries=libraries,
+        ),
         Extension(
             "sklearn_extra.cluster._commonnn_inner",
             ["sklearn_extra/cluster/_commonnn_inner.pyx"],
sklearn_extra/robust/_robust_weighted_estimator_helper.pyx (new file, per the Extension added in setup.py above)

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
# cython: infer_types=True
# Helper for RobustWeightedEstimator: fast per-sample k-means loss.
# Author: Timothée Mathieu
# License: 3-clause BSD

cimport cython
import numpy as np
cimport numpy as np

from sklearn.utils.extmath import row_norms
from cython cimport floating

# Modified from sklearn.cluster._k_means_fast.pyx
np.import_array()

cdef floating _euclidean_dense_dense(
        floating* a,  # IN
        floating* b,  # IN
        int n_features) nogil:
    """Squared Euclidean distance between a dense and b dense"""
    cdef:
        int i
        int n = n_features // 4
        int rem = n_features % 4
        floating result = 0

    # We manually unroll the loop for better cache optimization.
    for i in range(n):
        result += ((a[0] - b[0]) * (a[0] - b[0])
                   + (a[1] - b[1]) * (a[1] - b[1])
                   + (a[2] - b[2]) * (a[2] - b[2])
                   + (a[3] - b[3]) * (a[3] - b[3]))
        a += 4; b += 4

    for i in range(rem):
        result += (a[i] - b[i]) * (a[i] - b[i])

    return result


cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X,
                                        int[:] labels):
    """Compute inertia:

    squared distance between each sample and its assigned center.
    """
    if floating is float:
        dtype = np.float32
    elif floating is double:
        dtype = np.double

    cdef:
        int n_samples = X.shape[0]
        int n_features = X.shape[1]
        int i, j
        int n_classes = len(np.unique(labels))
        np.ndarray[floating, ndim=2] centers = np.zeros([n_classes,
                                                         n_features],
                                                        dtype=dtype)
        np.ndarray[long] num_in_cluster = np.zeros(n_classes, dtype=int)
        np.ndarray[floating] inertias = np.zeros(n_samples, dtype=dtype)
    # Accumulate per-cluster sums and counts.
    for i in range(n_samples):
        for j in range(n_features):
            centers[labels[i], j] += X[i, j]
        num_in_cluster[labels[i]] += 1

    # Turn the sums into cluster means.
    for i in range(n_classes):
        for j in range(n_features):
            centers[i, j] /= num_in_cluster[i]

    for i in range(n_samples):
        j = labels[i]
        inertias[i] = _euclidean_dense_dense(&X[i, 0], &centers[j, 0], n_features)
    return inertias
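For readers who prefer NumPy to Cython, the following sketch computes the same quantity as _kmeans_loss above: cluster means are accumulated from the labels, then each sample's squared distance to its assigned mean is returned. Like the Cython code, it assumes labels are consecutive integers starting at 0.

import numpy as np

def kmeans_loss_reference(X, labels):
    """Per-sample inertia: squared distance to the assigned cluster mean."""
    X = np.asarray(X, dtype=np.float64)
    labels = np.asarray(labels)
    n_classes = len(np.unique(labels))
    # Accumulate cluster sums and counts, mirroring the Cython loops.
    centers = np.zeros((n_classes, X.shape[1]))
    counts = np.bincount(labels, minlength=n_classes)
    np.add.at(centers, labels, X)
    centers /= counts[:, None]
    diff = X - centers[labels]
    return np.einsum("ij,ij->i", diff, diff)

X = np.array([[0.0, 0.0], [0.0, 2.0], [5.0, 5.0]])
print(kmeans_loss_reference(X, np.array([0, 0, 1])))  # [1. 1. 0.]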
