|
1 | 1 | """ |
2 | | -=============================================================== |
3 | | -Use LogisticRegression class with Celer and Prox-Newton solvers |
4 | | -=============================================================== |
| 2 | +================================================================== |
| 3 | +Compare LogisticRegression solver with sklearn's liblinear backend |
| 4 | +================================================================== |
5 | 5 | """ |
6 | 6 |
|
| 7 | +import time |
| 8 | +import warnings |
7 | 9 | import numpy as np |
8 | 10 | from numpy.linalg import norm |
| 11 | +import matplotlib.pyplot as plt |
9 | 12 | from sklearn import linear_model |
| 13 | +from libsvmdata import fetch_libsvm |
10 | 14 |
|
11 | 15 | from celer import LogisticRegression |
12 | | -from celer.datasets import fetch_ml_uci |
13 | 16 |
|
14 | | -dataset = "gisette_train" |
15 | | -X, y = fetch_ml_uci(dataset) |
# Both solvers are deliberately stopped early (small iteration budgets)
# so we can trace the objective over time; silence the resulting
# non-convergence warnings.
warnings.filterwarnings("ignore", message="Objective did not converge")
warnings.filterwarnings("ignore", message="Liblinear failed to converge")

X, y = fetch_libsvm("news20.binary")

# C_min is the largest inverse-regularization below which the l1-penalized
# logistic solution is identically zero; benchmark at a moderate multiple.
C_min = 2 / norm(X.T @ y, ord=np.inf)
C = 20 * C_min
| 25 | + |
| 26 | +def pobj_logreg(w): |
| 27 | + return np.sum(np.log(1 + np.exp(-y * (X @ w)))) + 1. / C * norm(w, ord=1) |
| 28 | + |
| 29 | + |
# Trace celer's Prox-Newton solver: refit from scratch with an increasing
# iteration budget, recording wall-clock time and primal objective.
pobj_celer = []
t_celer = []

for n_iter in range(10):
    start = time.time()
    clf = LogisticRegression(
        C=C, solver="celer-pn", max_iter=n_iter, tol=0).fit(X, y)
    elapsed = time.time() - start
    t_celer.append(elapsed)
    w_celer = clf.coef_.ravel()
    pobj_celer.append(pobj_logreg(w_celer))

pobj_celer = np.array(pobj_celer)
# Same protocol for scikit-learn's liblinear backend; the tiny tol makes
# the solver always exhaust its iteration budget.
pobj_libl = []
t_libl = []

for n_iter in np.arange(0, 50, 10):
    start = time.time()
    clf = linear_model.LogisticRegression(
        C=C, solver="liblinear", penalty='l1', fit_intercept=False,
        max_iter=n_iter, random_state=0, tol=1e-10).fit(X, y)
    elapsed = time.time() - start
    t_libl.append(elapsed)
    w_libl = clf.coef_.ravel()
    pobj_libl.append(pobj_logreg(w_libl))

pobj_libl = np.array(pobj_libl)

# Reference optimum: best objective value reached by either solver.
p_star = min(pobj_celer.min(), pobj_libl.min())
# Plot objective suboptimality against wall-clock time for both solvers.
plt.close("all")
fig, ax = plt.subplots(figsize=(4, 2), constrained_layout=True)
ax.semilogy(t_celer, pobj_celer - p_star, label="Celer-PN")
ax.semilogy(t_libl, pobj_libl - p_star, label="liblinear")
ax.set_xlabel("Time (s)")
ax.set_ylabel("objective suboptimality")
ax.legend()
plt.show(block=False)
0 commit comments