
Commit 399dfc6

ENH - implement Cox datafit with Breslow estimate (#157)
Co-authored-by: mathurinm <[email protected]>
1 parent: 7323e27 · commit: 399dfc6

File tree: 8 files changed, +312 additions, -6 deletions


.github/workflows/main.yml

Lines changed: 3 additions & 0 deletions

@@ -26,5 +26,8 @@ jobs:
         pip install .
         pip install statsmodels cvxopt
         pip install git+https://github.com/jolars/pyslope.git
+        # for testing Cox estimator
+        pip install lifelines
+        pip install pandas
     - name: Test with pytest
       run: pytest -v skglm/

LICENSE

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 BSD 3-Clause License
 
-Copyright (c) 2022, scikit-learn-contrib
+Copyright (c) 2023, scikit-learn-contrib
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without

doc/api.rst

Lines changed: 1 addition & 0 deletions

@@ -54,6 +54,7 @@ Datafits
 .. autosummary::
    :toctree: generated/
 
+   Cox
    Gamma
    Huber
    Logistic

skglm/datafits/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -1,12 +1,12 @@
 from .base import BaseDatafit, BaseMultitaskDatafit
-from .single_task import Quadratic, QuadraticSVC, Logistic, Huber, Poisson, Gamma
+from .single_task import Quadratic, QuadraticSVC, Logistic, Huber, Poisson, Gamma, Cox
 from .multi_task import QuadraticMultiTask
 from .group import QuadraticGroup, LogisticGroup
 
 
 __all__ = [
     BaseDatafit, BaseMultitaskDatafit,
-    Quadratic, QuadraticSVC, Logistic, Huber, Poisson, Gamma,
+    Quadratic, QuadraticSVC, Logistic, Huber, Poisson, Gamma, Cox,
     QuadraticMultiTask,
     QuadraticGroup, LogisticGroup
 ]
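With the export above, the new datafit becomes importable alongside the existing ones. A minimal smoke check (assuming skglm is installed from this commit):

    from skglm.datafits import Cox

    # the datafit carries no hyperparameters, see params_to_dict below
    datafit = Cox()
    print(datafit.params_to_dict())  # -> {}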

skglm/datafits/single_task.py

Lines changed: 102 additions & 0 deletions

@@ -544,3 +544,105 @@ def gradient_scalar_sparse(self, X_data, X_indptr, X_indices, y, Xw, j):
 
     def intercept_update_self(self, y, Xw):
         pass
+
+
+class Cox(BaseDatafit):
+    r"""Cox datafit for survival analysis with the Breslow estimate.
+
+    The datafit reads [1]
+
+    .. math::
+
+        \frac{1}{n_{\text{samples}}} \sum_{i=1}^{n_{\text{samples}}}
+        s_i \Big( -\langle x_i, w \rangle
+        + \log \sum_{j | tm_j \geq tm_i} e^{\langle x_j, w \rangle} \Big)
+
+    where :math:`s_i` indicates the censorship of sample :math:`i` and
+    :math:`tm` is the vector recording the times of event occurrence.
+
+    Defining the matrix :math:`B` with
+    :math:`B_{i,j} = 1` if :math:`tm_j \geq tm_i` and :math:`0` otherwise,
+    the datafit can be rewritten in the following compact form
+
+    .. math::
+
+        - \frac{1}{n_{\text{samples}}} \langle s, Xw \rangle
+        + \frac{1}{n_{\text{samples}}} \langle s, \log (B e^{Xw}) \rangle
+
+    Attributes
+    ----------
+    B : array-like, shape (n_samples, n_samples)
+        Matrix whose ``(i, j)`` entry (row, column) equals ``1``
+        if ``tm[j] >= tm[i]`` and ``0`` otherwise. This matrix is built
+        by the ``initialize`` method.
+
+    References
+    ----------
+    .. [1] DY Lin. On the Breslow estimator.
+        Lifetime Data Analysis, 13:471-480, 2007.
+    """
+
+    def __init__(self):
+        pass
+
+    def get_spec(self):
+        return (
+            ('B', float64[:, ::1]),
+        )
+
+    def params_to_dict(self):
+        return dict()
+
+    def value(self, y, w, Xw):
+        """Compute the value of the datafit."""
+        tm, s = y
+        n_samples = Xw.shape[0]
+
+        out = -(s @ Xw) + s @ np.log(self.B @ np.exp(Xw))
+        return out / n_samples
+
+    def raw_grad(self, y, Xw):
+        r"""Compute the gradient of the datafit w.r.t. ``Xw``.
+
+        The raw gradient reads
+
+            (-s + exp_Xw * (B.T @ (s / B_exp_Xw))) / n_samples
+
+        with ``B_exp_Xw = B @ exp_Xw``.
+        """
+        tm, s = y
+        n_samples = Xw.shape[0]
+
+        exp_Xw = np.exp(Xw)
+        B_exp_Xw = self.B @ exp_Xw
+
+        out = -s + exp_Xw * (self.B.T @ (s / B_exp_Xw))
+        return out / n_samples
+
+    def raw_hessian(self, y, Xw):
+        """Compute a diagonal upper bound of the datafit's Hessian w.r.t. ``Xw``.
+
+        The diagonal upper bound reads
+
+            exp_Xw * (B.T @ (s / B_exp_Xw)) / n_samples
+        """
+        tm, s = y
+        n_samples = Xw.shape[0]
+
+        exp_Xw = np.exp(Xw)
+        B_exp_Xw = self.B @ exp_Xw
+
+        out = exp_Xw * (self.B.T @ (s / B_exp_Xw))
+        return out / n_samples
+
+    def initialize(self, X, y):
+        """Initialize the datafit attributes."""
+        tm, s = y
+
+        tm_as_col = tm.reshape((-1, 1))
+        self.B = (tm >= tm_as_col).astype(X.dtype)
+
+    def initialize_sparse(self, X_data, X_indptr, X_indices, y):
+        """Initialize the datafit attributes in the sparse dataset case."""
+        tm, s = y
+
+        tm_as_col = tm.reshape((-1, 1))
+        self.B = (tm >= tm_as_col).astype(X_data.dtype)
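To make the compact form above concrete, here is a small self-contained NumPy sketch of the same computations (the toy ``tm``, ``s``, and ``w`` are made up for illustration; it mirrors ``initialize``, ``value``, and ``raw_grad`` without skglm's jit-compiled class):

    import numpy as np

    rng = np.random.default_rng(0)
    n_samples, n_features = 5, 3

    X = rng.standard_normal((n_samples, n_features))
    tm = rng.integers(1, 10, size=n_samples).astype(float)  # times of event occurrence
    s = rng.integers(0, 2, size=n_samples).astype(float)    # censorship indicators
    w = rng.standard_normal(n_features)
    Xw = X @ w

    # B[i, j] = 1 if tm[j] >= tm[i], as in Cox.initialize
    B = (tm >= tm[:, None]).astype(X.dtype)

    # datafit value: (-<s, Xw> + <s, log(B e^Xw)>) / n_samples
    value = (-(s @ Xw) + s @ np.log(B @ np.exp(Xw))) / n_samples

    # raw gradient w.r.t. Xw
    exp_Xw = np.exp(Xw)
    raw_grad = (-s + exp_Xw * (B.T @ (s / (B @ exp_Xw)))) / n_samples

Since ``tm[i] >= tm[i]`` always holds, the diagonal of ``B`` is all ones, hence ``B @ np.exp(Xw)`` is strictly positive and the logarithm is always well defined.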

skglm/tests/test_datafits.py

Lines changed: 56 additions & 1 deletion

@@ -1,14 +1,16 @@
 import numpy as np
+import scipy.optimize
 import pytest
 
 from sklearn.linear_model import HuberRegressor
 from numpy.testing import assert_allclose, assert_array_less
 
-from skglm.datafits import Huber, Logistic, Poisson, Gamma
+from skglm.datafits import Huber, Logistic, Poisson, Gamma, Cox
 from skglm.penalties import L1, WeightedL1
 from skglm.solvers import AndersonCD, ProxNewton
 from skglm import GeneralizedLinearEstimator
 from skglm.utils.data import make_correlated_data
+from skglm.utils.jit_compilation import compiled_clone
 
 
 @pytest.mark.parametrize('fit_intercept', [False, True])

@@ -114,5 +116,58 @@ def test_gamma():
     np.testing.assert_allclose(clf.coef_, gamma_results.params, rtol=1e-6)
 
 
+def test_cox():
+    rng = np.random.RandomState(1265)
+    n_samples, n_features = 10, 30
+
+    # generate data
+    X = rng.randn(n_samples, n_features)
+    tm = rng.choice(n_samples * n_features, size=n_samples, replace=True).astype(float)
+    s = rng.choice(2, size=n_samples).astype(float)
+    y = (tm, s)
+
+    # generate dummy w, Xw
+    w = rng.randn(n_features)
+    Xw = X @ w
+
+    # check datafit
+    cox_df = compiled_clone(Cox())
+
+    cox_df.initialize(X, (tm, s))
+    cox_df.value(y, w, Xw)
+
+    # run the test 10 times to account for the truncation errors
+    # of the finite differences used to evaluate the grad and Hessian
+    for _ in range(10):
+
+        # generate dummy w, Xw
+        w = rng.randn(n_features)
+        Xw = X @ w
+
+        # check gradient
+        np.testing.assert_allclose(
+            scipy.optimize.check_grad(
+                lambda x: cox_df.value(y, w, x),
+                lambda x: cox_df.raw_grad(y, x),
+                x0=Xw,
+                seed=rng
+            ),
+            0., atol=1e-6
+        )
+
+        # check Hessian upper bound:
+        # the Hessian minus its upper bound must be negative semi-definite
+        hess_upper_bound = np.diag(cox_df.raw_hessian(y, Xw))
+        hess = scipy.optimize.approx_fprime(
+            xk=Xw,
+            f=lambda x: cox_df.raw_grad(y, x),
+        )
+
+        positive_eig = np.linalg.eigh(hess - hess_upper_bound)[0]
+        positive_eig = positive_eig[positive_eig >= 0.]
+
+        np.testing.assert_allclose(positive_eig, 0., atol=1e-6)
+
+
 if __name__ == '__main__':
     pass
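A note on the last check: differentiating ``raw_grad`` once more splits the Hessian of the datafit with respect to ``Xw`` into the diagonal matrix returned by ``raw_hessian`` minus a positive semi-definite correction, so their difference must be negative semi-definite up to finite-difference error. A sketch of the argument, in the notation of the docstrings above (with :math:`u = Xw` and :math:`n = n_{\text{samples}}`):

.. math::

    \nabla^2 F(u)
    = \frac{1}{n} \operatorname{diag}\Big(e^{u} \odot B^\top \frac{s}{B e^{u}}\Big)
    - \frac{1}{n} D^\top \operatorname{diag}\Big(\frac{s}{(B e^{u})^{2}}\Big) D,
    \qquad D = B \operatorname{diag}(e^{u})

Since the entries of :math:`s` are non-negative, the second term is positive semi-definite, which yields the diagonal upper bound in the Loewner order.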

skglm/tests/test_estimators.py

Lines changed: 90 additions & 2 deletions

@@ -16,14 +16,18 @@
 
 from scipy.sparse import csc_matrix, issparse
 
-from skglm.utils.data import make_correlated_data
+from skglm.utils.data import make_correlated_data, make_dummy_survival_data
 from skglm.estimators import (
     GeneralizedLinearEstimator, Lasso, MultiTaskLasso, WeightedLasso, ElasticNet,
     MCPRegression, SparseLogisticRegression, LinearSVC)
-from skglm.datafits import Logistic, Quadratic, QuadraticSVC, QuadraticMultiTask
+from skglm.datafits import Logistic, Quadratic, QuadraticSVC, QuadraticMultiTask, Cox
 from skglm.penalties import L1, IndicatorBox, L1_plus_L2, MCPenalty, WeightedL1
 from skglm.solvers import AndersonCD
 
+import pandas as pd
+from skglm.solvers import ProxNewton
+from skglm.utils.jit_compilation import compiled_clone
+
 
 n_samples = 50
 n_tasks = 9

@@ -164,6 +168,90 @@ def test_mtl_path():
     np.testing.assert_allclose(coef_ours, coef_sk, rtol=1e-5)
 
 
+def test_CoxEstimator():
+    try:
+        from lifelines import CoxPHFitter
+    except ModuleNotFoundError:
+        pytest.xfail(
+            "Testing the Cox estimator requires the `lifelines` package\n"
+            "Run `pip install lifelines`"
+        )
+
+    reg = 1e-2
+    # norms of solutions differ when n_features > n_samples
+    n_samples, n_features = 100, 30
+    random_state = 1265
+
+    tm, s, X = make_dummy_survival_data(n_samples, n_features,
+                                        normalize=True, random_state=random_state)
+
+    # compute alpha_max
+    B = (tm >= tm[:, None]).astype(X.dtype)
+    grad_0 = -s + B.T @ (s / np.sum(B, axis=1))
+    alpha_max = norm(X.T @ grad_0, ord=np.inf) / n_samples
+
+    alpha = reg * alpha_max
+
+    # fit Cox using the ProxNewton solver
+    datafit = compiled_clone(Cox())
+    penalty = compiled_clone(L1(alpha))
+
+    datafit.initialize(X, (tm, s))
+
+    w, *_ = ProxNewton(
+        fit_intercept=False, tol=1e-6, max_iter=50
+    ).solve(
+        X, (tm, s), datafit, penalty
+    )
+
+    # fit lifelines estimator
+    stacked_tm_s_X = np.hstack((tm[:, None], s[:, None], X))
+    df = pd.DataFrame(stacked_tm_s_X)
+
+    estimator = CoxPHFitter(penalizer=alpha, l1_ratio=1.)
+    estimator.fit(
+        df, duration_col=0, event_col=1,
+        fit_options={"max_steps": 10_000, "precision": 1e-12}
+    )
+    w_ll = estimator.params_.values
+
+    p_obj_skglm = datafit.value((tm, s), w, X @ w) + penalty.value(w)
+    p_obj_ll = datafit.value((tm, s), w_ll, X @ w_ll) + penalty.value(w_ll)
+
+    # objectives must match even though the norms of the solutions may differ
+    np.testing.assert_allclose(p_obj_skglm, p_obj_ll, atol=1e-6)
+
+
+def test_CoxEstimator_sparse():
+    reg = 1e-2
+    n_samples, n_features = 100, 30
+    X_density, random_state = 0.5, 1265
+
+    tm, s, X = make_dummy_survival_data(n_samples, n_features, X_density=X_density,
+                                        random_state=random_state)
+
+    # compute alpha_max
+    B = (tm >= tm[:, None]).astype(X.dtype)
+    grad_0 = -s + B.T @ (s / np.sum(B, axis=1))
+    alpha_max = norm(X.T @ grad_0, ord=np.inf) / n_samples
+
+    alpha = reg * alpha_max
+
+    # fit Cox using the ProxNewton solver
+    datafit = compiled_clone(Cox())
+    penalty = compiled_clone(L1(alpha))
+
+    datafit.initialize_sparse(X.data, X.indptr, X.indices, (tm, s))
+
+    *_, stop_crit = ProxNewton(
+        fit_intercept=False, tol=1e-6, max_iter=50
+    ).solve(
+        X, (tm, s), datafit, penalty
+    )
+
+    np.testing.assert_allclose(stop_crit, 0., atol=1e-6)
+
+
 # Test if GeneralizedLinearEstimator returns the correct coefficients
 @pytest.mark.parametrize("Datafit, Penalty, Estimator, pen_args", [
     (Quadratic, L1, Lasso, [alpha]),
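For context on the ``alpha_max`` computed in both tests: with an L1 penalty, :math:`w = 0` solves the penalized problem exactly when :math:`\alpha \geq \lVert \nabla_w F(0) \rVert_\infty`, and since :math:`e^{X \cdot 0} = \mathbf{1}`, the Cox raw gradient at zero reduces to the ``grad_0`` expression above:

.. math::

    \alpha_{\max}
    = \frac{1}{n_{\text{samples}}}
      \Big\lVert X^\top \Big( -s + B^\top \frac{s}{B \mathbf{1}} \Big) \Big\rVert_\infty

Taking ``alpha = reg * alpha_max`` with ``reg = 1e-2`` therefore guarantees a nonzero solution while keeping the problem well regularized.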
