Commit 395af5e

ENH - Implement Cox with Efron estimate (#159)
Co-authored-by: mathurinm <[email protected]>
1 parent 9bce414 commit 395af5e

File tree

5 files changed (+221, -56 lines)

examples/plot_survival_analysis.py

Lines changed: 99 additions & 5 deletions
```diff
@@ -73,15 +73,17 @@
 datafit.initialize(X, (tm, s))

 # init solver
-solver = ProxNewton(fit_intercept=False, max_iter=50,)
+solver = ProxNewton(fit_intercept=False, max_iter=50)

 # solve the problem
 w_sk = solver.solve(X, (tm, s), datafit, penalty)[0]

 # %%
 # For this data and regularization value, a relatively sparse solution is found:
-print(f"Number of nonzero coefficients in solution: {(w_sk != 0).sum()} out of {len(w_sk)}.")
-
+print(
+    "Number of nonzero coefficients in solution: "
+    f"{(w_sk != 0).sum()} out of {len(w_sk)}."
+)

 # %%
 # Let's solve the problem with ``lifelines`` through its ``CoxPHFitter``
```
```diff
@@ -195,11 +197,103 @@
 ax.set_ylabel("objective suboptimality")
 _ = ax.set_xlabel("time in seconds")

-
-
 # %%
 # According to the printed ratio, using ``skglm`` we get the same result as
 # ``lifelines`` more than 100x faster!
 speed_up = records["lifelines"]["times"][-1] / records["skglm"]["times"][-1]
 print(f"speed up ratio: {speed_up:.0f}")

+# %%
+# Efron estimate
+# --------------
+#
+# The previous results, namely the closeness of the solutions and the timings,
+# carry over to the case of tied observations handled with the Efron estimate.
+#
+# Let's start by generating data with tied observations. This can be achieved
+# by passing ``with_ties=True`` to the ``make_dummy_survival_data`` function.
+tm, s, X = make_dummy_survival_data(
+    n_samples, n_features,
+    normalize=True,
+    with_ties=True,
+    random_state=0
+)
+
+# check the data has tied observations
+print(f"Number of unique times {len(np.unique(tm))} out of {n_samples}")
+
+# %%
+# It is straightforward to fit an :math:`\ell_1` Cox estimator with the Efron estimate.
+# We only need to pass ``use_efron=True`` to the ``Cox`` datafit.
+
+# ensure using Efron estimate
+datafit = compiled_clone(Cox(use_efron=True))
+datafit.initialize(X, (tm, s))
+
+# solve the problem
+solver = ProxNewton(fit_intercept=False, max_iter=50)
+w_sk = solver.solve(X, (tm, s), datafit, penalty)[0]
+
+# %%
+# Again, a relatively sparse solution is found:
+print(
+    "Number of nonzero coefficients in solution: "
+    f"{(w_sk != 0).sum()} out of {len(w_sk)}."
+)
+
+# %%
+# Let's do the same with ``lifelines`` and compare the results
+
+# format data
+stacked_tm_s_X = np.hstack((tm[:, None], s[:, None], X))
+df = pd.DataFrame(stacked_tm_s_X)
+
+# fit lifelines estimator on the new data
+lifelines_estimator = CoxPHFitter(penalizer=alpha, l1_ratio=1.).fit(
+    df,
+    duration_col=0,
+    event_col=1
+)
+w_ll = lifelines_estimator.params_.values
+
+# Check that both solvers find solutions with the same objective value
+obj_sk = datafit.value((tm, s), w_sk, X @ w_sk) + penalty.value(w_sk)
+obj_ll = datafit.value((tm, s), w_ll, X @ w_ll) + penalty.value(w_ll)
+
+print(f"Objective skglm: {obj_sk:.6f}")
+print(f"Objective lifelines: {obj_ll:.6f}")
+print(f"Difference: {(obj_sk - obj_ll):.2e}")
+
+# Check that both solutions are close
+print(f"Euclidean distance between solutions: {np.linalg.norm(w_sk - w_ll):.3e}")
+
+# %%
+# Finally, let's compare the timings of both solvers
+
+# time skglm
+start = time.perf_counter()
+solver.solve(X, (tm, s), datafit, penalty)[0]
+end = time.perf_counter()
+
+total_time_skglm = end - start
+
+# time lifelines
+lifelines_estimator = CoxPHFitter(penalizer=alpha, l1_ratio=1.)
+
+start = time.perf_counter()
+lifelines_estimator.fit(
+    df,
+    duration_col=0,
+    event_col=1
+)
+end = time.perf_counter()
+
+total_time_lifelines = end - start
+
+# deduce the speed up ratio
+speed_up = total_time_lifelines / total_time_skglm
+print(f"speed up ratio: {speed_up:.0f}")
+
+# %%
+# As shown by the last print, we still preserve the roughly 100x speed up
+# even with the Efron estimate.
```
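As a companion to the example above, here is a minimal standalone sketch of the objective that ``use_efron=True`` optimizes. It is written for this page and is not part of the commit; the helper name ``efron_neg_loglik`` is hypothetical. Up to numerical error, its output should match ``datafit.value((tm, s), w_sk, X @ w_sk)``.

```python
import numpy as np

def efron_neg_loglik(tm, s, Xw):
    # hypothetical helper, not part of skglm: negative Efron partial
    # log-likelihood, normalized by n_samples like the Cox datafit
    n_samples = Xw.shape[0]
    exp_Xw = np.exp(Xw)
    out = -(s @ Xw)  # linear term over uncensored samples
    for t in np.unique(tm[s == 1]):
        # indices of uncensored observations tied at time t
        H = np.flatnonzero((tm == t) & (s == 1))
        d = len(H)
        risk_set = exp_Xw[tm >= t].sum()  # Breslow risk-set sum at time t
        tied_sum = exp_Xw[H].sum()        # contribution of the tied events
        for k in range(d):
            # the k-th tie discounts a fraction k / d of the tied mass
            out += np.log(risk_set - (k / d) * tied_sum)
    return out / n_samples
```

When every event time is unique, all tie groups have size one, the discount vanishes, and the function reduces to the Breslow objective used in the first part of the example.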

skglm/datafits/single_task.py

Lines changed: 99 additions & 40 deletions
```diff
@@ -1,7 +1,7 @@
 import numpy as np
 from numpy.linalg import norm
 from numba import njit
-from numba import float64
+from numba import float64, int64, bool_

 from skglm.datafits.base import BaseDatafit
 from skglm.utils.sparse_ops import spectral_norm
```
```diff
@@ -547,90 +547,100 @@ def intercept_update_self(self, y, Xw):


 class Cox(BaseDatafit):
-    r"""Cox datafit for survival analysis with Breslow estimate.
+    r"""Cox datafit for survival analysis.

-    The datafit reads [1]
-
-    .. math::
-
-        1 / n_"samples" \sum_(i=1)^(n_"samples") -s_i \langle x_i, w \rangle
-            + \log (\sum_(j | y_j \geq y_i) e^{\langle x_j, w \rangle})
-
-    where :math:`s_i` indicates the sample censorship and :math:`tm`
-    is the vector recording the time of event occurrences.
-
-    Defining the matrix :math:`B` with
-    :math:`B_{i,j} = 1` if :math:`tm_j \geq tm_i` and :math:`0` otherwise,
-    the datafit can be rewritten in the following compact form
-
-    .. math::
-
-        - 1 / n_"samples" \langle s, Xw \rangle
-            + 1 / n_"samples" \langle s, \log B e^{Xw} \rangle
+    Refer to :ref:`Mathematics behind Cox datafit <maths_cox_datafit>` for details.

+    Parameters
+    ----------
+    use_efron : bool, default=False
+        If ``True``, use the Efron estimate to handle tied observations.

     Attributes
     ----------
     B : array-like, shape (n_samples, n_samples)
         Matrix where every ``(i, j)`` entry (row, column) equals ``1``
-        if ``tm[j] >= tm[i]`` and `0` otherwise. This matrix is initialized
+        if ``tm[j] >= tm[i]`` and ``0`` otherwise. This matrix is initialized
         using the ``.initialize`` method.

-    References
-    ----------
-    .. [1] DY Lin. On the Breslow estimator.
-        Lifetime data analysis, 13:471–480, 2007.
+    H_indices : array-like, shape (n_samples,)
+        Indices of observations with the same occurrence times, stacked
+        horizontally as ``[group_1, group_2, ...]``. This array is initialized
+        by the ``.initialize`` method when ``use_efron=True``.
+
+    H_indptr : array-like, shape (np.unique(tm).shape[0] + 1,)
+        Array where two consecutive elements delimit a group of observations
+        having the same occurrence times.
     """

-    def __init__(self):
-        pass
+    def __init__(self, use_efron=False):
+        self.use_efron = use_efron

     def get_spec(self):
         return (
+            ('use_efron', bool_),
             ('B', float64[:, ::1]),
+            ('H_indptr', int64[:]),
+            ('H_indices', int64[:]),
         )

     def params_to_dict(self):
-        return dict()
+        return dict(use_efron=self.use_efron)

     def value(self, y, w, Xw):
         """Compute the value of the datafit."""
         tm, s = y
         n_samples = Xw.shape[0]

-        out = -(s @ Xw) + s @ np.log(self.B @ np.exp(Xw))
+        # compute the term inside the log
+        exp_Xw = np.exp(Xw)
+        B_exp_Xw = self.B @ exp_Xw
+        if self.use_efron:
+            B_exp_Xw -= self._A_dot_vec(exp_Xw)
+
+        out = -(s @ Xw) + s @ np.log(B_exp_Xw)
         return out / n_samples

     def raw_grad(self, y, Xw):
         r"""Compute the gradient of the datafit w.r.t. ``Xw``.

-        The raw gradient reads
-
-            (-s + exp_Xw * (B.T @ (s / (B @ exp_Xw)))) / n_samples
+        Refer to :ref:`Mathematics behind Cox datafit <maths_cox_datafit>`,
+        equation 4, for details.
         """
         tm, s = y
         n_samples = Xw.shape[0]

         exp_Xw = np.exp(Xw)
         B_exp_Xw = self.B @ exp_Xw
+        if self.use_efron:
+            B_exp_Xw -= self._A_dot_vec(exp_Xw)
+
+        s_over_B_exp_Xw = s / B_exp_Xw
+        out = -s + exp_Xw * (self.B.T @ s_over_B_exp_Xw)
+        if self.use_efron:
+            out -= exp_Xw * self._AT_dot_vec(s_over_B_exp_Xw)

-        out = -s + exp_Xw * (self.B.T @ (s / B_exp_Xw))
         return out / n_samples

     def raw_hessian(self, y, Xw):
         """Compute a diagonal upper bound of the datafit's Hessian w.r.t. ``Xw``.

-        The diagonal upper bound reads
-
-            exp_Xw * (B.T @ (s / B_exp_Xw)) / n_samples
+        Refer to :ref:`Mathematics behind Cox datafit <maths_cox_datafit>`,
+        equation 6, for details.
         """
         tm, s = y
         n_samples = Xw.shape[0]

         exp_Xw = np.exp(Xw)
         B_exp_Xw = self.B @ exp_Xw
+        if self.use_efron:
+            B_exp_Xw -= self._A_dot_vec(exp_Xw)
+
+        s_over_B_exp_Xw = s / B_exp_Xw
+        out = exp_Xw * (self.B.T @ s_over_B_exp_Xw)
+        if self.use_efron:
+            out -= exp_Xw * self._AT_dot_vec(s_over_B_exp_Xw)

-        out = exp_Xw * (self.B.T @ (s / B_exp_Xw))
         return out / n_samples

     def initialize(self, X, y):
```
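Since the commit moves the formulas out of the docstring and into the documentation, it is worth stating here what the Efron variant of ``value`` computes. The following is our reconstruction from the code (the ``B_exp_Xw - self._A_dot_vec(exp_Xw)`` term), not a quote from the referenced maths page; the tie-group notation :math:`H_l`, :math:`d_l` is ours:

```latex
% negative Efron partial log-likelihood, as implemented by Cox.value
% with use_efron=True; H_l is the group of uncensored samples tied at
% time t_l, d_l = |H_l|, and l_k is its k-th member in sorted order
\frac{1}{n_\text{samples}} \sum_{l} \sum_{k=0}^{d_l - 1} \Big[
    -\langle x_{l_k}, w \rangle
    + \log \Big( \sum_{j \,:\, tm_j \geq t_l} e^{\langle x_j, w \rangle}
    - \frac{k}{d_l} \sum_{j \in H_l} e^{\langle x_j, w \rangle} \Big)
\Big]
```

With all :math:`d_l = 1` the correction term vanishes and the Breslow objective from the removed docstring is recovered.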
```diff
@@ -640,9 +650,58 @@ def initialize(self, X, y):
         tm_as_col = tm.reshape((-1, 1))
         self.B = (tm >= tm_as_col).astype(X.dtype)

+        if self.use_efron:
+            H_indices = np.argsort(tm)
+            # filter out censored data
+            H_indices = H_indices[s[H_indices] != 0]
+            n_uncensored_samples = H_indices.shape[0]
+
+            # build H_indptr
+            H_indptr = [0]
+            count = 1
+            for i in range(1, n_uncensored_samples):
+                if tm[H_indices[i-1]] == tm[H_indices[i]]:
+                    count += 1
+                else:
+                    H_indptr.append(count + H_indptr[-1])
+                    count = 1
+            H_indptr.append(n_uncensored_samples)
+            H_indptr = np.asarray(H_indptr, dtype=np.int64)
+
+            # save in instance
+            self.H_indptr = H_indptr
+            self.H_indices = H_indices
+
     def initialize_sparse(self, X_data, X_indptr, X_indices, y):
         """Initialize the datafit attributes in the sparse dataset case."""
-        tm, s = y
+        # initialize_sparse and initialize have the same implementation
+        # small hack to avoid repeating code: pass in X_data, as only its dtype is used
+        self.initialize(X_data, y)

-        tm_as_col = tm.reshape((-1, 1))
-        self.B = (tm >= tm_as_col).astype(X_data.dtype)
+    def _A_dot_vec(self, vec):
+        out = np.zeros_like(vec)
+        n_H = self.H_indptr.shape[0] - 1
+
+        for idx in range(n_H):
+            current_H_idx = self.H_indices[self.H_indptr[idx]: self.H_indptr[idx+1]]
+            size_current_H = current_H_idx.shape[0]
+            frac_range = np.arange(size_current_H, dtype=vec.dtype) / size_current_H
+
+            sum_vec_H = np.sum(vec[current_H_idx])
+            out[current_H_idx] = sum_vec_H * frac_range
+
+        return out
+
+    def _AT_dot_vec(self, vec):
+        out = np.zeros_like(vec)
+        n_H = self.H_indptr.shape[0] - 1
+
+        for idx in range(n_H):
+            current_H_idx = self.H_indices[self.H_indptr[idx]: self.H_indptr[idx+1]]
+            size_current_H = current_H_idx.shape[0]
+            frac_range = np.arange(size_current_H, dtype=vec.dtype) / size_current_H
+
+            weighted_sum_vec_H = vec[current_H_idx] @ frac_range
+            out[current_H_idx] = weighted_sum_vec_H * np.ones(size_current_H)
+
+        return out
```
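To make the ``H_indptr`` / ``H_indices`` layout and the implicit matrix behind ``_A_dot_vec`` concrete, here is a small self-contained check, written for this page rather than taken from the commit. It builds the groups the same way ``initialize`` does and verifies the helper's output against an explicitly materialized dense matrix ``A``:

```python
import numpy as np

# toy data: occurrence times with ties; s: 1 = event observed, 0 = censored
tm = np.array([2., 1., 2., 3., 1., 2.])
s = np.array([1., 1., 1., 0., 1., 1.])

# group construction, as in Cox.initialize with use_efron=True:
# sort by time, drop censored samples, delimit equal-time groups
H_indices = np.argsort(tm)
H_indices = H_indices[s[H_indices] != 0]  # uncensored samples, sorted by time
H_indptr = np.array([0, 2, 5])            # two groups: time 1. (size 2), time 2. (size 3)

# materialize A so that A @ vec reproduces _A_dot_vec(vec): within a
# group of size d, the row of the k-th member holds k / d on every
# column of the group and 0 elsewhere
n = tm.shape[0]
A = np.zeros((n, n))
for g in range(len(H_indptr) - 1):
    grp = H_indices[H_indptr[g]:H_indptr[g + 1]]
    d = grp.shape[0]
    A[np.ix_(grp, grp)] = (np.arange(d) / d)[:, None]

# replicate _A_dot_vec and compare against the dense product
vec = np.random.default_rng(0).random(n)
out = np.zeros_like(vec)
for g in range(len(H_indptr) - 1):
    grp = H_indices[H_indptr[g]:H_indptr[g + 1]]
    d = grp.shape[0]
    out[grp] = vec[grp].sum() * (np.arange(d) / d)

assert np.allclose(out, A @ vec)
```

The CSR-like storage avoids ever forming ``A``: both helpers run in time linear in the number of uncensored samples, instead of the quadratic cost of a dense matrix-vector product.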

skglm/tests/test_datafits.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -116,7 +116,8 @@ def test_gamma():
     np.testing.assert_allclose(clf.coef_, gamma_results.params, rtol=1e-6)


-def test_cox():
+@pytest.mark.parametrize("use_efron", [True, False])
+def test_cox(use_efron):
     rng = np.random.RandomState(1265)
     n_samples, n_features = 10, 30

@@ -131,7 +132,7 @@ def test_cox():
     Xw = X @ w

     # check datafit
-    cox_df = compiled_clone(Cox(use_efron))
+    cox_df = compiled_clone(Cox(use_efron))

     cox_df.initialize(X, (tm, s))
     cox_df.value(y, w, Xw)
```
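A quick numerical sanity check that complements this test, ours rather than part of the commit: compare ``raw_grad`` against centered finite differences of ``value`` for the Efron datafit. It assumes ``Cox`` is exposed in ``skglm.datafits``, and uses a plain (uncompiled) ``Cox`` instance so that the unused ``w`` argument of ``value`` can be a dummy; with ``compiled_clone`` the numba signature would be stricter.

```python
import numpy as np
from skglm.datafits import Cox

rng = np.random.RandomState(0)
n_samples, n_features = 20, 5

tm = rng.choice(8, size=n_samples).astype(float)  # times, with ties
s = (rng.rand(n_samples) < 0.7).astype(float)     # censorship indicator
Xw = rng.randn(n_samples, n_features) @ rng.randn(n_features)

datafit = Cox(use_efron=True)
# only X's dtype is used by initialize, so an empty array suffices
datafit.initialize(np.empty((n_samples, n_features)), (tm, s))

eps = 1e-6
fd_grad = np.zeros(n_samples)
for i in range(n_samples):
    e_i = np.zeros(n_samples)
    e_i[i] = eps
    # Cox.value only uses Xw, so w can be a dummy
    fd_grad[i] = (
        datafit.value((tm, s), None, Xw + e_i)
        - datafit.value((tm, s), None, Xw - e_i)
    ) / (2 * eps)

np.testing.assert_allclose(datafit.raw_grad((tm, s), Xw), fd_grad,
                           rtol=1e-5, atol=1e-8)
```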
