
Commit e7048b6

MNT - compatibility of Cox datafit with L2 regularization (#167)
1 parent 0e5c938 commit e7048b6
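
This change makes the Cox datafit usable with the L2 penalty through the LBFGS solver. As a rough illustration of the combination it enables (a sketch mirroring the new test further down; the data is synthetic and the call pattern is copied from that test, not additional API), an L2-regularized Cox model can be fit like this:

# Sketch only: fit an L2-regularized Cox model with LBFGS, mirroring the new test.
from skglm.datafits import Cox
from skglm.penalties import L2
from skglm.solvers import LBFGS
from skglm.utils.data import make_dummy_survival_data
from skglm.utils.jit_compilation import compiled_clone

use_efron = False  # Breslow handling of ties; the test also exercises Efron

# synthetic survival data: times tm, event indicators s, design matrix X
tm, s, X = make_dummy_survival_data(
    100, 50, normalize=True, with_ties=use_efron, random_state=0)

datafit = compiled_clone(Cox(use_efron))
penalty = compiled_clone(L2(10.))

datafit.initialize(X, (tm, s))
w, *_ = LBFGS().solve(X, (tm, s), datafit, penalty)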

File tree: 2 files changed, +56 -10 lines

  skglm/datafits/single_task.py
  skglm/tests/test_lbfgs_solver.py


skglm/datafits/single_task.py

Lines changed: 4 additions & 0 deletions
@@ -646,6 +646,10 @@ def raw_hessian(self, y, Xw):
 
         return out / n_samples
 
+    def gradient(self, X, y, Xw):
+        """Compute gradient of the datafit."""
+        return X.T @ self.raw_grad(y, Xw)
+
     def initialize(self, X, y):
         """Initialize the datafit attributes."""
         tm, s = y
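
The new gradient method maps raw_grad, the gradient of the datafit with respect to Xw, back to coefficient space via X.T, which is the quantity a quasi-Newton solver needs. Below is a minimal sketch of how such a solver can consume a datafit exposing value(y, w, Xw) and gradient(X, y, Xw) as above; the scipy.optimize call and the alpha / 2 * ||w||^2 form of the L2 term are illustrative assumptions, not skglm's LBFGS implementation.

# Minimal sketch (not skglm's LBFGS solver): run L-BFGS on a smooth datafit
# plus an L2 term, using the datafit's value() and the gradient() added above.
import numpy as np
from scipy.optimize import minimize


def lbfgs_fit(X, y, datafit, alpha):
    def objective(w):
        Xw = X @ w
        # L2 term assumed to be alpha / 2 * ||w||^2, consistent with the gradient below
        return datafit.value(y, w, Xw) + 0.5 * alpha * np.dot(w, w)

    def gradient(w):
        Xw = X @ w
        # datafit.gradient returns X.T @ raw_grad(y, Xw), i.e. the gradient in w space
        return datafit.gradient(X, y, Xw) + alpha * w

    w0 = np.zeros(X.shape[1])
    res = minimize(objective, w0, jac=gradient, method="L-BFGS-B")
    return res.x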

skglm/tests/test_lbfgs_solver.py

Lines changed: 52 additions & 10 deletions
@@ -1,18 +1,20 @@
+import pytest
 import numpy as np
+import pandas as pd
 
-from skglm.solvers import LBFGS
 from skglm.penalties import L2
-from skglm.datafits import Logistic
+from skglm.solvers import LBFGS
+from skglm.datafits import Logistic, Cox
 
 from sklearn.linear_model import LogisticRegression
 
-from skglm.utils.data import make_correlated_data
 from skglm.utils.jit_compilation import compiled_clone
+from skglm.utils.data import make_correlated_data, make_dummy_survival_data
 
 
 def test_lbfgs_L2_logreg():
     reg = 1.
-    n_samples, n_features = 50, 10
+    n_samples, n_features = 100, 50
 
     X, y, _ = make_correlated_data(
         n_samples, n_features, random_state=0)
@@ -21,19 +23,59 @@ def test_lbfgs_L2_logreg():
     # fit L-BFGS
     datafit = compiled_clone(Logistic())
     penalty = compiled_clone(L2(reg))
-    w, *_ = LBFGS().solve(X, y, datafit, penalty)
+    w, *_ = LBFGS(tol=1e-12).solve(X, y, datafit, penalty)
 
     # fit scikit learn
     estimator = LogisticRegression(
         penalty='l2',
         C=1 / (n_samples * reg),
-        fit_intercept=False
-    )
-    estimator.fit(X, y)
+        fit_intercept=False,
+        tol=1e-12,
+    ).fit(X, y)
+
+    np.testing.assert_allclose(w, estimator.coef_.flatten())
+
+
+@pytest.mark.parametrize("use_efron", [True, False])
+def test_L2_Cox(use_efron):
+    try:
+        from lifelines import CoxPHFitter
+    except ModuleNotFoundError:
+        pytest.xfail(
+            "Testing L2 Cox Estimator requires `lifelines` packages\n"
+            "Run `pip install lifelines`"
+        )
+
+    alpha = 10.
+    n_samples, n_features = 100, 50
 
-    np.testing.assert_allclose(
-        w, estimator.coef_.flatten(), atol=1e-4
+    tm, s, X = make_dummy_survival_data(
+        n_samples, n_features, normalize=True,
+        with_ties=use_efron, random_state=0)
+
+    datafit = compiled_clone(Cox(use_efron))
+    penalty = compiled_clone(L2(alpha))
+
+    datafit.initialize(X, (tm, s))
+    w, *_ = LBFGS().solve(X, (tm, s), datafit, penalty)
+
+    # fit lifeline estimator
+    stacked_tm_s_X = np.hstack((tm[:, None], s[:, None], X))
+    df = pd.DataFrame(stacked_tm_s_X)
+
+    estimator = CoxPHFitter(penalizer=alpha, l1_ratio=0.).fit(
+        df, duration_col=0, event_col=1
     )
+    w_ll = estimator.params_.values
+
+    p_obj_skglm = datafit.value((tm, s), w, X @ w) + penalty.value(w)
+    p_obj_ll = datafit.value((tm, s), w_ll, X @ w_ll) + penalty.value(w_ll)
+
+    # despite increasing tol in lifelines, solutions are quite far apart
+    # suspecting lifelines https://github.com/CamDavidsonPilon/lifelines/pull/1534
+    # as our solution gives the lowest objective value
+    np.testing.assert_allclose(w, w_ll, rtol=1e-1)
+    np.testing.assert_allclose(p_obj_skglm, p_obj_ll, rtol=1e-6)
 
 
 if __name__ == "__main__":
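
For reference, in the no-ties (Breslow) case the objective both fits above are expected to minimize is, assuming skglm's 1/n normalization of the Cox datafit and an alpha/2 convention for the L2 penalty (tm are the survival times, s the event indicators):

\min_w \; -\frac{1}{n} \sum_{i:\, s_i = 1} \Big( x_i^\top w - \log \sum_{j:\, tm_j \ge tm_i} e^{x_j^\top w} \Big) + \frac{\alpha}{2} \lVert w \rVert_2^2

The test compares the skglm and lifelines solutions coefficient-wise only loosely (rtol=1e-1) and relies on this objective value for the tight check (rtol=1e-6), since the objective is a more robust yardstick when the two solvers stop at slightly different points.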
