Commit 18e6456

MNT - Switch response y convention in Cox estimation (#175)
1 parent d80c9aa commit 18e6456

File tree

6 files changed: +60 −54 lines changed

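In short, this commit replaces the `(tm, s)` tuple convention for the Cox response (and the `tm, s, X` return order of the data helper) with the `X, y` ordering used by the rest of skglm, where `y` is a two-column array holding times and censoring indicators. A minimal before/after sketch of the new calling convention (names and shapes follow the diffs below):

```python
import numpy as np
from skglm.utils.data import make_dummy_survival_data

# before this commit, the helper returned times, censorship and X separately:
#   tm, s, X = make_dummy_survival_data(500, 100, random_state=0)

# after this commit, X comes first and the response is a (n_samples, 2) array
X, y = make_dummy_survival_data(500, 100, normalize=True, random_state=0)
tm, s = y[:, 0], y[:, 1]  # occurrence times and censoring indicators
```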

examples/plot_survival_analysis.py

Lines changed: 20 additions & 17 deletions
```diff
@@ -18,18 +18,20 @@
 from skglm.utils.data import make_dummy_survival_data
 
 n_samples, n_features = 500, 100
-tm, s, X = make_dummy_survival_data(
+X, y = make_dummy_survival_data(
     n_samples, n_features,
     normalize=True,
     random_state=0
 )
 
+tm, s = y[:, 0], y[:, 1]
+
 # %%
 # The synthetic data has the following properties:
 #
+# * ``X`` is the matrix of predictors, generated using standard normal distribution with Toeplitz covariance.
 # * ``tm`` is the vector of occurrence times which follows a Weibull(1) distribution
 # * ``s`` indicates the observations censorship and follows a Bernoulli(0.5) distribution
-# * ``X`` is the matrix of predictors, generated using standard normal distribution with Toeplitz covariance.
 #
 # Let's inspect the data quickly:
 import matplotlib.pyplot as plt
@@ -70,13 +72,13 @@
 datafit = compiled_clone(Cox())
 penalty = compiled_clone(L1(alpha))
 
-datafit.initialize(X, (tm, s))
+datafit.initialize(X, y)
 
 # init solver
 solver = ProxNewton(fit_intercept=False, max_iter=50)
 
 # solve the problem
-w_sk = solver.solve(X, (tm, s), datafit, penalty)[0]
+w_sk = solver.solve(X, y, datafit, penalty)[0]
 
 # %%
 # For this data a regularization value a relatively sparse solution is found:
@@ -93,8 +95,8 @@
 from lifelines import CoxPHFitter
 
 # format data
-stacked_tm_s_X = np.hstack((tm[:, None], s[:, None], X))
-df = pd.DataFrame(stacked_tm_s_X)
+stacked_y_X = np.hstack((y, X))
+df = pd.DataFrame(stacked_y_X)
 
 # fit lifelines estimator
 lifelines_estimator = CoxPHFitter(penalizer=alpha, l1_ratio=1.).fit(
@@ -106,8 +108,8 @@
 
 # %%
 # Check that both solvers find solutions having the same objective value:
-obj_sk = datafit.value((tm, s), w_sk, X @ w_sk) + penalty.value(w_sk)
-obj_ll = datafit.value((tm, s), w_ll, X @ w_ll) + penalty.value(w_ll)
+obj_sk = datafit.value(y, w_sk, X @ w_sk) + penalty.value(w_sk)
+obj_ll = datafit.value(y, w_ll, X @ w_ll) + penalty.value(w_ll)
 
 print(f"Objective skglm: {obj_sk:.6f}")
 print(f"Objective lifelines: {obj_ll:.6f}")
@@ -141,11 +143,11 @@
     solver.max_iter = n_iter
 
     start = time.perf_counter()
-    w = solver.solve(X, (tm, s), datafit, penalty)[0]
+    w = solver.solve(X, y, datafit, penalty)[0]
     end = time.perf_counter()
 
     records["skglm"]["objs"].append(
-        datafit.value((tm, s), w, X @ w) + penalty.value(w)
+        datafit.value(y, w, X @ w) + penalty.value(w)
     )
     records["skglm"]["times"].append(end - start)
 
@@ -164,7 +166,7 @@
     w = lifelines_estimator.params_.values
 
     records["lifelines"]["objs"].append(
-        datafit.value((tm, s), w, X @ w) + penalty.value(w)
+        datafit.value(y, w, X @ w) + penalty.value(w)
     )
     records["lifelines"]["times"].append(end - start)
 
@@ -212,12 +214,13 @@
 #
 # Let's start by generating data with tied observations. This can be achieved
 # by passing in a ``with_ties=True`` to ``make_dummy_survival_data`` function.
-tm, s, X = make_dummy_survival_data(
+X, y = make_dummy_survival_data(
     n_samples, n_features,
     normalize=True,
     with_ties=True,
     random_state=0
 )
+tm, s = y[:, 0], y[:, 1]
 
 # check the data has tied observations
 print(f"Number of unique times {len(np.unique(tm))} out of {n_samples}")
@@ -228,11 +231,11 @@
 
 # ensure using Efron estimate
 datafit = compiled_clone(Cox(use_efron=True))
-datafit.initialize(X, (tm, s))
+datafit.initialize(X, y)
 
 # solve the problem
 solver = ProxNewton(fit_intercept=False, max_iter=50)
-w_sk = solver.solve(X, (tm, s), datafit, penalty)[0]
+w_sk = solver.solve(X, y, datafit, penalty)[0]
 
 # %%
 # Again a relatively sparse solution is found:
@@ -257,8 +260,8 @@
 w_ll = lifelines_estimator.params_.values
 
 # Check that both solvers find solutions with the same objective value
-obj_sk = datafit.value((tm, s), w_sk, X @ w_sk) + penalty.value(w_sk)
-obj_ll = datafit.value((tm, s), w_ll, X @ w_ll) + penalty.value(w_ll)
+obj_sk = datafit.value(y, w_sk, X @ w_sk) + penalty.value(w_sk)
+obj_ll = datafit.value(y, w_ll, X @ w_ll) + penalty.value(w_ll)
 
 print(f"Objective skglm: {obj_sk:.6f}")
 print(f"Objective lifelines: {obj_ll:.6f}")
@@ -272,7 +275,7 @@
 
 # time skglm
 start = time.perf_counter()
-solver.solve(X, (tm, s), datafit, penalty)[0]
+solver.solve(X, y, datafit, penalty)[0]
 end = time.perf_counter()
 
 total_time_skglm = end - start
```
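Read end to end, the example now threads `y` through data generation, datafit initialization, solving, and objective evaluation. A condensed sketch of that pipeline, assuming the `skglm.datafits`/`skglm.penalties`/`skglm.solvers` import paths (only the `compiled_clone` and `make_dummy_survival_data` paths are confirmed by this diff) and a hypothetical `alpha`:

```python
from skglm.datafits import Cox          # assumed import path
from skglm.penalties import L1          # assumed import path
from skglm.solvers import ProxNewton    # assumed import path
from skglm.utils.data import make_dummy_survival_data
from skglm.utils.jit_compilation import compiled_clone

X, y = make_dummy_survival_data(500, 100, normalize=True, random_state=0)

alpha = 1e-2  # hypothetical value; the example derives alpha from the data
datafit = compiled_clone(Cox())
penalty = compiled_clone(L1(alpha))
datafit.initialize(X, y)

solver = ProxNewton(fit_intercept=False, max_iter=50)
w_sk = solver.solve(X, y, datafit, penalty)[0]
obj_sk = datafit.value(y, w_sk, X @ w_sk) + penalty.value(w_sk)
```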

skglm/datafits/single_task.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -607,7 +607,7 @@ def params_to_dict(self):
 
     def value(self, y, w, Xw):
         """Compute the value of the datafit."""
-        tm, s = y
+        tm, s = y[:, 0], y[:, 1]  # noqa
         n_samples = Xw.shape[0]
 
         # compute inside log term
@@ -625,7 +625,7 @@ def raw_grad(self, y, Xw):
         Refer to :ref:`Mathematics behind Cox datafit <maths_cox_datafit>`
         equation 4 for details.
         """
-        tm, s = y
+        tm, s = y[:, 0], y[:, 1]  # noqa
         n_samples = Xw.shape[0]
 
         exp_Xw = np.exp(Xw)
@@ -646,7 +646,7 @@ def raw_hessian(self, y, Xw):
         Refer to :ref:`Mathematics behind Cox datafit <maths_cox_datafit>`
         equation 6 for details.
         """
-        tm, s = y
+        tm, s = y[:, 0], y[:, 1]  # noqa
         n_samples = Xw.shape[0]
 
         exp_Xw = np.exp(Xw)
@@ -678,7 +678,7 @@ def gradient_sparse(self, X_data, X_indptr, X_indices, y, Xw):
 
     def initialize(self, X, y):
         """Initialize the datafit attributes."""
-        tm, s = y
+        tm, s = y[:, 0], y[:, 1]  # noqa
 
         self.T_indices = np.argsort(tm)
         self.T_indptr = self._get_indptr(tm, self.T_indices)
```
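Two details are worth noting in these hunks. The `# noqa` comments presumably silence an unused-variable lint warning, since some methods use only one of the two columns. More importantly, plain tuple unpacking no longer applies: unpacking a `(n_samples, 2)` array iterates over rows, not columns. A small sketch of the distinction:

```python
import numpy as np

# y stacks times and censoring indicators column-wise
y = np.column_stack((np.array([2., 5., 3.]),    # tm: event times
                     np.array([1., 0., 1.])))   # s: censoring indicators

# tm, s = y               # ValueError: unpacking iterates over the 3 rows
tm, s = y[:, 0], y[:, 1]  # the convention adopted by the Cox datafit
```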

skglm/tests/test_datafits.py

Lines changed: 4 additions & 5 deletions
```diff
@@ -11,6 +11,7 @@
 from skglm import GeneralizedLinearEstimator
 from skglm.utils.data import make_correlated_data
 from skglm.utils.jit_compilation import compiled_clone
+from skglm.utils.data import make_dummy_survival_data
 
 
 @pytest.mark.parametrize('fit_intercept', [False, True])
@@ -122,10 +123,8 @@ def test_cox(use_efron):
     n_samples, n_features = 10, 30
 
     # generate data
-    X = rng.randn(n_samples, n_features)
-    tm = rng.choice(n_samples*n_features, size=n_samples, replace=True).astype(float)
-    s = rng.choice(2, size=n_samples).astype(float)
-    y = (tm, s)
+    X, y = make_dummy_survival_data(n_samples, n_features, normalize=True,
+                                    with_ties=use_efron, random_state=0)
 
     # generate dummy w, Xw
     w = rng.randn(n_features)
@@ -134,7 +133,7 @@
     # check datafit
     cox_df = compiled_clone(Cox(use_efron))
 
-    cox_df.initialize(X, (tm, s))
+    cox_df.initialize(X, y)
     cox_df.value(y, w, Xw)
 
     # perform test 10 times to consider truncation errors
```
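With the helper in place, the test no longer hand-rolls survival data via `rng.choice`. A minimal sketch of the updated setup under the pytest parametrization already used in this file (the shape assertion is illustrative, not part of the original test):

```python
import pytest
from skglm.utils.data import make_dummy_survival_data


@pytest.mark.parametrize("use_efron", [True, False])
def test_cox_response_shape(use_efron):
    # ties only matter for the Efron estimate, hence with_ties=use_efron
    X, y = make_dummy_survival_data(10, 30, normalize=True,
                                    with_ties=use_efron, random_state=0)
    assert X.shape == (10, 30) and y.shape == (10, 2)
```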

skglm/tests/test_estimators.py

Lines changed: 16 additions & 14 deletions
```diff
@@ -184,9 +184,10 @@ def test_CoxEstimator(use_efron, use_float_32):
     n_samples, n_features = 100, 30
     random_state = 1265
 
-    tm, s, X = make_dummy_survival_data(n_samples, n_features, normalize=True,
-                                        with_ties=use_efron, use_float_32=use_float_32,
-                                        random_state=random_state)
+    X, y = make_dummy_survival_data(n_samples, n_features, normalize=True,
+                                    with_ties=use_efron, use_float_32=use_float_32,
+                                    random_state=random_state)
+    tm, s = y[:, 0], y[:, 1]
 
     # compute alpha_max
     B = (tm >= tm[:, None]).astype(X.dtype)
@@ -199,17 +200,17 @@
     datafit = compiled_clone(Cox(use_efron))
     penalty = compiled_clone(L1(alpha))
 
-    datafit.initialize(X, (tm, s))
+    datafit.initialize(X, y)
 
     w, *_ = ProxNewton(
         fit_intercept=False, tol=1e-6, max_iter=50
     ).solve(
-        X, (tm, s), datafit, penalty
+        X, y, datafit, penalty
     )
 
     # fit lifeline estimator
-    stacked_tm_s_X = np.hstack((tm[:, None], s[:, None], X))
-    df = pd.DataFrame(stacked_tm_s_X)
+    stacked_y_X = np.hstack((y, X))
+    df = pd.DataFrame(stacked_y_X)
 
     estimator = CoxPHFitter(penalizer=alpha, l1_ratio=1.)
     estimator.fit(
@@ -218,8 +219,8 @@
     )
     w_ll = estimator.params_.values.astype(X.dtype)
 
-    p_obj_skglm = datafit.value((tm, s), w, X @ w) + penalty.value(w)
-    p_obj_ll = datafit.value((tm, s), w_ll, X @ w_ll) + penalty.value(w_ll)
+    p_obj_skglm = datafit.value(y, w, X @ w) + penalty.value(w)
+    p_obj_ll = datafit.value(y, w_ll, X @ w_ll) + penalty.value(w_ll)
 
     # though norm of solution might differ
     np.testing.assert_allclose(p_obj_skglm, p_obj_ll, atol=1e-6)
@@ -232,9 +233,10 @@ def test_CoxEstimator_sparse(use_efron, use_float_32):
     n_samples, n_features = 100, 30
     X_density, random_state = 0.5, 1265
 
-    tm, s, X = make_dummy_survival_data(n_samples, n_features, X_density=X_density,
-                                        use_float_32=use_float_32, with_ties=use_efron,
-                                        random_state=random_state)
+    X, y = make_dummy_survival_data(n_samples, n_features, X_density=X_density,
+                                    use_float_32=use_float_32, with_ties=use_efron,
+                                    random_state=random_state)
+    tm, s = y[:, 0], y[:, 1]
 
     # compute alpha_max
     B = (tm >= tm[:, None]).astype(X.dtype)
@@ -247,12 +249,12 @@
     datafit = compiled_clone(Cox(use_efron))
     penalty = compiled_clone(L1(alpha))
 
-    datafit.initialize_sparse(X.data, X.indptr, X.indices, (tm, s))
+    datafit.initialize_sparse(X.data, X.indptr, X.indices, y)
 
     *_, stop_crit = ProxNewton(
         fit_intercept=False, tol=1e-6, max_iter=50
     ).solve(
-        X, (tm, s), datafit, penalty
+        X, y, datafit, penalty
     )
 
     np.testing.assert_allclose(stop_crit, 0., atol=1e-6)
```
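In the sparse test, only `X` is sparse; `y` stays a dense two-column array, so `initialize_sparse` receives the raw CSC buffers plus `y` as-is. A sketch of that path, relying on the docstring (see the `skglm/utils/data.py` diff below) stating that `X_density < 1` yields a CSC matrix:

```python
from skglm.datafits import Cox  # assumed import path
from skglm.utils.data import make_dummy_survival_data
from skglm.utils.jit_compilation import compiled_clone

# X is returned as a scipy CSC matrix when X_density < 1; y remains dense
X, y = make_dummy_survival_data(100, 30, X_density=0.5, random_state=1265)

datafit = compiled_clone(Cox(use_efron=False))
# sparse initialization consumes the raw CSC buffers, plus the dense y
datafit.initialize_sparse(X.data, X.indptr, X.indices, y)
```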

skglm/tests/test_lbfgs_solver.py

Lines changed: 7 additions & 7 deletions
```diff
@@ -52,27 +52,27 @@ def test_L2_Cox(use_efron):
     alpha = 10.
     n_samples, n_features = 100, 50
 
-    tm, s, X = make_dummy_survival_data(
+    X, y = make_dummy_survival_data(
         n_samples, n_features, normalize=True,
         with_ties=use_efron, random_state=0)
 
     datafit = compiled_clone(Cox(use_efron))
     penalty = compiled_clone(L2(alpha))
 
-    datafit.initialize(X, (tm, s))
-    w, *_ = LBFGS().solve(X, (tm, s), datafit, penalty)
+    datafit.initialize(X, y)
+    w, *_ = LBFGS().solve(X, y, datafit, penalty)
 
     # fit lifeline estimator
-    stacked_tm_s_X = np.hstack((tm[:, None], s[:, None], X))
-    df = pd.DataFrame(stacked_tm_s_X)
+    stacked_y_X = np.hstack((y, X))
+    df = pd.DataFrame(stacked_y_X)
 
     estimator = CoxPHFitter(penalizer=alpha, l1_ratio=0.).fit(
         df, duration_col=0, event_col=1
     )
     w_ll = estimator.params_.values
 
-    p_obj_skglm = datafit.value((tm, s), w, X @ w) + penalty.value(w)
-    p_obj_ll = datafit.value((tm, s), w_ll, X @ w_ll) + penalty.value(w_ll)
+    p_obj_skglm = datafit.value(y, w, X @ w) + penalty.value(w)
+    p_obj_ll = datafit.value(y, w_ll, X @ w_ll) + penalty.value(w_ll)
 
     # despite increasing tol in lifelines, solutions are quite far apart
     # suspecting lifelines https://github.com/CamDavidsonPilon/lifelines/pull/1534
```
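A side benefit visible here: building the lifelines DataFrame no longer needs per-column reshaping, since `y` already stacks `tm` and `s` in the first two columns. A self-contained sketch of the comparison setup, using lifelines' documented `duration_col`/`event_col` arguments:

```python
import numpy as np
import pandas as pd
from lifelines import CoxPHFitter
from skglm.utils.data import make_dummy_survival_data

X, y = make_dummy_survival_data(100, 50, normalize=True, random_state=0)

# columns 0 and 1 of the frame are tm and s; the rest are the features
df = pd.DataFrame(np.hstack((y, X)))
estimator = CoxPHFitter(penalizer=10., l1_ratio=0.).fit(
    df, duration_col=0, event_col=1
)
w_ll = estimator.params_.values
```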

skglm/utils/data.py

Lines changed: 9 additions & 7 deletions
```diff
@@ -160,14 +160,13 @@ def make_dummy_survival_data(n_samples, n_features, normalize=False, X_density=1
 
     Returns
     -------
-    tm : array-like, shape (n_samples,)
-        The vector of recording the time of event occurrences
-
-    s : array-like, shape (n_samples,)
-        The vector of indicating samples censorship
-
     X : array-like, shape (n_samples, n_features)
         The matrix of predictors. If ``density < 1``, a CSC sparse matrix is returned.
+
+    y : array-like, shape (n_samples, 2)
+        Two-column array where the first column ``tm`` is the vector
+        recording the time of event occurrences, and the second column ``s``
+        is the vector of sample censoring.
     """
     rng = np.random.RandomState(random_state)
     dtype = np.float64 if use_float_32 is False else np.float32
@@ -189,7 +188,10 @@
     if normalize and X_density == 1.:
         X = StandardScaler().fit_transform(X)
 
-    return tm, s, X
+    # stack (tm, s)
+    y = np.column_stack((tm, s)).astype(dtype, order='F')
+
+    return X, y
```
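Finally, a quick sketch of what the new return value looks like. The Fortran order makes each column of `y` contiguous in memory, which is convenient when the numba-compiled datafit slices out `y[:, 0]` and `y[:, 1]` (the contiguity rationale is an inference, not stated in the commit):

```python
import numpy as np

tm = np.array([2., 5., 3.])  # event times
s = np.array([1., 0., 1.])   # censoring indicators

y = np.column_stack((tm, s)).astype(np.float64, order='F')

print(y.shape)                   # (3, 2)
print(y.flags['F_CONTIGUOUS'])   # True: columns are contiguous
print(y[:, 0], y[:, 1])          # recovers tm and s
```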
