Merge pull request #240 from CARRIER-project/226-last-tweaks-documentation

dsmits · web-flow · commit 39cc84a6d80f · 2025-04-09T13:35:21.000+02:00
226 last tweaks documentation
diff --git a/demo.ipynb b/demo.ipynb
@@ -286,9 +286,6 @@
     "## Running cox proportional hazard analysis\n",
     "If you want to fit a model on the entire dataset you can run `VerticoxClient.fit`\n",
     "\n",
-    "### Docstring\n",
-    "Run cox proportional hazard analysis on the entire dataset.\n",
-    "\n",
     "Args:\n",
     "\n",
     "- __feature_columns:__ a list of column names that you want to use as features\n",
diff --git a/docs/development.md b/docs/development.md
@@ -57,3 +57,12 @@ analysis.
 ```
 
 ## Further development
+If this code is developed further, the following features would be valuable additions:
+
+### Sample size threshold
+To further prevent data leakage, a minimum sample size should be enforced: the analysis should
+refuse to run when the number of samples falls below that threshold.
+
+### Upgrade to latest vantage6 version
+The version of vantage6 currently in use is 4.7.1. To make the code compatible with the latest
+version, it should be updated to comply with the new client API.
diff --git a/docs/images/vantage6.png b/docs/images/vantage6.png
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -13,17 +13,28 @@ nav:
 
 theme:
   name: material
+  logo: images/vantage6.png
+  palette:
+    primary: white
 plugins:
   - mkdocstrings:
       handlers:
         python:
           paths: [python]
+      docstring_style: google
   - search
 
 markdown_extensions:
   - pymdownx.arithmatex:
       generic: true
   - pymdownx.blocks.caption
+  - pymdownx.highlight:
+      anchor_linenums: true
+      line_spans: __span
+      pygments_lang_class: true
+  - pymdownx.inlinehilite
+  - pymdownx.snippets
+  - pymdownx.superfences
 
 extra_javascript:
   - javascripts/mathjax.js
diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -56,4 +56,6 @@ build-backend = "setuptools.build_meta"
 pythonpath = "."
 addopts = "-v --log-level=INFO --log-file=test.log"
 
+[tool.ruff]
+indent-width = 4
 
diff --git a/python/tests/test_verticox_v6.py b/python/tests/test_verticox_v6.py
@@ -1,10 +1,8 @@
 #! /usr/bin/env python3
 
 import json
-import numpy as np
 import vantage6.client as v6client
 from clize import run
-from verticox.client import FitResult
 
 from test_constants import OUTCOME_TIME_COLUMN, OUTCOME, PRECISION
 from verticox.client import VerticoxClient
@@ -17,7 +15,7 @@
 
 
 def run_verticox_v6(host, port, user, password, *, private_key=None, image: str=IMAGE,
-                    method="fit"):
+                    method="fit", precision: float = PRECISION):
 
     client = v6client.Client(host, port, log_level="warning")
 
@@ -63,31 +61,24 @@ def run_verticox_v6(host, port, user, password, *, private_key=None, image: str=
                 OUTCOME,
                 feature_nodes=feature_orgs,
                 outcome_node=central_node,
-                precision=PRECISION,
+                precision=precision,
                 database=DATABASE,
             )
         case "crossval":
             task = verticox_client.cross_validate(
                 feature_columns,
                 OUTCOME_TIME_COLUMN,
                 OUTCOME,
-                feature_nodes=datanodes,
+                feature_nodes=feature_orgs,
                 outcome_node=central_node,
-                precision=PRECISION,
+                precision=precision,
                 database=DATABASE,
             )
 
-    results = task.get_results(timeout=TIMEOUT)
+    results = task.get_results()
 
     print("Results: ", results)
 
-    match results:
-        case FitResult(coefs, baseline_hazard):
-            for key, value in coefs.items():
-                np.testing.assert_almost_equal(value, TARGET_COEFS[key], decimal=4)
-
-    print("Test passed")
-
 
 if __name__ == "__main__":
     run(run_verticox_v6)
diff --git a/python/verticox/client.py b/python/verticox/client.py
@@ -42,14 +42,29 @@ def __init__(
 
         self.collaboration_id = collaborations[0]["id"]
 
-    def get_active_node_organizations(self):
+    def get_active_node_organizations(self) -> List[int]:
+        """
+        Get the organization ids of the active nodes in the collaboration.
+
+        Returns: a list of organization ids
+
+        """
         nodes = self._v6client.node.list(is_online=True)
 
         # TODO: Add pagination support
         nodes = nodes["data"]
         return [n["organization"]["id"] for n in nodes]
 
     def get_column_names(self, **kwargs):
+        """
+        Get the column names of the dataset at all active nodes.
+
+        Args:
+            **kwargs:
+
+        Returns:
+
+        """
         active_nodes = self.get_active_node_organizations()
         self._logger.debug(f"There are currently {len(active_nodes)} active nodes")
 
@@ -84,7 +99,7 @@ def fit(
             database: If the nodes have multiple datasources, indicate the label of the datasource
             you would like to use. Otherwise the default will be used.
 
-        Returns:
+        Returns: a `Task` object containing info about the task.
 
         """
         input_params = {
@@ -107,14 +122,36 @@ def cross_validate(self,
                        feature_nodes,
                        outcome_node,
                        precision=_DEFAULT_PRECISION,
+                       n_splits = 10,
                        database="default"):
+        """
+        Run cox proportional hazard analysis on the entire dataset using cross-validation. Uses 10
+        fold by default.
+
+        Args:
+            feature_columns: a list of column names that you want to use as features
+            outcome_time_column: the column name of the outcome time
+            right_censor_column: the column name of the binary value that indicates if an event
+            happened.
+            feature_nodes: A list of node ids from the datasources that contain the feature columns
+            outcome_node: The node id of the datasource that contains the outcome
+            precision: precision of the verticox algorithm. The smaller the number, the more
+            precise the result. Smaller precision will take longer to compute though. The default is
+            1e-5
+            n_splits: The number of folds to use for cross-validation. Default is 10.
+            database: If the nodes have multiple datasources, indicate the label of the datasource
+            you would like to use. Otherwise the default will be used.
+
+        Returns: a `Task` object containing info about the task.
+        """
         input_params = {
             "feature_columns": feature_columns,
             "event_times_column": outcome_time_column,
             "event_happened_column": right_censor_column,
             "datanode_ids": feature_nodes,
             "central_node_id": outcome_node,
             "convergence_precision": precision,
+            "n_splits": n_splits,
         }
 
         return self._run_task(
@@ -166,6 +203,10 @@ def _run_task(
 
 @dataclass
 class FitResult:
+    """
+    FitResult contains the result of a fit task. It contains the coefficients and the baseline
+    hazard function.
+    """
     coefs: Dict[str, float]
     baseline_hazard: HazardFunction
 
@@ -191,6 +232,10 @@ def plot(self):
 
 @dataclass
 class CrossValResult:
+    """
+    CrossValResult contains the result of a cross-validation task. It contains the c-indices,
+    coefficients and baseline hazard functions for each fold.
+    """
     c_indices: List[float]
     coefs: List[Dict[str, float]]
     baseline_hazards: List[HazardFunction]
@@ -217,20 +262,27 @@ def plot(self):
 
 
 class Task:
-
+    """
+    Task is a wrapper around the vantage6 task object.
+    """
     def __init__(self, client: Client, task_data):
         self._raw_data = task_data
         self.client = client
         self.task_id = task_data["id"]
 
-    def get_results(self, timeout=_TIMEOUT):
+    def get_results(self) -> PartialResult:
+        """
+        Get the results of the task. This will block until the task is finished.
+
+        Returns:
+
+        """
         results = self.client.wait_for_results(self.task_id)
-        print(f"Results: {results}")
         return self._parse_results(results["data"])
 
 
     @staticmethod
-    def _parse_results(results):
+    def _parse_results(results) -> FitResult| CrossValResult:
         return results
 
 
diff --git a/python/verticox/vantage6.py b/python/verticox/vantage6.py