Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions docs/user-guide/analyses/survival.rst
Original file line number Diff line number Diff line change
Expand Up @@ -188,3 +188,96 @@ or `None` if computing the survival was impossible (see :func:`~gpsea.analysis.t
The `Survival` reports the number of days until attaining the endpoint,
here defined as end stage renal disease (`is_censored=False`),
or until the individual dropped out of the analysis (`is_censored=True`).


Troubleshooting
===============

Sometimes the survival analysis fails and an :class:`~gpsea.analysis.AnalysisException` is raised.
For instance, the current Logrank test implementation reports a p value of `NaN`
if the survival is the same for all individuals.
This is unlikely an expected outcome, therefore GPSEA raises
an :class:`~gpsea.analysis.AnalysisException` to force the user to troubleshoot.

To help with troubleshooting, the data computed prior detecting the error is included in the exception's
:attr:`~gpsea.analysis.AnalysisException.data` attribute. In survival analysis, the data should include
the identifiers, genotype classes, and survivals of the tested individuals.

Let's show this on an example. We will create a toy cohort of 10 individuals
with onset of `Lynch syndrome I <https://hpo.jax.org/browse/disease/OMIM:120435>`_
(`OMIM:120435`) at 40 years.

>>> from gpsea.model import Cohort, Patient, Disease, Age
>>> onset = Age.from_iso8601_period("P40Y")
>>> individuals = [
... Patient.from_raw_parts(
... labels=label,
... diseases=(
... Disease.from_raw_parts(
... term_id="OMIM:120435",
... name="Lynch syndrome I",
... is_observed=True,
... onset=onset,
... ),
... ),
... )
... for label in "ABCDEFGHIJ" # 10 individuals
... ]
>>> cohort = Cohort.from_patients(individuals)

We will assign them into genotype classes on random, ...

>>> from gpsea.analysis.clf import random_classifier
>>> gt_clf = random_classifier(seed=123)
>>> gt_clf.description
'Classify the individual into random classes'

... using the Lynch syndrome I diagnosis as the endpoint ...

>>> from gpsea.analysis.temporal.endpoint import disease_onset
>>> endpoint = disease_onset(disease_id="OMIM:120435")
>>> endpoint.description
'Compute time until OMIM:120435 onset'

... and we will use Logrank test for differences in survival.

>>> from gpsea.analysis.temporal.stats import LogRankTest
>>> survival_statistic = LogRankTest()

We put together the survival analysis ...

>>> from gpsea.analysis.temporal import SurvivalAnalysis
>>> survival_analysis = SurvivalAnalysis(
... statistic=survival_statistic,
... )

... which we expect to fail with an :class:`~gpsea.analysis.AnalysisException`:

>>> result = survival_analysis.compare_genotype_vs_survival(
... cohort=cohort,
... gt_clf=gt_clf,
... endpoint=endpoint,
... )
Traceback (most recent call last):
...
gpsea.analysis._base.AnalysisException: The survival values did not meet the expectation of the statistical test!

The genotype classes and survival values can be retrieved from the exception:

>>> from gpsea.analysis import AnalysisException
>>> try:
... result = survival_analysis.compare_genotype_vs_survival(
... cohort=cohort,
... gt_clf=gt_clf,
... endpoint=endpoint,
... )
... except AnalysisException as ae:
... genotypes = ae.data["genotype"]
... survivals = ae.data["survival"]

and the values can come in handy in troubleshooting:

>>> genotypes[:3]
(0, 0, 0)
>>> survivals[:3]
(Survival(value=14610.0, is_censored=False), Survival(value=14610.0, is_censored=False), Survival(value=14610.0, is_censored=False))
5 changes: 5 additions & 0 deletions src/gpsea/analysis/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,11 @@ class MonoPhenotypeAnalysisResult(AnalysisResult, metaclass=abc.ABCMeta):
that tested a single genotype-phenotype association.
"""

SAMPLE_ID = "patient_id"
"""
Name of the data index.
"""

GT_COL = "genotype"
"""
Name of column for storing genotype data.
Expand Down
2 changes: 2 additions & 0 deletions src/gpsea/analysis/clf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
monoallelic_classifier,
biallelic_classifier,
allele_count,
random_classifier,
)
from ._util import (
prepare_classifiers_for_terms_of_interest,
Expand All @@ -28,6 +29,7 @@
"monoallelic_classifier",
"biallelic_classifier",
"allele_count",
"random_classifier",
"PhenotypeClassifier",
"PhenotypeCategorization",
"P",
Expand Down
60 changes: 60 additions & 0 deletions src/gpsea/analysis/clf/_gt_classifiers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import random
import typing

from collections import Counter
Expand Down Expand Up @@ -640,3 +641,62 @@ def diagnosis_classifier(
diagnoses=diagnoses,
labels=labels,
)


class RandomClassifier(GenotypeClassifier):

CATS = (
Categorization(
category=PatientCategory(
cat_id=0, name="A",
)
),
Categorization(
category=PatientCategory(
cat_id=1, name="B",
)
)
)

def __init__(
self,
seed: typing.Optional[float] = None,
):
self._rng = random.Random(x=seed)

@property
def name(self) -> str:
return "Random Classifier"

@property
def description(self) -> str:
return "Classify the individual into random classes"

@property
def variable_name(self) -> str:
return "Randomness"

def get_categorizations(self) -> typing.Sequence[Categorization]:
return RandomClassifier.CATS

def test(self, _: Patient) -> typing.Optional[Categorization]:
return self._rng.choice(RandomClassifier.CATS)

def __eq__(self, value: object) -> bool:
return isinstance(value, RandomClassifier) and self._rng == value._rng

def __hash__(self) -> int:
return hash((self._rng, ))


def random_classifier(
seed: typing.Optional[float] = None,
) -> GenotypeClassifier:
"""
Genotype classifier to assign an individual into one of two classes, `A` and `B` on random..

:param seed: the seed for the random number generator.
"""
return RandomClassifier(
seed=seed,
)
13 changes: 11 additions & 2 deletions src/gpsea/analysis/temporal/_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,10 @@ def compare_genotype_vs_survival(
Execute the survival analysis on a given `cohort`.
"""

idx = pd.Index((patient.patient_id for patient in cohort), name="patient_id")
idx = pd.Index(
(patient.patient_id for patient in cohort),
name=MonoPhenotypeAnalysisResult.SAMPLE_ID,
)
data = pd.DataFrame(
None,
index=idx,
Expand All @@ -194,8 +197,14 @@ def compare_genotype_vs_survival(
vals = tuple(survivals[gt_cat] for gt_cat in gt_clf.get_categorizations())
result = self._statistic.compute_pval(vals)
if math.isnan(result.pval):
partial = {
MonoPhenotypeAnalysisResult.SAMPLE_ID: tuple(data.index),
"genotype": tuple(data[MonoPhenotypeAnalysisResult.GT_COL]),
"survival": tuple(data[MonoPhenotypeAnalysisResult.PH_COL]),
}

raise AnalysisException(
dict(data=data),
partial,
"The survival values did not meet the expectation of the statistical test!",
)

Expand Down
Loading