Skip to content

Commit 5518116

Browse files
authored
Merge pull request #399 from monarch-initiative/report-partial-data-from-survival-analysis
Report partial data if survival statistics fails
2 parents d0f69d0 + 2721f33 commit 5518116

File tree

5 files changed

+171
-2
lines changed

5 files changed

+171
-2
lines changed

docs/user-guide/analyses/survival.rst

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,3 +188,96 @@ or `None` if computing the survival was impossible (see :func:`~gpsea.analysis.t
188188
The `Survival` reports the number of days until attaining the endpoint,
189189
here defined as end stage renal disease (`is_censored=False`),
190190
or until the individual dropped out of the analysis (`is_censored=True`).
191+
192+
193+
Troubleshooting
194+
===============
195+
196+
Sometimes the survival analysis fails and an :class:`~gpsea.analysis.AnalysisException` is raised.
197+
For instance, the current Logrank test implementation reports a p value of `NaN`
198+
if the survival is the same for all individuals.
199+
This is unlikely an expected outcome, therefore GPSEA raises
200+
an :class:`~gpsea.analysis.AnalysisException` to force the user to troubleshoot.
201+
202+
To help with troubleshooting, the data computed prior detecting the error is included in the exception's
203+
:attr:`~gpsea.analysis.AnalysisException.data` attribute. In survival analysis, the data should include
204+
the identifiers, genotype classes, and survivals of the tested individuals.
205+
206+
Let's show this on an example. We will create a toy cohort of 10 individuals
207+
with onset of `Lynch syndrome I <https://hpo.jax.org/browse/disease/OMIM:120435>`_
208+
(`OMIM:120435`) at 40 years.
209+
210+
>>> from gpsea.model import Cohort, Patient, Disease, Age
211+
>>> onset = Age.from_iso8601_period("P40Y")
212+
>>> individuals = [
213+
... Patient.from_raw_parts(
214+
... labels=label,
215+
... diseases=(
216+
... Disease.from_raw_parts(
217+
... term_id="OMIM:120435",
218+
... name="Lynch syndrome I",
219+
... is_observed=True,
220+
... onset=onset,
221+
... ),
222+
... ),
223+
... )
224+
... for label in "ABCDEFGHIJ" # 10 individuals
225+
... ]
226+
>>> cohort = Cohort.from_patients(individuals)
227+
228+
We will assign them into genotype classes on random, ...
229+
230+
>>> from gpsea.analysis.clf import random_classifier
231+
>>> gt_clf = random_classifier(seed=123)
232+
>>> gt_clf.description
233+
'Classify the individual into random classes'
234+
235+
... using the Lynch syndrome I diagnosis as the endpoint ...
236+
237+
>>> from gpsea.analysis.temporal.endpoint import disease_onset
238+
>>> endpoint = disease_onset(disease_id="OMIM:120435")
239+
>>> endpoint.description
240+
'Compute time until OMIM:120435 onset'
241+
242+
... and we will use Logrank test for differences in survival.
243+
244+
>>> from gpsea.analysis.temporal.stats import LogRankTest
245+
>>> survival_statistic = LogRankTest()
246+
247+
We put together the survival analysis ...
248+
249+
>>> from gpsea.analysis.temporal import SurvivalAnalysis
250+
>>> survival_analysis = SurvivalAnalysis(
251+
... statistic=survival_statistic,
252+
... )
253+
254+
... which we expect to fail with an :class:`~gpsea.analysis.AnalysisException`:
255+
256+
>>> result = survival_analysis.compare_genotype_vs_survival(
257+
... cohort=cohort,
258+
... gt_clf=gt_clf,
259+
... endpoint=endpoint,
260+
... )
261+
Traceback (most recent call last):
262+
...
263+
gpsea.analysis._base.AnalysisException: The survival values did not meet the expectation of the statistical test!
264+
265+
The genotype classes and survival values can be retrieved from the exception:
266+
267+
>>> from gpsea.analysis import AnalysisException
268+
>>> try:
269+
... result = survival_analysis.compare_genotype_vs_survival(
270+
... cohort=cohort,
271+
... gt_clf=gt_clf,
272+
... endpoint=endpoint,
273+
... )
274+
... except AnalysisException as ae:
275+
... genotypes = ae.data["genotype"]
276+
... survivals = ae.data["survival"]
277+
278+
and the values can come in handy in troubleshooting:
279+
280+
>>> genotypes[:3]
281+
(0, 0, 0)
282+
>>> survivals[:3]
283+
(Survival(value=14610.0, is_censored=False), Survival(value=14610.0, is_censored=False), Survival(value=14610.0, is_censored=False))

src/gpsea/analysis/_base.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,11 @@ class MonoPhenotypeAnalysisResult(AnalysisResult, metaclass=abc.ABCMeta):
395395
that tested a single genotype-phenotype association.
396396
"""
397397

398+
SAMPLE_ID = "patient_id"
399+
"""
400+
Name of the data index.
401+
"""
402+
398403
GT_COL = "genotype"
399404
"""
400405
Name of column for storing genotype data.

src/gpsea/analysis/clf/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
monoallelic_classifier,
1010
biallelic_classifier,
1111
allele_count,
12+
random_classifier,
1213
)
1314
from ._util import (
1415
prepare_classifiers_for_terms_of_interest,
@@ -28,6 +29,7 @@
2829
"monoallelic_classifier",
2930
"biallelic_classifier",
3031
"allele_count",
32+
"random_classifier",
3133
"PhenotypeClassifier",
3234
"PhenotypeCategorization",
3335
"P",

src/gpsea/analysis/clf/_gt_classifiers.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import random
12
import typing
23

34
from collections import Counter
@@ -640,3 +641,62 @@ def diagnosis_classifier(
640641
diagnoses=diagnoses,
641642
labels=labels,
642643
)
644+
645+
646+
class RandomClassifier(GenotypeClassifier):
647+
648+
CATS = (
649+
Categorization(
650+
category=PatientCategory(
651+
cat_id=0, name="A",
652+
)
653+
),
654+
Categorization(
655+
category=PatientCategory(
656+
cat_id=1, name="B",
657+
)
658+
)
659+
)
660+
661+
def __init__(
662+
self,
663+
seed: typing.Optional[float] = None,
664+
):
665+
self._rng = random.Random(x=seed)
666+
667+
@property
668+
def name(self) -> str:
669+
return "Random Classifier"
670+
671+
@property
672+
def description(self) -> str:
673+
return "Classify the individual into random classes"
674+
675+
@property
676+
def variable_name(self) -> str:
677+
return "Randomness"
678+
679+
def get_categorizations(self) -> typing.Sequence[Categorization]:
680+
return RandomClassifier.CATS
681+
682+
def test(self, _: Patient) -> typing.Optional[Categorization]:
683+
return self._rng.choice(RandomClassifier.CATS)
684+
685+
def __eq__(self, value: object) -> bool:
686+
return isinstance(value, RandomClassifier) and self._rng == value._rng
687+
688+
def __hash__(self) -> int:
689+
return hash((self._rng, ))
690+
691+
692+
def random_classifier(
693+
seed: typing.Optional[float] = None,
694+
) -> GenotypeClassifier:
695+
"""
696+
Genotype classifier to assign an individual into one of two classes, `A` and `B` on random..
697+
698+
:param seed: the seed for the random number generator.
699+
"""
700+
return RandomClassifier(
701+
seed=seed,
702+
)

src/gpsea/analysis/temporal/_api.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,10 @@ def compare_genotype_vs_survival(
168168
Execute the survival analysis on a given `cohort`.
169169
"""
170170

171-
idx = pd.Index((patient.patient_id for patient in cohort), name="patient_id")
171+
idx = pd.Index(
172+
(patient.patient_id for patient in cohort),
173+
name=MonoPhenotypeAnalysisResult.SAMPLE_ID,
174+
)
172175
data = pd.DataFrame(
173176
None,
174177
index=idx,
@@ -194,8 +197,14 @@ def compare_genotype_vs_survival(
194197
vals = tuple(survivals[gt_cat] for gt_cat in gt_clf.get_categorizations())
195198
result = self._statistic.compute_pval(vals)
196199
if math.isnan(result.pval):
200+
partial = {
201+
MonoPhenotypeAnalysisResult.SAMPLE_ID: tuple(data.index),
202+
"genotype": tuple(data[MonoPhenotypeAnalysisResult.GT_COL]),
203+
"survival": tuple(data[MonoPhenotypeAnalysisResult.PH_COL]),
204+
}
205+
197206
raise AnalysisException(
198-
dict(data=data),
207+
partial,
199208
"The survival values did not meet the expectation of the statistical test!",
200209
)
201210

0 commit comments

Comments
 (0)