diff --git a/docs/user-guide/analyses/survival.rst b/docs/user-guide/analyses/survival.rst index 616bfa467..faef0bbb2 100644 --- a/docs/user-guide/analyses/survival.rst +++ b/docs/user-guide/analyses/survival.rst @@ -188,3 +188,96 @@ or `None` if computing the survival was impossible (see :func:`~gpsea.analysis.t The `Survival` reports the number of days until attaining the endpoint, here defined as end stage renal disease (`is_censored=False`), or until the individual dropped out of the analysis (`is_censored=True`). + + +Troubleshooting +=============== + +Sometimes the survival analysis fails and an :class:`~gpsea.analysis.AnalysisException` is raised. +For instance, the current Logrank test implementation reports a p value of `NaN` +if the survival is the same for all individuals. +This is unlikely an expected outcome, therefore GPSEA raises +an :class:`~gpsea.analysis.AnalysisException` to force the user to troubleshoot. + +To help with troubleshooting, the data computed prior detecting the error is included in the exception's +:attr:`~gpsea.analysis.AnalysisException.data` attribute. In survival analysis, the data should include +the identifiers, genotype classes, and survivals of the tested individuals. + +Let's show this on an example. We will create a toy cohort of 10 individuals +with onset of `Lynch syndrome I `_ +(`OMIM:120435`) at 40 years. + +>>> from gpsea.model import Cohort, Patient, Disease, Age +>>> onset = Age.from_iso8601_period("P40Y") +>>> individuals = [ +... Patient.from_raw_parts( +... labels=label, +... diseases=( +... Disease.from_raw_parts( +... term_id="OMIM:120435", +... name="Lynch syndrome I", +... is_observed=True, +... onset=onset, +... ), +... ), +... ) +... for label in "ABCDEFGHIJ" # 10 individuals +... ] +>>> cohort = Cohort.from_patients(individuals) + +We will assign them into genotype classes on random, ... + +>>> from gpsea.analysis.clf import random_classifier +>>> gt_clf = random_classifier(seed=123) +>>> gt_clf.description +'Classify the individual into random classes' + +... using the Lynch syndrome I diagnosis as the endpoint ... + +>>> from gpsea.analysis.temporal.endpoint import disease_onset +>>> endpoint = disease_onset(disease_id="OMIM:120435") +>>> endpoint.description +'Compute time until OMIM:120435 onset' + +... and we will use Logrank test for differences in survival. + +>>> from gpsea.analysis.temporal.stats import LogRankTest +>>> survival_statistic = LogRankTest() + +We put together the survival analysis ... + +>>> from gpsea.analysis.temporal import SurvivalAnalysis +>>> survival_analysis = SurvivalAnalysis( +... statistic=survival_statistic, +... ) + +... which we expect to fail with an :class:`~gpsea.analysis.AnalysisException`: + +>>> result = survival_analysis.compare_genotype_vs_survival( +... cohort=cohort, +... gt_clf=gt_clf, +... endpoint=endpoint, +... ) +Traceback (most recent call last): + ... +gpsea.analysis._base.AnalysisException: The survival values did not meet the expectation of the statistical test! + +The genotype classes and survival values can be retrieved from the exception: + +>>> from gpsea.analysis import AnalysisException +>>> try: +... result = survival_analysis.compare_genotype_vs_survival( +... cohort=cohort, +... gt_clf=gt_clf, +... endpoint=endpoint, +... ) +... except AnalysisException as ae: +... genotypes = ae.data["genotype"] +... survivals = ae.data["survival"] + +and the values can come in handy in troubleshooting: + +>>> genotypes[:3] +(0, 0, 0) +>>> survivals[:3] +(Survival(value=14610.0, is_censored=False), Survival(value=14610.0, is_censored=False), Survival(value=14610.0, is_censored=False)) diff --git a/src/gpsea/analysis/_base.py b/src/gpsea/analysis/_base.py index 1725cf9e8..72aa54483 100644 --- a/src/gpsea/analysis/_base.py +++ b/src/gpsea/analysis/_base.py @@ -395,6 +395,11 @@ class MonoPhenotypeAnalysisResult(AnalysisResult, metaclass=abc.ABCMeta): that tested a single genotype-phenotype association. """ + SAMPLE_ID = "patient_id" + """ + Name of the data index. + """ + GT_COL = "genotype" """ Name of column for storing genotype data. diff --git a/src/gpsea/analysis/clf/__init__.py b/src/gpsea/analysis/clf/__init__.py index 3cf48eda7..671596bee 100644 --- a/src/gpsea/analysis/clf/__init__.py +++ b/src/gpsea/analysis/clf/__init__.py @@ -9,6 +9,7 @@ monoallelic_classifier, biallelic_classifier, allele_count, + random_classifier, ) from ._util import ( prepare_classifiers_for_terms_of_interest, @@ -28,6 +29,7 @@ "monoallelic_classifier", "biallelic_classifier", "allele_count", + "random_classifier", "PhenotypeClassifier", "PhenotypeCategorization", "P", diff --git a/src/gpsea/analysis/clf/_gt_classifiers.py b/src/gpsea/analysis/clf/_gt_classifiers.py index 51a4ce289..c7d608733 100644 --- a/src/gpsea/analysis/clf/_gt_classifiers.py +++ b/src/gpsea/analysis/clf/_gt_classifiers.py @@ -1,3 +1,4 @@ +import random import typing from collections import Counter @@ -640,3 +641,62 @@ def diagnosis_classifier( diagnoses=diagnoses, labels=labels, ) + + +class RandomClassifier(GenotypeClassifier): + + CATS = ( + Categorization( + category=PatientCategory( + cat_id=0, name="A", + ) + ), + Categorization( + category=PatientCategory( + cat_id=1, name="B", + ) + ) + ) + + def __init__( + self, + seed: typing.Optional[float] = None, + ): + self._rng = random.Random(x=seed) + + @property + def name(self) -> str: + return "Random Classifier" + + @property + def description(self) -> str: + return "Classify the individual into random classes" + + @property + def variable_name(self) -> str: + return "Randomness" + + def get_categorizations(self) -> typing.Sequence[Categorization]: + return RandomClassifier.CATS + + def test(self, _: Patient) -> typing.Optional[Categorization]: + return self._rng.choice(RandomClassifier.CATS) + + def __eq__(self, value: object) -> bool: + return isinstance(value, RandomClassifier) and self._rng == value._rng + + def __hash__(self) -> int: + return hash((self._rng, )) + + +def random_classifier( + seed: typing.Optional[float] = None, +) -> GenotypeClassifier: + """ + Genotype classifier to assign an individual into one of two classes, `A` and `B` on random.. + + :param seed: the seed for the random number generator. + """ + return RandomClassifier( + seed=seed, + ) diff --git a/src/gpsea/analysis/temporal/_api.py b/src/gpsea/analysis/temporal/_api.py index 0ecffec10..831a3365e 100644 --- a/src/gpsea/analysis/temporal/_api.py +++ b/src/gpsea/analysis/temporal/_api.py @@ -168,7 +168,10 @@ def compare_genotype_vs_survival( Execute the survival analysis on a given `cohort`. """ - idx = pd.Index((patient.patient_id for patient in cohort), name="patient_id") + idx = pd.Index( + (patient.patient_id for patient in cohort), + name=MonoPhenotypeAnalysisResult.SAMPLE_ID, + ) data = pd.DataFrame( None, index=idx, @@ -194,8 +197,14 @@ def compare_genotype_vs_survival( vals = tuple(survivals[gt_cat] for gt_cat in gt_clf.get_categorizations()) result = self._statistic.compute_pval(vals) if math.isnan(result.pval): + partial = { + MonoPhenotypeAnalysisResult.SAMPLE_ID: tuple(data.index), + "genotype": tuple(data[MonoPhenotypeAnalysisResult.GT_COL]), + "survival": tuple(data[MonoPhenotypeAnalysisResult.PH_COL]), + } + raise AnalysisException( - dict(data=data), + partial, "The survival values did not meet the expectation of the statistical test!", )