From 291fb753c5cc1abd18948a82649c3874c1606b98 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 15 Jan 2025 10:33:58 +0100 Subject: [PATCH 1/7] Improve error message when age cannot be parsed #393. --- src/gpsea/preprocessing/_phenopacket.py | 32 ++++++++++++++++++------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/src/gpsea/preprocessing/_phenopacket.py b/src/gpsea/preprocessing/_phenopacket.py index c7d90fb8e..227d17640 100644 --- a/src/gpsea/preprocessing/_phenopacket.py +++ b/src/gpsea/preprocessing/_phenopacket.py @@ -128,6 +128,7 @@ def find_coordinates( f"{variation_descriptor.vcf_record.genome_assembly} is not {self._build.identifier}." ) contig = self._build.contig_by_name(variation_descriptor.vcf_record.chrom) + assert contig is not None start = int(variation_descriptor.vcf_record.pos) - 1 ref = variation_descriptor.vcf_record.ref alt = variation_descriptor.vcf_record.alt @@ -142,6 +143,7 @@ def find_coordinates( seq_location = variation.copy_number.allele.sequence_location refseq_contig_name = seq_location.sequence_id.split(":")[1] contig = self._build.contig_by_name(refseq_contig_name) + assert contig is not None # Assuming SV coordinates are 1-based (VCF style), # so we subtract 1 to transform to 0-based coordinate system @@ -591,7 +593,11 @@ def _extract_age( ) -> typing.Optional[Age]: if individual.HasField("time_at_last_encounter"): tale = individual.time_at_last_encounter - return parse_age_element(tale, notepad) + return parse_age_element( + 'time_at_last_encounter', + tale, + notepad, + ) return None @staticmethod @@ -614,6 +620,7 @@ def _extract_vital_status( if vital_status.HasField("time_of_death"): age_of_death = parse_age_element( + 'time_of_death', time_element=vital_status.time_of_death, notepad=notepad, ) @@ -904,21 +911,27 @@ def parse_onset_element( We allow to use `GestationalAge`, `Age` or `OntologyClass` as onset. """ case = time_element.WhichOneof("element") - if case == "ontology_class": + if case == "age": + age = time_element.age + return Age.from_iso8601_period(value=age.iso8601duration) + elif case == "gestational_age": + age = time_element.gestational_age + return Age.gestational(weeks=age.weeks, days=age.days) + elif case == "ontology_class": if term_onset_parser is None: return None else: return term_onset_parser.process( ontology_class=time_element.ontology_class, notepad=notepad, - ) + ) else: - return parse_age_element( - time_element=time_element, - notepad=notepad, - ) + notepad.add_warning(f"`time_element` is in currently unsupported format `{case}`") + return None + def parse_age_element( + field: str, time_element: PPTimeElement, notepad: Notepad, ) -> typing.Optional[Age]: @@ -933,5 +946,8 @@ def parse_age_element( age = time_element.age return Age.from_iso8601_period(value=age.iso8601duration) else: - notepad.add_warning(f"`time_element` is in currently unsupported format `{case}`") + notepad.add_warning( + f"{case} of the {field} field cannot be parsed into age", + "Consider formatting the age as ISO8601 duration (e.g., \"P31Y2M\" for 31 years and 2 months)" + ) return None From 0da395530463984a7eaa675c8c8f13899680fb47 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 15 Jan 2025 10:46:05 +0100 Subject: [PATCH 2/7] The median line color can be tweaked in phenotype score analysis boxplots #397. --- src/gpsea/analysis/pscore/_api.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/gpsea/analysis/pscore/_api.py b/src/gpsea/analysis/pscore/_api.py index 72049f04e..5403e9daa 100644 --- a/src/gpsea/analysis/pscore/_api.py +++ b/src/gpsea/analysis/pscore/_api.py @@ -141,13 +141,15 @@ def plot_boxplots( self, ax, colors=("darksalmon", "honeydew"), + median_color: str = "red", ): """ Draw box plot with distributions of phenotype scores for the genotype groups. :param gt_predicate: the genotype predicate used to produce the genotype groups. :param ax: the Matplotlib :class:`~matplotlib.axes.Axes` to draw on. - :param colors: a tuple with colors to use for coloring the box patches of the box plot. + :param colors: a sequence with colors to use for coloring the box patches of the box plot. + :param median_color: a `str` with the color for the boxplot median line. """ # skip the patients with unassigned genotype group bla = self._data.notna() @@ -175,9 +177,13 @@ def plot_boxplots( tick_labels=gt_cat_names, ) + # Set face colors of the boxes for patch, color in zip(bplot["boxes"], colors): patch.set_facecolor(color) + for median in bplot['medians']: + median.set_color(median_color) + def __eq__(self, value: object) -> bool: return isinstance(value, PhenotypeScoreAnalysisResult) and super( MonoPhenotypeAnalysisResult, self From 8822bc11ffb6289515370d5612712a916f5c5693 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 15 Jan 2025 11:09:57 +0100 Subject: [PATCH 3/7] The boxplot median line is black by default #397 --- src/gpsea/analysis/pscore/_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpsea/analysis/pscore/_api.py b/src/gpsea/analysis/pscore/_api.py index 5403e9daa..27a4279ab 100644 --- a/src/gpsea/analysis/pscore/_api.py +++ b/src/gpsea/analysis/pscore/_api.py @@ -141,7 +141,7 @@ def plot_boxplots( self, ax, colors=("darksalmon", "honeydew"), - median_color: str = "red", + median_color: str = "black", ): """ Draw box plot with distributions of phenotype scores for the genotype groups. From 0607322cedd0f1920b542e073c52d1c215f75d12 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 15 Jan 2025 12:25:59 +0100 Subject: [PATCH 4/7] Add guide for writing custom components #396 --- docs/user-guide/analyses/phenotype-scores.rst | 20 +++++ docs/user-guide/custom-components.rst | 76 +++++++++++++++++++ docs/user-guide/index.rst | 1 + 3 files changed, 97 insertions(+) create mode 100644 docs/user-guide/custom-components.rst diff --git a/docs/user-guide/analyses/phenotype-scores.rst b/docs/user-guide/analyses/phenotype-scores.rst index 5390e3ff0..502a59f52 100644 --- a/docs/user-guide/analyses/phenotype-scores.rst +++ b/docs/user-guide/analyses/phenotype-scores.rst @@ -156,6 +156,26 @@ Phenotype score This component is responsible for computing a phenotype score for an individual. As far as GPSEA framework is concerned, the phenotype score must be a floating point number or a `NaN` value if the score cannot be computed for an individual. +This is the essence of the :class:`~gpsea.analysis.pscore.PhenotypeScorer` class. + +GPSEA ships with several builtin phenotype scorers which can be used as + ++------------------------------------------------------------+---------------------------------------------+ +| Name | Description | ++============================================================+=============================================+ +| | Compute the total number of occurrences | +| * :class:`~gpsea.analysis.pscore.CountingPhenotypeScorer` | of specific phenotypic features | +| | (used in this section) | ++------------------------------------------------------------+---------------------------------------------+ +| | Compute the "adapted De Vries Score" | +| * :class:`~gpsea.analysis.pscore.DeVriesPhenotypeScorer` | for assessing severity | +| | of intellectual disability | ++------------------------------------------------------------+---------------------------------------------+ + +.. tip:: + + See :ref:`custom-phenotype-scorer` section to learn how to build a phenotype scorer from scratch. + Here we use the :class:`~gpsea.analysis.pscore.CountingPhenotypeScorer` for scoring the individuals based on the number of structural defects diff --git a/docs/user-guide/custom-components.rst b/docs/user-guide/custom-components.rst new file mode 100644 index 000000000..15b19ebe4 --- /dev/null +++ b/docs/user-guide/custom-components.rst @@ -0,0 +1,76 @@ +.. _custom-components: + +################# +Custom components +################# + +GPSEA aims to stay useful in the long run. Therefore, we took a great care +to adhere to "good software development practices" and we framed GPSEA functionalities +as a set of loosely coupled components. As a rule of thumb, each component corresponds +to a Python abstract base class which is then extended by the builtin components +and can also be extended by the future components, to serve both common or exotic use cases. + +The abstract base classes define the component API. +Per guidelines in Python's :mod:`abc` module, the abstract classes use :class:`abc.ABCMeta` as a metaclass +and the class API consists of methods annotated with the :func:`abc.abstractmethod` decorator. +These decorated methods must be overridden in the subclasses. + +The following sections provide guidance for customizing the most commonly used GPSEA components. + + +.. _custom-phenotype-scorer: + +**************** +Phenotype scorer +**************** + +:class:`~gpsea.analysis.pscore.PhenotypeScorer` computes a phenotype score for an individual. +The phenotype score is a `float` with range :math:`(-\infty, \infty)` where `NaN` indicates +that a score cannot be computed (e.g. the lab measurement value was not ascertained for the individual). + +Here we show an example of a toy phenotype scorer +for using body mass index (BMI) as a phenotype score. + +>>> import typing +>>> from gpsea.model import Patient +>>> from gpsea.analysis.pscore import PhenotypeScorer +>>> class BmiScorer(PhenotypeScorer): # ❶ +... +... def __init__( # ❷ +... self, +... id2bmi: typing.Mapping[str, float], +... ): +... self._id2bmi = id2bmi +... +... @property +... def name(self) -> str: # ❸ +... return "BMI phenotype scorer" +... +... @property +... def description(self) -> str: # ❹ +... return "Body mass index used as a phenotype score" +... +... @property +... def variable_name(self) -> str: # ❺ +... return "BMI" +... +... def score(self, patient: Patient) -> float: # ❻ +... try: +... return self._id2bmi[patient.labels.label] +... except KeyError: +... return float('nan') + +❶ The ``BmiScorer`` must extend :class:`~gpsea.analysis.pscore.PhenotypeScorer` +to be used as a phenotype scorer. +❷ The scorer needs a ``dict`` with `label` → `BMI` for the analyzed individuals. +We assume the user will pre-compute the corresponding ``dict``. + +Then, the scorer must expose several properties, including ❸ ``name``, ❹ ``description``, +and the ❺ ``variable_name`` it operates on. +The properties provide bookkeeping metadata to use in e.g. visualizations. +Try to choose short and concise names. + +The most important part of the scorer is the ❻ `score` method +which retrieves the BMI for an individual or returns `NaN` if the value is not available +and the individual should be omitted from the analysis. + diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index 2a3c92cc3..23b9e9718 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -28,4 +28,5 @@ in an interactive Python environment, such as Jupyter notebook. input-data exploratory analyses/index + custom-components glossary From 369989888cdafbcf955129ed0484318966b54ebe Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 15 Jan 2025 13:04:21 +0100 Subject: [PATCH 5/7] Add guide for writing custom `VariantPredicate` #326 --- .../genotype/variant_predicates.rst | 14 ++-- docs/user-guide/custom-components.rst | 66 +++++++++++++++++++ 2 files changed, 75 insertions(+), 5 deletions(-) diff --git a/docs/user-guide/analyses/partitioning/genotype/variant_predicates.rst b/docs/user-guide/analyses/partitioning/genotype/variant_predicates.rst index 4fbca7dfe..36287cae3 100644 --- a/docs/user-guide/analyses/partitioning/genotype/variant_predicates.rst +++ b/docs/user-guide/analyses/partitioning/genotype/variant_predicates.rst @@ -185,12 +185,16 @@ The builtin predicates should cover majority of use cases. However, if a predicate seems to be missing, feel free to submit an issue in our `GitHub tracker `_, -or to implement a custom predicate -by extending the :class:`~gpsea.analysis.predicate.VariantPredicate` class 😎. +or implement your own predicate by following the :ref:`custom-variant-predicate` +guide. +**** +Next +**** The variant predicate offers a flexible API for testing if variants meet a condition. -However, the genotype phenotype correlations are done on the individual level -and the variant predicates are used as a component of the genotype predicate. -The next sections show how to use variant predicates to assign individuals into groups. +However, the genotype phenotype correlations are studied on the level of individuals. +As described in :ref:`genotype-classifiers`, GPSEA uses the :class:`~gpsea.analysis.clf.GenotypeClassifier` API +to assign individuals into non-overlapping classes. Variant predicates are essential for creating such classifier. +We explain the details in the following sections. diff --git a/docs/user-guide/custom-components.rst b/docs/user-guide/custom-components.rst index 15b19ebe4..af851a2c5 100644 --- a/docs/user-guide/custom-components.rst +++ b/docs/user-guide/custom-components.rst @@ -74,3 +74,69 @@ The most important part of the scorer is the ❻ `score` method which retrieves the BMI for an individual or returns `NaN` if the value is not available and the individual should be omitted from the analysis. +.. _custom-variant-predicate: + +***************** +Variant predicate +***************** + +The purpose of a :class:`~gpsea.analysis.predicate.VariantPredicate` is to test +if a variant meets a certain criterion and GPSEA ships with an array +of builtin predicates (see :mod:`gpsea.analysis.predicate` module). +However, chances are a custom predicate will be needed in future, +so we show how to how to extend +the :class:`~gpsea.analysis.predicate.VariantPredicate` class +to create one's own predicate. + +Specifically, we show how to create a predicate to test if the variant affects a glycine residue +of the transcript of interest. + +>>> from gpsea.model import Variant, VariantEffect +>>> from gpsea.analysis.predicate import VariantPredicate +>>> class AffectsGlycinePredicate(VariantPredicate): # ❶ +... def __init__( # ❷ +... self, +... tx_id: str, +... ): +... self._tx_id = tx_id +... self._aa_code = "Gly" +... +... @property +... def name(self) -> str: # ❸ +... return "Affects Glycine" +... +... @property +... def description(self) -> str: # ❹ +... return "affects a glycine residue" +... +... @property +... def variable_name(self) -> str: # ❺ +... return "affected aminoacid residue" +... +... def test(self, variant: Variant) -> bool: # ❻ +... tx_ann = variant.get_tx_anno_by_tx_id(self._tx_id) +... if tx_ann is not None: +... hgvsp = tx_ann.hgvsp +... if hgvsp is not None: +... return hgvsp.startswith(f"p.{self._aa_code}") +... return False +... +... def __eq__(self, value: object) -> bool: # ➐ +... return isinstance(value, AffectsGlycinePredicate) and self._tx_id == value._tx_id +... +... def __hash__(self) -> int: # ❽ +... return hash((self._tx_id,)) +... +... def __repr__(self) -> str: # ❾ +... return str(self) +... +... def __str__(self) -> str: # ➓ +... return f"AffectsGlycinePredicate(tx_id={self._tx_id})" + +❶ The ``AffectsGlycinePredicate`` must extend :class:`~gpsea.analysis.predicate.VariantPredicate`. +❷ We ask the user to provide the transcript accession `str` and we set the target aminoacid code to glycine ``Gly``. +Like in the :ref:`custom-phenotype-scorer` above, ❸❹❺ provide metadata required for the bookkeeping. +The ❻ ``test`` method includes the most interesting part - we retrieve the :class:`~gpsea.model.TranscriptAnnotation` +with the functional annotation data for the transcript of interest, and we test if the HGVS protein indicates +that the reference aminoacid is glycine. +Last, we override ➐ ``__eq__()`` and ❽ ``__hash__()`` (required) as well as ❾ ``__repr__()`` and ➓ ``__str__()`` (recommended). From ba0487d54643268cfca779e162533b63bf7f2612 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 15 Jan 2025 14:15:59 +0100 Subject: [PATCH 6/7] Include setup link in `README.md`. --- README.md | 7 +++---- docs/setup.rst | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ace2b4572..7715f7958 100644 --- a/README.md +++ b/README.md @@ -5,10 +5,9 @@ GPSEA (Genotypes and Phenotypes - Statistical Evaluation of Associations) is a Python package for finding genotype-phenotype associations. - -See the [Tutorial](https://monarch-initiative.github.io/gpsea/stable/tutorial.html) -and a comprehensive [User guide](https://monarch-initiative.github.io/gpsea/stable/user-guide/index.html) -for more information. +See our documentation for the [setup](https://monarch-initiative.github.io/gpsea/stable/setup.html) instructions, +a [tutorial](https://monarch-initiative.github.io/gpsea/stable/tutorial.html) with an end-to-end genotype-phenotype association analysis, +and a comprehensive [user guide](https://monarch-initiative.github.io/gpsea/stable/user-guide/index.html) with everything else. The documentation comes in two flavors: diff --git a/docs/setup.rst b/docs/setup.rst index 1be54561b..71eac1804 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -4,8 +4,8 @@ Setup ##### -Here we show how to install GPSEA and to prepare your Python environment -for genotype-phenotype association analysis. +Here we show how to install GPSEA and prepare your Python environment +for genotype-phenotype association analyses. .. contents:: Table of Contents From bf691efa0eb0134aa8f56840271dce32a757f38d Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 15 Jan 2025 14:25:34 +0100 Subject: [PATCH 7/7] Add missing whitespace #388 --- src/gpsea/preprocessing/_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpsea/preprocessing/_config.py b/src/gpsea/preprocessing/_config.py index 3de08f8e8..1294eb5e6 100644 --- a/src/gpsea/preprocessing/_config.py +++ b/src/gpsea/preprocessing/_config.py @@ -495,7 +495,7 @@ def load_phenopackets( # Keep track of the progress by wrapping the list of phenopackets # with TQDM 😎 cohort_iter = tqdm( - phenopackets, desc="Individuals Processed", file=sys.stdout, unit="individuals" + phenopackets, desc="Individuals Processed", file=sys.stdout, unit=" individuals" ) notepad = create_notepad(label="Phenopackets") cohort = cohort_creator.process(cohort_iter, notepad)