diff --git a/alembic/versions/1e08b947679d_add_functional_consequence.py b/alembic/versions/1e08b947679d_add_functional_consequence.py new file mode 100644 index 00000000..68461196 --- /dev/null +++ b/alembic/versions/1e08b947679d_add_functional_consequence.py @@ -0,0 +1,30 @@ +"""Add functional consequence + +Revision ID: 1e08b947679d +Revises: 019eb75ad9ae +Create Date: 2025-09-17 11:15:52.091271 + +""" + +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision = "1e08b947679d" +down_revision = "019eb75ad9ae" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("mapped_variants", sa.Column("vep_functional_consequence", sa.String(), nullable=True)) + op.add_column("mapped_variants", sa.Column("vep_access_date", sa.Date(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column("mapped_variants", "vep_access_date") + op.drop_column("mapped_variants", "vep_functional_consequence") + # ### end Alembic commands ### diff --git a/alembic/versions/b22b450d409c_add_mapped_hgvs.py b/alembic/versions/b22b450d409c_add_mapped_hgvs.py new file mode 100644 index 00000000..42bd6ecb --- /dev/null +++ b/alembic/versions/b22b450d409c_add_mapped_hgvs.py @@ -0,0 +1,34 @@ +"""Add mapped hgvs + +Revision ID: b22b450d409c +Revises: 1e08b947679d +Create Date: 2025-10-09 09:53:47.903249 + +""" + +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision = "b22b450d409c" +down_revision = "1e08b947679d" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.add_column("mapped_variants", sa.Column("hgvs_assay_level", sa.String(), nullable=True)) + op.add_column("mapped_variants", sa.Column("hgvs_g", sa.String(), nullable=True)) + op.add_column("mapped_variants", sa.Column("hgvs_c", sa.String(), nullable=True)) + op.add_column("mapped_variants", sa.Column("hgvs_p", sa.String(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column("mapped_variants", "hgvs_p") + op.drop_column("mapped_variants", "hgvs_c") + op.drop_column("mapped_variants", "hgvs_g") + op.drop_column("mapped_variants", "hgvs_assay_level") + # ### end Alembic commands ### diff --git a/src/mavedb/lib/clingen/allele_registry.py b/src/mavedb/lib/clingen/allele_registry.py index ee26028f..5e025b14 100644 --- a/src/mavedb/lib/clingen/allele_registry.py +++ b/src/mavedb/lib/clingen/allele_registry.py @@ -4,7 +4,7 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) -CLINGEN_API_URL = "https://reg.test.genome.network/allele" +CLINGEN_API_URL = "https://reg.genome.network/allele" def get_canonical_pa_ids(clingen_allele_id: str) -> list[str]: diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py index 5b5fb740..f3d45de6 100644 --- a/src/mavedb/lib/score_sets.py +++ b/src/mavedb/lib/score_sets.py @@ -25,7 +25,7 @@ from mavedb.lib.mave.utils import is_csv_null from mavedb.lib.validation.constants.general import null_values_list from mavedb.lib.validation.utilities import is_null as validate_is_null -from mavedb.lib.variants import get_digest_from_post_mapped, get_hgvs_from_post_mapped, is_hgvs_g, is_hgvs_p +from mavedb.lib.variants import get_digest_from_post_mapped from mavedb.models.contributor import Contributor from mavedb.models.controlled_keyword import ControlledKeyword from mavedb.models.doi_identifier import DoiIdentifier @@ -35,6 +35,7 @@ from mavedb.models.experiment_controlled_keyword import 
ExperimentControlledKeywordAssociation from mavedb.models.experiment_publication_identifier import ExperimentPublicationIdentifierAssociation from mavedb.models.experiment_set import ExperimentSet +from mavedb.models.gnomad_variant import GnomADVariant from mavedb.models.mapped_variant import MappedVariant from mavedb.models.publication_identifier import PublicationIdentifier from mavedb.models.refseq_identifier import RefseqIdentifier @@ -501,7 +502,7 @@ def find_publish_or_private_superseded_score_set_tail( def get_score_set_variants_as_csv( db: Session, score_set: ScoreSet, - namespaces: List[Literal["scores", "counts"]], + namespaces: List[Literal["scores", "counts", "vep", "gnomad"]], namespaced: Optional[bool] = None, start: Optional[int] = None, limit: Optional[int] = None, @@ -518,8 +519,8 @@ def get_score_set_variants_as_csv( The database session to use. score_set : ScoreSet The score set to get the variants from. - namespaces : List[Literal["scores", "counts"]] - The namespaces for data. Now there are only scores and counts. There will be ClinVar and gnomAD. + namespaces : List[Literal["scores", "counts", "vep", "gnomad"]] + The namespaces for data. Now there are only scores, counts, VEP, and gnomAD. ClinVar will be added in the future. namespaced: Optional[bool] = None Whether namespace the columns or not. start : int, optional @@ -531,8 +532,8 @@ def get_score_set_variants_as_csv( include_custom_columns : bool, optional Whether to include custom columns defined in the score set. Defaults to True. include_post_mapped_hgvs : bool, optional - Whether to include post-mapped HGVS notations in the output. Defaults to False. If True, the output will include - columns for both post-mapped HGVS genomic (g.) and protein (p.) notations. + Whether to include post-mapped HGVS notations and VEP functional consequence in the output. Defaults to False. If True, the output will include + columns for post-mapped HGVS genomic (g.) and protein (p.) 
notations, and VEP functional consequence. Returns _______ @@ -547,9 +548,12 @@ def get_score_set_variants_as_csv( if include_post_mapped_hgvs: namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_g") namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_p") + namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_c") + namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_at_assay_level") namespaced_score_set_columns["mavedb"].append("post_mapped_vrs_digest") for namespace in namespaces: namespaced_score_set_columns[namespace] = [] + if include_custom_columns: if "scores" in namespaced_score_set_columns: namespaced_score_set_columns["scores"] = [ @@ -561,10 +565,51 @@ def get_score_set_variants_as_csv( ] elif "scores" in namespaced_score_set_columns: namespaced_score_set_columns["scores"].append(REQUIRED_SCORE_COLUMN) + if "vep" in namespaced_score_set_columns: + namespaced_score_set_columns["vep"].append("vep_functional_consequence") + if "gnomad" in namespaced_score_set_columns: + namespaced_score_set_columns["gnomad"].append("gnomad_af") variants: Sequence[Variant] = [] mappings: Optional[list[Optional[MappedVariant]]] = None + gnomad_data: Optional[list[Optional[GnomADVariant]]] = None - if include_post_mapped_hgvs: + if "gnomad" in namespaces and include_post_mapped_hgvs: + variants_mappings_and_gnomad_query = ( + select(Variant, MappedVariant, GnomADVariant) + .join( + MappedVariant, + and_(Variant.id == MappedVariant.variant_id, MappedVariant.current.is_(True)), + isouter=True, + ) + .join(MappedVariant.gnomad_variants.of_type(GnomADVariant), isouter=True) + .where( + and_( + Variant.score_set_id == score_set.id, + or_( + and_( + GnomADVariant.db_name == "gnomAD", + GnomADVariant.db_version == "v4.1", + ), + GnomADVariant.id.is_(None), + ), + ) + ) + .order_by(cast(func.split_part(Variant.urn, "#", 2), Integer)) + ) + if start: + variants_mappings_and_gnomad_query = variants_mappings_and_gnomad_query.offset(start) + 
if limit: + variants_mappings_and_gnomad_query = variants_mappings_and_gnomad_query.limit(limit) + variants_mappings_and_gnomad = db.execute(variants_mappings_and_gnomad_query).all() + + variants = [] + mappings = [] + gnomad_data = [] + for variant, mapping, gnomad in variants_mappings_and_gnomad: + variants.append(variant) + mappings.append(mapping) + gnomad_data.append(gnomad) + elif include_post_mapped_hgvs: variants_and_mappings_query = ( select(Variant, MappedVariant) .join( @@ -586,6 +631,40 @@ def get_score_set_variants_as_csv( for variant, mapping in variants_and_mappings: variants.append(variant) mappings.append(mapping) + elif "gnomad" in namespaces: + variants_and_gnomad_query = ( + select(Variant, GnomADVariant) + .join( + MappedVariant, + and_(Variant.id == MappedVariant.variant_id, MappedVariant.current.is_(True)), + isouter=True, + ) + .join(MappedVariant.gnomad_variants.of_type(GnomADVariant), isouter=True) + .where( + and_( + Variant.score_set_id == score_set.id, + or_( + and_( + GnomADVariant.db_name == "gnomAD", + GnomADVariant.db_version == "v4.1", + ), + GnomADVariant.id.is_(None), + ), + ) + ) + .order_by(cast(func.split_part(Variant.urn, "#", 2), Integer)) + ) + if start: + variants_and_gnomad_query = variants_and_gnomad_query.offset(start) + if limit: + variants_and_gnomad_query = variants_and_gnomad_query.limit(limit) + variants_and_gnomad = db.execute(variants_and_gnomad_query).all() + + variants = [] + gnomad_data = [] + for variant, gnomad in variants_and_gnomad: + variants.append(variant) + gnomad_data.append(gnomad) else: variants_query = ( select(Variant) @@ -598,7 +677,11 @@ def get_score_set_variants_as_csv( variants_query = variants_query.limit(limit) variants = db.scalars(variants_query).all() rows_data = variants_to_csv_rows( - variants, columns=namespaced_score_set_columns, namespaced=namespaced, mappings=mappings + variants, + columns=namespaced_score_set_columns, + namespaced=namespaced, + mappings=mappings, + 
gnomad_data=gnomad_data, ) # type: ignore rows_columns = [ ( @@ -654,6 +737,7 @@ def variant_to_csv_row( variant: Variant, columns: dict[str, list[str]], mapping: Optional[MappedVariant] = None, + gnomad_data: Optional[GnomADVariant] = None, namespaced: Optional[bool] = None, na_rep="NA", ) -> dict[str, Any]: @@ -668,6 +752,10 @@ def variant_to_csv_row( Columns to serialize. namespaced: Optional[bool] = None Namespace the columns or not. + mapping : variant.models.MappedVariant, optional + Mapped variant corresponding to the variant. + gnomad_data : variant.models.GnomADVariant, optional + gnomAD variant data corresponding to the variant. na_rep : str String to represent null values. @@ -693,24 +781,29 @@ def variant_to_csv_row( row[column_key] = value for column_key in columns.get("mavedb", []): if column_key == "post_mapped_hgvs_g": - hgvs_str = get_hgvs_from_post_mapped(mapping.post_mapped) if mapping and mapping.post_mapped else None - if hgvs_str is not None and is_hgvs_g(hgvs_str): - value = hgvs_str - else: - value = "" + value = str(mapping.hgvs_g) if mapping and mapping.hgvs_g else na_rep elif column_key == "post_mapped_hgvs_p": - hgvs_str = get_hgvs_from_post_mapped(mapping.post_mapped) if mapping and mapping.post_mapped else None - if hgvs_str is not None and is_hgvs_p(hgvs_str): - value = hgvs_str - else: - value = "" + value = str(mapping.hgvs_p) if mapping and mapping.hgvs_p else na_rep + elif column_key == "post_mapped_hgvs_c": + value = str(mapping.hgvs_c) if mapping and mapping.hgvs_c else na_rep + elif column_key == "post_mapped_hgvs_at_assay_level": + value = str(mapping.hgvs_assay_level) if mapping and mapping.hgvs_assay_level else na_rep elif column_key == "post_mapped_vrs_digest": digest = get_digest_from_post_mapped(mapping.post_mapped) if mapping and mapping.post_mapped else None - value = digest if digest is not None else "" + value = digest if digest is not None else na_rep if is_null(value): value = na_rep key = f"mavedb.{column_key}" if 
namespaced else column_key row[key] = value + for column_key in columns.get("vep", []): + if column_key == "vep_functional_consequence": + vep_functional_consequence = mapping.vep_functional_consequence if mapping else None + if vep_functional_consequence is not None: + value = vep_functional_consequence + else: + value = na_rep + key = f"vep.{column_key}" if namespaced else column_key + row[key] = value for column_key in columns.get("scores", []): parent = variant.data.get("score_data") if variant.data else None value = str(parent.get(column_key)) if parent else na_rep @@ -721,6 +814,15 @@ def variant_to_csv_row( value = str(parent.get(column_key)) if parent else na_rep key = f"counts.{column_key}" if namespaced else column_key row[key] = value + for column_key in columns.get("gnomad", []): + if column_key == "gnomad_af": + gnomad_af = gnomad_data.allele_frequency if gnomad_data else None + if gnomad_af is not None: + value = str(gnomad_af) + else: + value = na_rep + key = f"gnomad.{column_key}" if namespaced else column_key + row[key] = value return row @@ -728,6 +830,7 @@ def variants_to_csv_rows( variants: Sequence[Variant], columns: dict[str, list[str]], mappings: Optional[Sequence[Optional[MappedVariant]]] = None, + gnomad_data: Optional[Sequence[Optional[GnomADVariant]]] = None, namespaced: Optional[bool] = None, na_rep="NA", ) -> Iterable[dict[str, Any]]: @@ -742,6 +845,10 @@ def variants_to_csv_rows( Columns to serialize. namespaced: Optional[bool] = None Namespace the columns or not. + mappings : list[Optional[variant.models.MappedVariant]], optional + List of mapped variants corresponding to the variants. + gnomad_data : list[Optional[variant.models.GnomADVariant]], optional + List of gnomAD variant data corresponding to the variants. na_rep : str String to represent null values. 
@@ -749,11 +856,25 @@ def variants_to_csv_rows( ------- list[dict[str, Any]] """ - if mappings is not None: + if mappings is not None and gnomad_data is not None: + return map( + lambda zipped: variant_to_csv_row( + zipped[0], columns, mapping=zipped[1], gnomad_data=zipped[2], namespaced=namespaced, na_rep=na_rep + ), + zip(variants, mappings, gnomad_data), + ) + elif mappings is not None: return map( lambda pair: variant_to_csv_row(pair[0], columns, mapping=pair[1], namespaced=namespaced, na_rep=na_rep), zip(variants, mappings), ) + elif gnomad_data is not None: + return map( + lambda pair: variant_to_csv_row( + pair[0], columns, gnomad_data=pair[1], namespaced=namespaced, na_rep=na_rep + ), + zip(variants, gnomad_data), + ) return map(lambda v: variant_to_csv_row(v, columns, namespaced=namespaced, na_rep=na_rep), variants) diff --git a/src/mavedb/models/mapped_variant.py b/src/mavedb/models/mapped_variant.py index 0372f53c..e5b307cc 100644 --- a/src/mavedb/models/mapped_variant.py +++ b/src/mavedb/models/mapped_variant.py @@ -34,6 +34,15 @@ class MappedVariant(Base): clingen_allele_id = Column(String, index=True, nullable=True) + vep_functional_consequence = Column(String, nullable=True) + vep_access_date = Column(Date, nullable=True) + + # mapped hgvs + hgvs_assay_level = Column(String, nullable=True) + hgvs_g = Column(String, nullable=True) + hgvs_c = Column(String, nullable=True) + hgvs_p = Column(String, nullable=True) + clinical_controls: Mapped[list["ClinicalControl"]] = relationship( "ClinicalControl", secondary=mapped_variants_clinical_controls_association_table, diff --git a/src/mavedb/routers/score_sets.py b/src/mavedb/routers/score_sets.py index ea174ade..589543d8 100644 --- a/src/mavedb/routers/score_sets.py +++ b/src/mavedb/routers/score_sets.py @@ -715,8 +715,8 @@ def get_score_set_variants_csv( urn: str, start: int = Query(default=None, description="Start index for pagination"), limit: int = Query(default=None, description="Maximum number of 
variants to return"), - namespaces: List[Literal["scores", "counts"]] = Query( - default=["scores"], description="One or more data types to include: scores, counts, clinVar, gnomAD" + namespaces: List[Literal["scores", "counts", "vep", "gnomad"]] = Query( + default=["scores"], description="One or more data types to include: scores, counts, clinVar, gnomAD, VEP" ), drop_na_columns: Optional[bool] = None, include_custom_columns: Optional[bool] = None, @@ -741,9 +741,9 @@ def get_score_set_variants_csv( The index to start from. If None, starts from the beginning. limit : Optional[int] The maximum number of variants to return. If None, returns all variants. - namespaces: List[Literal["scores", "counts"]] + namespaces: List[Literal["scores", "counts", "vep", "gnomad"]] The namespaces of all columns except for accession, hgvs_nt, hgvs_pro, and hgvs_splice. - We may add ClinVar and gnomAD in the future. + We may add ClinVar in the future. drop_na_columns : bool, optional Whether to drop columns that contain only NA values. Defaults to False. 
db : Session diff --git a/src/mavedb/scripts/clingen_ldh_submission.py b/src/mavedb/scripts/clingen_ldh_submission.py index e62bf229..94f16520 100644 --- a/src/mavedb/scripts/clingen_ldh_submission.py +++ b/src/mavedb/scripts/clingen_ldh_submission.py @@ -13,7 +13,7 @@ from mavedb.lib.clingen.services import ClinGenLdhService from mavedb.lib.clingen.constants import DEFAULT_LDH_SUBMISSION_BATCH_SIZE, LDH_SUBMISSION_ENDPOINT from mavedb.lib.clingen.content_constructors import construct_ldh_submission -from mavedb.lib.score_sets import get_hgvs_from_post_mapped +from mavedb.lib.variants import get_hgvs_from_post_mapped logger = logging.getLogger(__name__) diff --git a/src/mavedb/scripts/populate_mapped_hgvs.py b/src/mavedb/scripts/populate_mapped_hgvs.py new file mode 100644 index 00000000..ed60594c --- /dev/null +++ b/src/mavedb/scripts/populate_mapped_hgvs.py @@ -0,0 +1,188 @@ +import logging +import requests +from typing import Sequence, Optional + +import click +from sqlalchemy import select +from sqlalchemy.orm import Session + +from mavedb.lib.clingen.allele_registry import CLINGEN_API_URL +from mavedb.lib.logging.context import format_raised_exception_info_as_dict +from mavedb.lib.variants import get_hgvs_from_post_mapped + +from mavedb.models.mapped_variant import MappedVariant +from mavedb.models.score_set import ScoreSet +from mavedb.models.variant import Variant + +from mavedb.scripts.environment import script_environment, with_database_session + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +def get_target_info(score_set: ScoreSet) -> tuple[bool, Optional[str]]: + target_is_coding: bool + transcript_accession: Optional[str] = None + if len(score_set.target_genes) == 1: + target = score_set.target_genes[0] + if target.category == "protein_coding": + target_is_coding = True + # only get transcript accession if coding + # accession-based + if target.target_accession and target.target_accession.accession: + # only use accession 
info if a transcript was specified + if target.target_accession.accession.startswith(("NM", "ENST")): + transcript_accession = target.target_accession.accession + # sequence-based + if target.post_mapped_metadata: + # assert that post_mapped_metadata is a dict for mypy + assert isinstance(target.post_mapped_metadata, dict) + if target.post_mapped_metadata.get("cdna", {}).get("sequence_accessions"): + if len(target.post_mapped_metadata["cdna"]["sequence_accessions"]) == 1: + transcript_accession = target.post_mapped_metadata["cdna"]["sequence_accessions"][0] + else: + raise ValueError( + f"Multiple cDNA accessions found in post-mapped metadata for target {target.name} in score set {score_set.urn}. Cannot determine which to use." + ) + # if sequence-based and no cDNA accession, warn that no transcript was specified + else: + # for coding score sets, the mapper should have returned a cdna post mapped metadata entry. Use mane transcript from clingen for now, but warn that we are assuming transcript. + logger.warning( + f"No cDNA accession found in post-mapped metadata for target {target.name} in score set {score_set.urn}. This is expected if variants were only provided at the protein level. If variants are at the nucleotide level, will assume MANE transcript from ClinGen for coding variant." + ) + else: + # for coding score sets, the mapper should have returned a cdna post mapped metadata entry. Use mane transcript from clingen for now, but warn that we are assuming transcript. + logger.warning( + f"No post-mapped metadata for target {target.name} in score set {score_set.urn}. Will assume MANE transcript from ClinGen for coding variant." + ) + else: + target_is_coding = False + # multi-target score sets are more complex because there is no direct link between variants and targets in the db. 
support later +    else: +        raise NotImplementedError("Populating mapped hgvs for multi-target score sets is not yet supported.") + +    return target_is_coding, transcript_accession + + +@script_environment.command() +@with_database_session +@click.argument("urns", nargs=-1) +@click.option("--all", help="Populate mapped hgvs for every score set in MaveDB.", is_flag=True) +def populate_mapped_hgvs(db: Session, urns: Sequence[Optional[str]], all: bool): +    score_set_ids: Sequence[Optional[int]] +    if all: +        score_set_ids = db.scalars(select(ScoreSet.id)).all() +        logger.info(f"Command invoked with --all. Routine will populate mapped hgvs for {len(score_set_ids)} score sets.") +    else: +        score_set_ids = db.scalars(select(ScoreSet.id).where(ScoreSet.urn.in_(urns))).all() +        logger.info(f"Populating mapped hgvs for the provided score sets ({len(score_set_ids)}).") + +    for idx, ss_id in enumerate(score_set_ids): +        if not ss_id: +            continue + +        score_set = db.scalar(select(ScoreSet).where(ScoreSet.id == ss_id)) +        if not score_set: +            logger.warning(f"Could not fetch score set with id={ss_id}.") +            continue + +        try: +            target_is_coding, transcript_accession = get_target_info(score_set) + +            variant_info = db.execute( +                select(Variant.urn, MappedVariant) +                .join(Variant) +                .join(ScoreSet) +                .where(ScoreSet.id == ss_id) +                .where(MappedVariant.current == True)  # noqa: E712 +            ) + +            variant_info_list = variant_info.all() +            num_variants = len(variant_info_list) + +            for v_idx, (variant_urn, mapped_variant) in enumerate(variant_info_list): +                if num_variants and (v_idx + 1) % ((num_variants + 9) // 10) == 0: +                    logger.info( +                        f"Processing variant {v_idx+1}/{num_variants} ({variant_urn}) for score set {score_set.urn} ({idx+1}/{len(score_set_ids)})."
+ ) + # TODO#469: support multi-target score sets + # returns None if no post-mapped object or if multi-variant + hgvs_assay_level = get_hgvs_from_post_mapped(mapped_variant.post_mapped) + + hgvs_g: Optional[str] = None + hgvs_c: Optional[str] = None + hgvs_p: Optional[str] = None + + # NOTE: if no clingen allele id, could consider searching clingen using hgvs_assay_level. for now, skipping variant if no clingen allele id in db + # TODO#469: implement support for multi-variants + if mapped_variant.clingen_allele_id and len(mapped_variant.clingen_allele_id.split(",")) == 1: + response = requests.get(f"{CLINGEN_API_URL}/{mapped_variant.clingen_allele_id}") + if response.status_code != 200: + logger.error( + f"Failed for variant {variant_urn} to query ClinGen API for {mapped_variant.clingen_allele_id}: {response.status_code}" + ) + continue + data = response.json() + if mapped_variant.clingen_allele_id.startswith("CA"): + if data.get("genomicAlleles"): + for allele in data["genomicAlleles"]: + if allele.get("referenceGenome") == "GRCh38" and allele.get("hgvs"): + hgvs_g = allele["hgvs"][0] + break + if target_is_coding: + if data.get("transcriptAlleles"): + if transcript_accession: + for allele in data["transcriptAlleles"]: + if allele.get("hgvs"): + for hgvs_string in allele["hgvs"]: + hgvs_reference_sequence = hgvs_string.split(":")[0] + if transcript_accession == hgvs_reference_sequence: + hgvs_c = hgvs_string + break + if hgvs_c: + if allele.get("proteinEffect"): + hgvs_p = allele["proteinEffect"].get("hgvs") + break + else: + # no transcript specified, use mane if available + for allele in data["transcriptAlleles"]: + if allele.get("MANE"): + # TODO#571 consider prioritizing certain MANE transcripts (e.g. 
MANE Select) + hgvs_c = allele["MANE"].get("nucleotide", {}).get("RefSeq", {}).get("hgvs") + hgvs_p = allele["MANE"].get("protein", {}).get("RefSeq", {}).get("hgvs") + break + + elif mapped_variant.clingen_allele_id.startswith("PA"): + # if PA, assume that assay was performed at amino acid level, so only provide hgvs_p + if data.get("aminoAcidAlleles"): + for allele in data["aminoAcidAlleles"]: + if allele.get("hgvs"): + hgvs_p = allele["hgvs"][0] + break + + mapped_variant.hgvs_assay_level = hgvs_assay_level + mapped_variant.hgvs_g = hgvs_g + mapped_variant.hgvs_c = hgvs_c + mapped_variant.hgvs_p = hgvs_p + db.add(mapped_variant) + db.commit() + + except Exception as e: + logging_context = { + "processed_score_sets": urns[:idx], + "unprocessed_score_sets": urns[idx:], + } + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + f"Score set {score_set.urn} could not be processed to extract hgvs strings.", extra=logging_context + ) + logger.info(f"Rolling back all changes for scoreset {score_set.urn}") + db.rollback() + + logger.info(f"Done with score set {score_set.urn}. 
({idx+1}/{len(urns)}).") + + logger.info("Done populating mapped hgvs.") + + +if __name__ == "__main__": + populate_mapped_hgvs() diff --git a/src/mavedb/scripts/vep_functional_consequence.py b/src/mavedb/scripts/vep_functional_consequence.py new file mode 100644 index 00000000..8f188fa1 --- /dev/null +++ b/src/mavedb/scripts/vep_functional_consequence.py @@ -0,0 +1,268 @@ +import logging +import requests +from datetime import date +from typing import Sequence, Optional + +import click +from sqlalchemy import select +from sqlalchemy.orm import Session + +from mavedb.models.score_set import ScoreSet +from mavedb.models.mapped_variant import MappedVariant +from mavedb.models.variant import Variant + +from mavedb.scripts.environment import script_environment, with_database_session + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + +ENSEMBL_API_URL = "https://rest.ensembl.org" + +# List of all possible VEP consequences, in order from most to least severe +VEP_CONSEQUENCES = [ + "transcript_ablation", + "splice_acceptor_variant", + "splice_donor_variant", + "stop_gained", + "frameshift_variant", + "stop_lost", + "start_lost", + "transcript_amplification", + "feature_elongation", + "feature_truncation", + "inframe_insertion", + "inframe_deletion", + "missense_variant", + "protein_altering_variant", + "splice_donor_5th_base_variant", + "splice_region_variant", + "splice_donor_region_variant", + "splice_polypyrimidine_tract_variant", + "incomplete_terminal_codon_variant", + "start_retained_variant", + "stop_retained_variant", + "synonymous_variant", + "coding_sequence_variant", + "mature_miRNA_variant", + "5_prime_UTR_variant", + "3_prime_UTR_variant", + "non_coding_transcript_exon_variant", + "intron_variant", + "NMD_transcript_variant", + "non_coding_transcript_variant", + "coding_transcript_variant", + "upstream_gene_variant", + "downstream_gene_variant", + "TFBS_ablation", + "TFBS_amplification", + "TF_binding_site_variant", + 
"regulatory_region_ablation", + "regulatory_region_amplification", + "regulatory_region_variant", + "intergenic_variant", + "sequence_variant", +] + + +def run_variant_recoder(missing_hgvs: Sequence[str]) -> dict[str, list[str]]: + """ + Takes a list of input HGVS strings, calls the Variant Recoder API, and returns a mapping from input HGVS strings + to a list of genomic HGVS strings. + """ + headers = {"Content-Type": "application/json", "Accept": "application/json"} + recoder_response = requests.post( + f"{ENSEMBL_API_URL}/variant_recoder/human", + headers=headers, + json={"ids": list(missing_hgvs)}, + ) + input_hgvs_to_recoded: dict[str, list[str]] = {} + if recoder_response.status_code == 200: + recoder_data = recoder_response.json() + for entry in recoder_data: + for variant, variant_data in entry.items(): + input_hgvs = variant_data.get("input") + if not input_hgvs: + continue + genomic_hgvs_list = [] + genomic_strings = variant_data.get("hgvsg") + if genomic_strings: + for genomic_hgvs in genomic_strings: + if genomic_hgvs.startswith("NC_"): + genomic_hgvs_list.append(genomic_hgvs) + if genomic_hgvs_list: + if input_hgvs in input_hgvs_to_recoded: + input_hgvs_to_recoded[input_hgvs].extend(genomic_hgvs_list) + else: + input_hgvs_to_recoded[input_hgvs] = genomic_hgvs_list + else: + logger.error( + f"Failed batch Variant Recoder API request: {recoder_response.status_code} {recoder_response.text}" + ) + return input_hgvs_to_recoded + + +def get_functional_consequence(hgvs_strings: Sequence[str]) -> dict[str, Optional[str]]: + headers = {"Content-Type": "application/json", "Accept": "application/json"} + result: dict[str, Optional[str]] = {} + + # Batch POST to VEP + response = requests.post( + f"{ENSEMBL_API_URL}/vep/human/hgvs", + headers=headers, + json={"hgvs_notations": hgvs_strings}, + ) + + missing_hgvs = set(hgvs_strings) + if response.status_code == 200: + data = response.json() + # Map HGVS to consequence + for entry in data: + hgvs = entry.get("input") 
+ most_severe_consequence = entry.get("most_severe_consequence") + if hgvs: + result[hgvs] = most_severe_consequence + missing_hgvs.discard(hgvs) + else: + logger.error(f"Failed batch VEP API request: {response.status_code} {response.text}") + + # Fallback for missing HGVS strings: batch POST to Variant Recoder + if missing_hgvs: + recoded_variants = run_variant_recoder(list(missing_hgvs)) + # Assign None for any missing_hgvs not present in recoder response + for hgvs_string in missing_hgvs: + if hgvs_string not in recoded_variants: + result[hgvs_string] = None + + # Collect all genomic HGVS strings for batch VEP request + all_recoded_hgvs = [] + for input_variant, recoded in recoded_variants.items(): + for variant in recoded: + all_recoded_hgvs.append(variant) + + # Run VEP in batches of 200 + vep_results: dict[str, str] = {} + for i in range(0, len(all_recoded_hgvs), 200): + batch = all_recoded_hgvs[i : i + 200] + vep_response = requests.post( + f"{ENSEMBL_API_URL}/vep/human/hgvs", + headers=headers, + json={"hgvs_notations": batch}, + ) + + if vep_response.status_code != 200: + logger.error(f"Failed batch VEP for genomic HGVS: {vep_response.status_code}") + continue + vep_data = vep_response.json() + for entry in vep_data: + recoded_input = entry.get("input") + most_severe_consequence = entry.get("most_severe_consequence") + if recoded_input and most_severe_consequence: + vep_results[recoded_input] = most_severe_consequence + + # For each original missing_hgvs, choose the most severe consequence among its genomic equivalents + for input_variant, recoded in recoded_variants.items(): + consequences = [] + for variant in recoded: + consequences.append(vep_results.get(variant)) + if consequences: + for consequence in VEP_CONSEQUENCES: + if consequence in consequences: + result[input_variant] = consequence + break + else: + result[input_variant] = None + else: + result[input_variant] = None + + return result + + +@script_environment.command() +@with_database_session 
+@click.argument("urns", nargs=-1) +@click.option("--all", help="Populate functional consequence predictions for every score set in MaveDB.", is_flag=True) +def populate_functional_consequences(db: Session, urns: Sequence[Optional[str]], all: bool): +    score_set_ids: Sequence[Optional[int]] +    if all: +        score_set_ids = db.scalars(select(ScoreSet.id)).all() +        logger.info( +            f"Command invoked with --all. Routine will populate functional consequence predictions for {len(score_set_ids)} score sets." +        ) +    else: +        score_set_ids = db.scalars(select(ScoreSet.id).where(ScoreSet.urn.in_(urns))).all() +        logger.info( +            f"Populating functional consequence predictions for the provided score sets ({len(score_set_ids)})." +        ) + +    for ss_id in score_set_ids: +        if not ss_id: +            continue + +        score_set = db.scalar(select(ScoreSet).where(ScoreSet.id == ss_id)) +        if not score_set: +            logger.warning(f"Could not fetch score set with id={ss_id}.") +            continue + +        try: +            mapped_variants = db.scalars( +                select(MappedVariant) +                .join(Variant) +                .where( +                    Variant.score_set_id == ss_id, +                    MappedVariant.current.is_(True), +                    MappedVariant.post_mapped.isnot(None), +                ) +            ).all() + +            if not mapped_variants: +                logger.info(f"No mapped variant post-mapped objects found for score set {score_set.urn}.") +                continue + +            queue = [] +            variant_map = {} +            for mapped_variant in mapped_variants: +                hgvs_string = (mapped_variant.post_mapped.get("expressions") or [{}])[0].get("value")  # type: ignore +                if not hgvs_string: +                    logger.warning(f"No HGVS string found in post_mapped for variant {mapped_variant.id}.") +                    continue +                queue.append(hgvs_string) +                variant_map[hgvs_string] = mapped_variant + +                if len(queue) == 200: +                    consequences = get_functional_consequence(queue) +                    for hgvs, consequence in consequences.items(): +                        mapped_variant = variant_map[hgvs] +                        if consequence: +                            mapped_variant.vep_functional_consequence = consequence +                            mapped_variant.vep_access_date = date.today() +                            db.add(mapped_variant) +                        else: + 
logger.warning(f"Could not retrieve functional consequence for HGVS {hgvs}.") + db.commit() + queue.clear() + variant_map.clear() + + # Process any remaining variants in the queue + if queue: + consequences = get_functional_consequence(queue) + for hgvs, consequence in consequences.items(): + mapped_variant = variant_map[hgvs] + if consequence: + mapped_variant.vep_functional_consequence = consequence + mapped_variant.vep_access_date = date.today() + db.add(mapped_variant) + else: + logger.warning(f"Could not retrieve functional consequence for HGVS {hgvs}.") + db.commit() + + except Exception as e: + logger.error( + f"Failed to populate functional consequence predictions for score set {score_set.urn}: {str(e)}" + ) + db.rollback() + + logger.info("Done populating functional consequence predictions.") + + +if __name__ == "__main__": + populate_functional_consequences() diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 437caa64..0385837c 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -46,7 +46,6 @@ columns_for_dataset, create_variants, create_variants_data, - get_hgvs_from_post_mapped, ) from mavedb.lib.slack import log_and_send_slack_message, send_slack_error, send_slack_message from mavedb.lib.uniprot.constants import UNIPROT_ID_MAPPING_ENABLED @@ -56,6 +55,7 @@ validate_and_standardize_dataframe_pair, ) from mavedb.lib.validation.exceptions import ValidationError +from mavedb.lib.variants import get_hgvs_from_post_mapped from mavedb.models.enums.mapping_state import MappingState from mavedb.models.enums.processing_state import ProcessingState from mavedb.models.mapped_variant import MappedVariant @@ -1296,9 +1296,9 @@ async def link_clingen_variants(ctx: dict, correlation_id: str, score_set_id: in logging_context["linkage_failures"] = num_linkage_failures logging_context["linkage_successes"] = num_variant_urns - num_linkage_failures - assert len(linked_allele_ids) == num_variant_urns, ( - f"{num_variant_urns - 
len(linked_allele_ids)} appear to not have been attempted to be linked." - ) + assert ( + len(linked_allele_ids) == num_variant_urns + ), f"{num_variant_urns - len(linked_allele_ids)} appear to not have been attempted to be linked." job_succeeded = False if not linkage_failures: diff --git a/tests/helpers/constants.py b/tests/helpers/constants.py index 517c26cb..1a219f17 100644 --- a/tests/helpers/constants.py +++ b/tests/helpers/constants.py @@ -1320,62 +1320,34 @@ TEST_MINIMAL_MAPPED_VARIANT_CREATE = {**TEST_MINIMAL_MAPPED_VARIANT, "clinical_controls": [], "gnomad_variants": []} -TEST_POST_MAPPED_VRS_WITH_HGVS_G_EXPRESSION = { - "id": "ga4gh:VA.fRW7u-kBQnAKitu1PoDMLvlECWZTHCos", - "type": "Allele", - "state": {"type": "LiteralSequenceExpression", "sequence": "G"}, - "digest": "fRW7u-kBQnAKitu1PoDMLvlECWZTHCos", - "location": { - "id": "ga4gh:SL.99b3WBaSSmaSTs6YmJfIhl1ZDCV07VZY", - "end": 23536836, - "type": "SequenceLocation", - "start": 23536835, - "digest": "99b3WBaSSmaSTs6YmJfIhl1ZDCV07VZY", - "sequenceReference": { - "type": "SequenceReference", - "label": "NC_000018.10", - "refgetAccession": "SQ.vWwFhJ5lQDMhh-czg06YtlWqu0lvFAZV", - }, - }, - "extensions": [{"name": "vrs_ref_allele_seq", "type": "Extension", "value": "C"}], - "expressions": [{"value": "NC_000018.10:g.23536836C>G", "syntax": "hgvs.g"}], -} - -TEST_POST_MAPPED_VRS_WITH_HGVS_P_EXPRESSION = { - "id": "ga4gh:VA.zkOAzZK5qG0D0mkJUfXlK1aS075OGSjh", - "type": "Allele", - "state": {"type": "LiteralSequenceExpression", "sequence": "R"}, - "digest": "zkOAzZK5qG0D0mkJUfXlK1aS075OGSjh", - "location": { - "id": "ga4gh:SL.uUyRpJbrPttRThL7A2zeWAnTcb_7f1R2", - "end": 116, - "type": "SequenceLocation", - "start": 115, - "digest": "uUyRpJbrPttRThL7A2zeWAnTcb_7f1R2", - "sequenceReference": {"type": "SequenceReference", "refgetAccession": "SQ.StlJo3M4b8cS253ufe9nPpWqQHBDOSPs"}, - }, - "extensions": [{"name": "vrs_ref_allele_seq", "type": "Extension", "value": "Q"}], - "expressions": [{"value": 
"NP_002746.1:p.Gln116Arg", "syntax": "hgvs.p"}], -} - TEST_MAPPED_VARIANT_WITH_HGVS_G_EXPRESSION = { "pre_mapped": {}, - "post_mapped": TEST_POST_MAPPED_VRS_WITH_HGVS_G_EXPRESSION, + "post_mapped": {}, + "vep_functional_consequence": "missense_variant", "modification_date": datetime.isoformat(datetime.now()), "mapped_date": datetime.isoformat(datetime.now()), "current": True, "vrs_version": "2.0", "mapping_api_version": "pytest.0.0", + "hgvs_g": "NC_000018.10:g.23536836C>G", + "hgvs_p": "NP_000262.2:p.Gly1028Arg", + "hgvs_c": "NM_000271.5:c.3082G>C", + "hgvs_assay_level": "NC_000018.10:g.23536836C>G", } TEST_MAPPED_VARIANT_WITH_HGVS_P_EXPRESSION = { "pre_mapped": {}, - "post_mapped": TEST_POST_MAPPED_VRS_WITH_HGVS_P_EXPRESSION, + "post_mapped": {}, + "vep_functional_consequence": "missense_variant", "modification_date": datetime.isoformat(datetime.now()), "mapped_date": datetime.isoformat(datetime.now()), "current": True, "vrs_version": "2.0", "mapping_api_version": "pytest.0.0", + "hgvs_g": None, + "hgvs_p": "NP_002746.1:p.Gln116Arg", + "hgvs_c": None, + "hgvs_assay_level": "NP_002746.1:p.Gln116Arg", } TEST_BASELINE_SCORE = 1.0 diff --git a/tests/routers/test_score_set.py b/tests/routers/test_score_set.py index c0d7748b..86234392 100644 --- a/tests/routers/test_score_set.py +++ b/tests/routers/test_score_set.py @@ -2710,6 +2710,8 @@ def test_download_variants_data_file( "hgvs_pro", "mavedb.post_mapped_hgvs_g", "mavedb.post_mapped_hgvs_p", + "mavedb.post_mapped_hgvs_c", + "mavedb.post_mapped_hgvs_at_assay_level", "mavedb.post_mapped_vrs_digest", "scores.score", ] @@ -2717,13 +2719,20 @@ def test_download_variants_data_file( rows = list(reader) for row in rows: if has_hgvs_g: - assert row["mavedb.post_mapped_hgvs_g"] == mapped_variant["post_mapped"]["expressions"][0]["value"] - else: + assert row["mavedb.post_mapped_hgvs_g"] == mapped_variant["hgvs_g"] + assert row["mavedb.post_mapped_hgvs_c"] == mapped_variant["hgvs_c"] + assert row["mavedb.post_mapped_hgvs_p"] == 
mapped_variant["hgvs_p"] + assert row["mavedb.post_mapped_hgvs_at_assay_level"] == mapped_variant["hgvs_assay_level"] + elif has_hgvs_p: assert row["mavedb.post_mapped_hgvs_g"] == "NA" - if has_hgvs_p: - assert row["mavedb.post_mapped_hgvs_p"] == mapped_variant["post_mapped"]["expressions"][0]["value"] + assert row["mavedb.post_mapped_hgvs_c"] == "NA" + assert row["mavedb.post_mapped_hgvs_p"] == mapped_variant["hgvs_p"] + assert row["mavedb.post_mapped_hgvs_at_assay_level"] == mapped_variant["hgvs_assay_level"] else: + assert row["mavedb.post_mapped_hgvs_g"] == "NA" + assert row["mavedb.post_mapped_hgvs_c"] == "NA" assert row["mavedb.post_mapped_hgvs_p"] == "NA" + assert row["mavedb.post_mapped_hgvs_at_assay_level"] == "NA" # Test file doesn't have hgvs_splice so its values are all NA. @@ -2874,8 +2883,10 @@ def test_download_scores_counts_and_post_mapped_variants_file( "accession", "hgvs_nt", "hgvs_pro", + "mavedb.post_mapped_hgvs_c", "mavedb.post_mapped_hgvs_g", "mavedb.post_mapped_hgvs_p", + "mavedb.post_mapped_hgvs_at_assay_level", "mavedb.post_mapped_vrs_digest", "scores.score", "scores.s_0",