diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py index f1234de5..5b5fb740 100644 --- a/src/mavedb/lib/score_sets.py +++ b/src/mavedb/lib/score_sets.py @@ -1,17 +1,16 @@ -from collections import Counter import csv import io import logging -from operator import attrgetter import re -from typing import Any, BinaryIO, Iterable, List, Optional, TYPE_CHECKING, Sequence, Literal +from collections import Counter +from operator import attrgetter +from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Literal, Optional, Sequence -from mavedb.models.mapped_variant import MappedVariant import numpy as np import pandas as pd from pandas.testing import assert_index_equal from sqlalchemy import Integer, and_, cast, func, or_, select -from sqlalchemy.orm import Session, aliased, contains_eager, joinedload, Query, selectinload +from sqlalchemy.orm import Query, Session, aliased, contains_eager, joinedload, selectinload from mavedb.lib.exceptions import ValidationError from mavedb.lib.logging.context import logging_context, save_to_logging_context @@ -26,7 +25,7 @@ from mavedb.lib.mave.utils import is_csv_null from mavedb.lib.validation.constants.general import null_values_list from mavedb.lib.validation.utilities import is_null as validate_is_null -from mavedb.lib.variants import get_hgvs_from_post_mapped, is_hgvs_g, is_hgvs_p +from mavedb.lib.variants import get_digest_from_post_mapped, get_hgvs_from_post_mapped, is_hgvs_g, is_hgvs_p from mavedb.models.contributor import Contributor from mavedb.models.controlled_keyword import ControlledKeyword from mavedb.models.doi_identifier import DoiIdentifier @@ -36,6 +35,7 @@ from mavedb.models.experiment_controlled_keyword import ExperimentControlledKeywordAssociation from mavedb.models.experiment_publication_identifier import ExperimentPublicationIdentifierAssociation from mavedb.models.experiment_set import ExperimentSet +from mavedb.models.mapped_variant import MappedVariant from mavedb.models.publication_identifier import PublicationIdentifier from mavedb.models.refseq_identifier import RefseqIdentifier from mavedb.models.refseq_offset import RefseqOffset @@ -547,6 +547,7 @@ def get_score_set_variants_as_csv( if include_post_mapped_hgvs: namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_g") namespaced_score_set_columns["mavedb"].append("post_mapped_hgvs_p") + namespaced_score_set_columns["mavedb"].append("post_mapped_vrs_digest") for namespace in namespaces: namespaced_score_set_columns[namespace] = [] if include_custom_columns: @@ -596,7 +597,9 @@ def get_score_set_variants_as_csv( if limit: variants_query = variants_query.limit(limit) variants = db.scalars(variants_query).all() - rows_data = variants_to_csv_rows(variants, columns=namespaced_score_set_columns, namespaced=namespaced, mappings=mappings) # type: ignore + rows_data = variants_to_csv_rows( + variants, columns=namespaced_score_set_columns, namespaced=namespaced, mappings=mappings + ) # type: ignore rows_columns = [ ( f"{namespace}.{col}" @@ -701,6 +704,9 @@ def variant_to_csv_row( value = hgvs_str else: value = "" + elif column_key == "post_mapped_vrs_digest": + digest = get_digest_from_post_mapped(mapping.post_mapped) if mapping and mapping.post_mapped else None + value = digest if digest is not None else "" if is_null(value): value = na_rep key = f"mavedb.{column_key}" if namespaced else column_key diff --git a/src/mavedb/lib/variants.py b/src/mavedb/lib/variants.py index e052df41..54258482 100644 --- a/src/mavedb/lib/variants.py +++ b/src/mavedb/lib/variants.py @@ -1,7 +1,6 @@ import re from typing import Any, Optional - HGVS_G_REGEX = re.compile(r"(^|:)g\.") HGVS_P_REGEX = re.compile(r"(^|:)p\.") @@ -48,6 +47,23 @@ def get_hgvs_from_post_mapped(post_mapped_vrs: Optional[Any]) -> Optional[str]: return variations_hgvs[0] +def get_digest_from_post_mapped(post_mapped_vrs: Optional[Any]) -> Optional[str]: + """ + Extract the digest value from a post-mapped VRS object. + + Args: + post_mapped_vrs: A post-mapped VRS (Variation Representation Specification) object + that may contain a digest field. Can be None. + + Returns: + The digest string if present in the post_mapped_vrs object, otherwise None. + """ + if not post_mapped_vrs: + return None + + return post_mapped_vrs.get("digest") # type: ignore + + # TODO (https://github.com/VariantEffect/mavedb-api/issues/440) Temporarily, we are using these functions to distinguish # genomic and protein HGVS strings produced by the mapper. Using hgvs.parser.Parser is too slow, and we won't need to do # this once the mapper extracts separate g., c., and p. post-mapped HGVS strings. diff --git a/tests/lib/test_variants.py b/tests/lib/test_variants.py index 9c9f4ec5..ca9c2b0b 100644 --- a/tests/lib/test_variants.py +++ b/tests/lib/test_variants.py @@ -1,15 +1,22 @@ import pytest -from mavedb.lib.variants import hgvs_from_vrs_allele, get_hgvs_from_post_mapped, is_hgvs_g, is_hgvs_p - +from mavedb.lib.variants import ( + get_digest_from_post_mapped, + get_hgvs_from_post_mapped, + hgvs_from_vrs_allele, + is_hgvs_g, + is_hgvs_p, +) from tests.helpers.constants import ( TEST_HGVS_IDENTIFIER, TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS1_X, TEST_VALID_POST_MAPPED_VRS_ALLELE_VRS2_X, - TEST_VALID_POST_MAPPED_VRS_HAPLOTYPE, TEST_VALID_POST_MAPPED_VRS_CIS_PHASED_BLOCK, + TEST_VALID_POST_MAPPED_VRS_HAPLOTYPE, ) +### Tests for hgvs_from_vrs_allele function ### + def test_hgvs_from_vrs_allele_vrs_1(): with pytest.raises(ValueError): @@ -26,6 +33,9 @@ def test_hgvs_from_vrs_allele_invalid(): hgvs_from_vrs_allele({"invalid_key": "invalid_value"}) +### Tests for get_hgvs_from_post_mapped function ### + + def test_get_hgvs_from_post_mapped_haplotype(): with pytest.raises(ValueError): get_hgvs_from_post_mapped(TEST_VALID_POST_MAPPED_VRS_HAPLOTYPE) @@ -61,6 +71,36 @@ def test_get_hgvs_from_post_mapped_invalid_structure(): get_hgvs_from_post_mapped({"invalid_key": "InvalidType"}) +### Tests for get_digest_from_post_mapped function ### + + +def test_get_digest_from_post_mapped_with_digest(): + post_mapped_vrs = {"digest": "test_digest_value", "type": "Allele"} + result = get_digest_from_post_mapped(post_mapped_vrs) + assert result == "test_digest_value" + + +def test_get_digest_from_post_mapped_without_digest(): + post_mapped_vrs = {"type": "Allele", "other_field": "value"} + + result = get_digest_from_post_mapped(post_mapped_vrs) + + assert result is None + + +def test_get_digest_from_post_mapped_none_input(): + result = get_digest_from_post_mapped(None) + assert result is None + + +def test_get_digest_from_post_mapped_empty_dict(): + result = get_digest_from_post_mapped({}) + assert result is None + + +### Tests for is_hgvs_g and is_hgvs_p functions ### + + @pytest.mark.parametrize( "hgvs,expected", [ diff --git a/tests/routers/test_score_set.py b/tests/routers/test_score_set.py index d1f960e2..98222d0c 100644 --- a/tests/routers/test_score_set.py +++ b/tests/routers/test_score_set.py @@ -2738,6 +2738,7 @@ def test_download_variants_data_file( "hgvs_pro", "mavedb.post_mapped_hgvs_g", "mavedb.post_mapped_hgvs_p", + "mavedb.post_mapped_vrs_digest", "scores.score", ] ) @@ -2862,16 +2863,7 @@ def test_download_scores_and_counts_file(session, data_provider, client, setup_r download_scores_and_counts_csv = download_scores_and_counts_csv_response.text reader = csv.DictReader(StringIO(download_scores_and_counts_csv)) assert sorted(reader.fieldnames) == sorted( - [ - "accession", - "hgvs_nt", - "hgvs_pro", - "scores.score", - "scores.s_0", - "scores.s_1", - "counts.c_0", - "counts.c_1" - ] + ["accession", "hgvs_nt", "hgvs_pro", "scores.score", "scores.s_0", "scores.s_1", "counts.c_0", "counts.c_1"] ) @@ -2885,7 +2877,7 @@ def test_download_scores_and_counts_file(session, data_provider, client, setup_r ids=["without_post_mapped_vrs", "with_post_mapped_hgvs_g", "with_post_mapped_hgvs_p"], ) def test_download_scores_counts_and_post_mapped_variants_file( - session, data_provider, client, setup_router_db, data_files, mapped_variant, has_hgvs_g, has_hgvs_p + session, data_provider, client, setup_router_db, data_files, mapped_variant, has_hgvs_g, has_hgvs_p ): experiment = create_experiment(client) score_set = create_seq_score_set(client, experiment["urn"]) @@ -2912,11 +2904,12 @@ def test_download_scores_counts_and_post_mapped_variants_file( "hgvs_pro", "mavedb.post_mapped_hgvs_g", "mavedb.post_mapped_hgvs_p", + "mavedb.post_mapped_vrs_digest", "scores.score", "scores.s_0", "scores.s_1", "counts.c_0", - "counts.c_1" + "counts.c_1", ] )